Red Team Pipeline
Complete guide to running adversarial red-team testing.
Overview
In this tutorial, you will learn:
How the multi-agent red-team system works
How to configure the adversarial pipeline
How to run and monitor red-team sessions
How to analyze attack success patterns
Prerequisites
SpecAlign installed with API credentials configured
A generated seeds file (see Specification Generation); the example script reads it from output/seeds.json
Understanding of the red-team workflow
Complete Example
"""
Red Team Pipeline Example
This script demonstrates the complete adversarial testing workflow,
from loading seeds to generating DPO preference pairs.
"""
import json
from pathlib import Path
from specalign.core import (
RedTeamOrchestrator,
ContextPool,
create_planner,
create_dual_agent,
create_judges,
)
from specalign.config import load_config
from specalign.utils.logging import setup_logging, ProgressTracker
def setup_components(config: dict):
    """
    Build every component the red-team loop depends on.

    Args:
        config: Configuration dictionary. The context pool settings are read
            from ``config['redteam']['context_pool']``; the whole dictionary
            is forwarded to the planner, agent and judge factories.

    Returns:
        Tuple ``(context_pool, planner, agent, safety_judge, quality_judge)``.
    """
    print("Step 1: Initializing components...")

    # Pool of successful attacks; the planner below is given this same pool.
    pool_settings = config['redteam']['context_pool']
    context_pool = ContextPool(
        max_size=pool_settings['max_size'],
        similarity_threshold=pool_settings['similarity_threshold'],
    )
    print("✓ Context pool initialized")

    # Planner that generates attack strategies.
    planner = create_planner(config=config, context_pool=context_pool)
    print("✓ Planner initialized")

    # A single agent that can act as attacker or defender.
    agent = create_dual_agent(config)
    print("✓ Dual-role agent initialized")

    # create_judges yields the safety judge first, then the quality judge.
    judges = create_judges(config)
    safety_judge, quality_judge = judges
    print("✓ Judges initialized")

    return (context_pool, planner, agent, safety_judge, quality_judge)
def load_seeds(seeds_path: str) -> list:
    """
    Load seed prompts from a JSON file.

    Args:
        seeds_path: Path to the JSON seeds file.

    Returns:
        The parsed JSON content of the file (expected to be a list of seeds).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    print("\nStep 2: Loading seeds...")
    # Decode explicitly as UTF-8 so seeds containing non-ASCII text load the
    # same way on every platform instead of depending on the locale encoding.
    with open(seeds_path, 'r', encoding='utf-8') as f:
        seeds = json.load(f)
    print(f"✓ Loaded {len(seeds)} seeds")
    return seeds
def run_red_team_loop(
    orchestrator: RedTeamOrchestrator,
    seeds: list,
    max_rounds: int = 5
) -> list:
    """
    Execute the red-team adversarial loop.

    One episode is run per seed, with role swapping enabled, and a one-line
    result is printed for each seed.

    Args:
        orchestrator: The RedTeamOrchestrator instance
        seeds: List of seed prompts; each seed must provide an ``'id'`` key,
            which is used in the per-seed log line
        max_rounds: Maximum rounds per seed

    Returns:
        List of episode results, in the same order as ``seeds``. Each episode
        is read via its ``'success'`` key and, for successful attacks, its
        ``'violated_rules'`` key.
    """
    print("\nStep 3: Running red-team loop...")
    print("-" * 50)

    episodes = []
    tracker = ProgressTracker(total=len(seeds), desc="Processing seeds")
    # Close the tracker even when an episode raises, so the progress display
    # is not left open after a failure part-way through the seeds.
    try:
        for seed in seeds:
            # Run episode for this seed
            episode = orchestrator.run_episode(
                seed=seed,
                max_rounds=max_rounds,
                enable_role_swap=True
            )
            episodes.append(episode)

            # Log result
            status = "✓ Attack succeeded" if episode['success'] else "✗ Attack failed"
            print(f"Seed {seed['id']}: {status}")
            if episode['success']:
                print(f" └── Violated rules: {episode['violated_rules']}")

            tracker.update(1)
    finally:
        tracker.close()

    return episodes
def construct_dpo_pairs(episodes: list) -> list:
    """
    Turn every successful attack episode into one DPO preference pair.

    Episodes whose ``'success'`` value is falsy are skipped.

    Args:
        episodes: List of episode results

    Returns:
        List of dicts with the keys ``'prompt'``, ``'chosen'``, ``'rejected'``,
        ``'spec_id'`` and ``'violated_rules'``, in episode order.
    """
    print("\nStep 4: Constructing DPO pairs...")

    def _to_pair(ep):
        # The round in which the attack succeeded supplies the prompt and the
        # violating (rejected) answer; the compliant (chosen) answer is stored
        # on the episode itself.
        winning_round = ep['attack_round']
        return {
            'prompt': winning_round['attacker_prompt'],
            'chosen': ep['compliant_response'],
            'rejected': winning_round['defender_response'],
            'spec_id': ep['spec_id'],
            'violated_rules': ep['violated_rules'],
        }

    dpo_pairs = [_to_pair(ep) for ep in episodes if ep['success']]

    print(f"✓ Generated {len(dpo_pairs)} DPO pairs")
    return dpo_pairs
def analyze_results(episodes: list):
    """
    Print analysis of red-team results.

    Reports the number of episodes, how many attacks succeeded (with the
    success percentage), how many distinct rules were violated, and the five
    most frequently violated rules.

    Args:
        episodes: List of episode results. Each episode must provide a
            ``'success'`` key; successful episodes must also provide
            ``'violated_rules'``.

    Returns:
        None. All results are written to standard output.
    """
    from collections import Counter

    print("\nStep 5: Analyzing results...")
    print("-" * 50)

    total = len(episodes)
    successful_episodes = [e for e in episodes if e['success']]
    successful = len(successful_episodes)

    # Only successful attacks contribute violated rules.
    rule_counts = Counter(
        rule for e in successful_episodes for rule in e['violated_rules']
    )

    # With no episodes there is no meaningful rate; report 0.0% rather than
    # dividing by zero.
    success_rate = 100 * successful / total if total else 0.0

    print(f"Total episodes: {total}")
    print(f"Successful attacks: {successful} ({success_rate:.1f}%)")
    print(f"Unique rules violated: {len(rule_counts)}")
    print("\nTop violated rules:")
    for rule, count in rule_counts.most_common(5):
        print(f" {rule}: {count} times")
def save_outputs(episodes: list, dpo_pairs: list, output_dir: str):
    """
    Write the episodes and the DPO pairs into ``output_dir``.

    The directory (including missing parents) is created if necessary.
    Episodes go to ``episodes.jsonl`` as one JSON document per line; the DPO
    pairs go to ``dpo_dataset.json`` as a single indented JSON document.

    Args:
        episodes: List of episode results (must be JSON serializable)
        dpo_pairs: List of DPO preference pairs (must be JSON serializable)
        output_dir: Directory that receives both files
    """
    print("\nStep 6: Saving outputs...")

    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Episodes: JSON Lines, one episode per line.
    episodes_file = target_dir.joinpath("episodes.jsonl")
    with episodes_file.open('w') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in episodes)
    print(f"✓ Saved episodes to {episodes_file}")

    # DPO pairs: one pretty-printed JSON array.
    dpo_file = target_dir.joinpath("dpo_dataset.json")
    with dpo_file.open('w') as sink:
        json.dump(dpo_pairs, sink, indent=2)
    print(f"✓ Saved DPO pairs to {dpo_file}")
def main():
    """
    Run the complete red-team pipeline.

    Loads ``config.json``, builds the components and the orchestrator, runs
    the red-team loop over the seeds in ``output/seeds.json`` (at most five
    rounds per seed), then builds the DPO pairs, prints the analysis and
    saves everything under ``output``.
    """
    banner = "=" * 50
    print(banner)
    print("Red Team Pipeline Example")
    print(banner)

    # Configuration first: the log level comes from the loaded config.
    config = load_config("config.json")
    setup_logging(config['global']['log_level'])

    # Build the individual components, then wire them into the orchestrator.
    components = setup_components(config)
    context_pool, planner, agent, safety_judge, quality_judge = components
    orchestrator = RedTeamOrchestrator(
        config=config,
        planner=planner,
        agent=agent,
        safety_judge=safety_judge,
        quality_judge=quality_judge,
        context_pool=context_pool,
    )

    # Adversarial loop over every seed.
    seeds = load_seeds("output/seeds.json")
    episodes = run_red_team_loop(orchestrator, seeds, max_rounds=5)

    # Post-processing: preference pairs, summary, persisted outputs.
    dpo_pairs = construct_dpo_pairs(episodes)
    analyze_results(episodes)
    save_outputs(episodes, dpo_pairs, "output")

    print("\n" + banner)
    print("✓ Red-team pipeline complete!")
    print(banner)


# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
Expected Output
==================================================
Red Team Pipeline Example
==================================================
Step 1: Initializing components...
✓ Context pool initialized
✓ Planner initialized
✓ Dual-role agent initialized
✓ Judges initialized
Step 2: Loading seeds...
✓ Loaded 100 seeds
Step 3: Running red-team loop...
--------------------------------------------------
Seed seed_001: ✓ Attack succeeded
└── Violated rules: ['R12', 'R15']
Seed seed_002: ✗ Attack failed
Seed seed_003: ✓ Attack succeeded
└── Violated rules: ['R8']
...
Processing seeds: 100%|██████████| 100/100
Step 4: Constructing DPO pairs...
✓ Generated 42 DPO pairs
Step 5: Analyzing results...
--------------------------------------------------
Total episodes: 100
Successful attacks: 42 (42.0%)
Unique rules violated: 15
Top violated rules:
R12: 18 times
R8: 12 times
R15: 9 times
R3: 7 times
R21: 5 times
Step 6: Saving outputs...
✓ Saved episodes to output/episodes.jsonl
✓ Saved DPO pairs to output/dpo_dataset.json
==================================================
✓ Red-team pipeline complete!
==================================================
CLI Alternative
# Run red-team with default settings
specalign redteam output/seeds.json
# With custom parameters
specalign redteam output/seeds.json \
--max-rounds 5 \
--role-swap \
--max-seeds 100 \
--output output/
Key Takeaways
RedTeamOrchestrator coordinates all components in the adversarial loop
Context Pool stores successful attacks for strategy improvement
Planner uses historical successes to generate better attack strategies
Role swapping allows agents to learn from both attacker and defender perspectives
DPO pairs are automatically generated from successful attacks for alignment training
Next Steps
DPO Dataset Generation - Advanced DPO dataset construction strategies
Configuration Reference - Fine-tune red-team parameters
Core Module - Core module API reference