Red Team Pipeline

Complete guide to running adversarial red-team testing.

Overview

In this tutorial, you will learn:

  • How the multi-agent red-team system works

  • How to configure the adversarial pipeline

  • How to run and monitor red-team sessions

  • How to analyze attack success patterns

Prerequisites

  • SpecAlign installed with API credentials configured

  • Generated seeds file (see Specification Generation)

  • Understanding of the red-team workflow

Complete Example

"""
Red Team Pipeline Example

This script demonstrates the complete adversarial testing workflow,
from loading seeds to generating DPO preference pairs.
"""

import json
from pathlib import Path

from specalign.core import (
    RedTeamOrchestrator,
    ContextPool,
    create_planner,
    create_dual_agent,
    create_judges,
)
from specalign.config import load_config
from specalign.utils.logging import setup_logging, ProgressTracker


def setup_components(config: dict):
    """
    Build every object the red-team loop depends on.

    Components are created in a fixed order (context pool, planner,
    dual-role agent, judges) and a confirmation line is printed after
    each one is ready.

    Args:
        config: Configuration dictionary. The context pool settings are
            read from ``config['redteam']['context_pool']``; the full
            dictionary is passed on to the planner, agent and judge
            factories.

    Returns:
        Tuple ``(context_pool, planner, agent, safety_judge, quality_judge)``
    """
    print("Step 1: Initializing components...")

    # The pool of successful attacks is created first because the
    # planner receives it as an argument.
    pool_settings = config['redteam']['context_pool']
    pool = ContextPool(
        max_size=pool_settings['max_size'],
        similarity_threshold=pool_settings['similarity_threshold'],
    )
    print("✓ Context pool initialized")

    # Attack-strategy planner, wired to the pool above
    strategist = create_planner(config=config, context_pool=pool)
    print("✓ Planner initialized")

    # A single agent that can play either the attacker or the defender
    dual_agent = create_dual_agent(config)
    print("✓ Dual-role agent initialized")

    # create_judges yields the safety judge first, then the quality judge
    safety, quality = create_judges(config)
    print("✓ Judges initialized")

    return pool, strategist, dual_agent, safety, quality


def load_seeds(seeds_path: str) -> list:
    """
    Load seed prompts from a JSON file.

    Args:
        seeds_path: Path to the JSON seeds file

    Returns:
        The parsed JSON content (the pipeline expects a list of seeds)

    Raises:
        OSError: If the file cannot be opened
        json.JSONDecodeError: If the file is not valid JSON
    """
    print("\nStep 2: Loading seeds...")

    # Pin the encoding: without it, open() falls back to the platform
    # locale, which mis-decodes non-ASCII prompts on non-UTF-8 systems.
    with open(seeds_path, 'r', encoding='utf-8') as f:
        seeds = json.load(f)

    print(f"✓ Loaded {len(seeds)} seeds")
    return seeds


def run_red_team_loop(
    orchestrator: RedTeamOrchestrator,
    seeds: list,
    max_rounds: int = 5
) -> list:
    """
    Execute the red-team adversarial loop.

    One episode is run per seed, always with role swapping enabled. The
    outcome of each episode is printed as it completes, and the progress
    tracker is advanced by one.

    Args:
        orchestrator: The RedTeamOrchestrator instance
        seeds: List of seed prompts; each seed is indexed with ``'id'``
            for logging
        max_rounds: Maximum rounds per seed

    Returns:
        List of episode results, in the same order as ``seeds``. Each
        episode is indexed with ``'success'`` and, for successful
        attacks, ``'violated_rules'``.
    """
    print("\nStep 3: Running red-team loop...")
    print("-" * 50)

    episodes = []
    tracker = ProgressTracker(total=len(seeds), desc="Processing seeds")

    # try/finally guarantees the tracker is closed even when an episode
    # raises part-way through the seed list.
    try:
        for seed in seeds:
            # Run episode for this seed
            episode = orchestrator.run_episode(
                seed=seed,
                max_rounds=max_rounds,
                enable_role_swap=True
            )

            episodes.append(episode)

            # Log result
            status = "✓ Attack succeeded" if episode['success'] else "✗ Attack failed"
            print(f"Seed {seed['id']}: {status}")

            if episode['success']:
                print(f"  └── Violated rules: {episode['violated_rules']}")

            tracker.update(1)
    finally:
        tracker.close()

    return episodes


def construct_dpo_pairs(episodes: list) -> list:
    """
    Turn every successful attack episode into a DPO preference pair.

    Episodes whose ``'success'`` value is falsy are skipped. For the
    rest, the prompt and the rejected (violating) response come from the
    episode's ``'attack_round'``, while the chosen (compliant) response,
    spec id and violated rules come from the episode itself.

    Args:
        episodes: List of episode results

    Returns:
        List of DPO preference pairs, one dict per successful episode,
        in input order
    """
    print("\nStep 4: Constructing DPO pairs...")

    pairs = []
    for ep in episodes:
        if ep['success']:
            # The round in which the attack got through
            winning_round = ep['attack_round']
            pairs.append(dict(
                prompt=winning_round['attacker_prompt'],
                chosen=ep['compliant_response'],  # compliant answer
                rejected=winning_round['defender_response'],  # violating answer
                spec_id=ep['spec_id'],
                violated_rules=ep['violated_rules'],
            ))

    print(f"✓ Generated {len(pairs)} DPO pairs")
    return pairs


def analyze_results(episodes: list):
    """
    Print a summary of red-team results.

    Reports the number of episodes, the number and percentage of
    successful attacks, how many distinct rules were violated, and the
    five most frequently violated rules.

    Args:
        episodes: List of episode results. Each episode is indexed with
            ``'success'``; successful ones are also indexed with
            ``'violated_rules'``. May be empty.

    Returns:
        None. The summary is written to standard output.
    """
    from collections import Counter

    print("\nStep 5: Analyzing results...")
    print("-" * 50)

    total = len(episodes)
    successful = sum(1 for e in episodes if e['success'])

    # Count every rule violated by a successful attack
    rule_counts = Counter(
        rule
        for e in episodes
        if e['success']
        for rule in e['violated_rules']
    )

    # An empty run would otherwise divide by zero; report 0.0% instead.
    success_rate = 100 * successful / total if total else 0.0

    print(f"Total episodes: {total}")
    print(f"Successful attacks: {successful} ({success_rate:.1f}%)")
    print(f"Unique rules violated: {len(rule_counts)}")
    print("\nTop violated rules:")
    for rule, count in rule_counts.most_common(5):
        print(f"  {rule}: {count} times")


def save_outputs(episodes: list, dpo_pairs: list, output_dir: str):
    """
    Write the pipeline results into ``output_dir``.

    The directory (and any missing parents) is created if needed. Two
    files are produced: ``episodes.jsonl`` with one JSON document per
    episode per line, and ``dpo_dataset.json`` holding all DPO pairs as
    a single JSON document indented by two spaces.

    Args:
        episodes: List of episode results
        dpo_pairs: List of DPO preference pairs
        output_dir: Directory that receives both files
    """
    print("\nStep 6: Saving outputs...")

    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Episodes go out as JSON Lines, serialized lazily one at a time
    jsonl_path = target_dir / "episodes.jsonl"
    with jsonl_path.open('w') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in episodes)
    print(f"✓ Saved episodes to {jsonl_path}")

    # DPO pairs go out as one pretty-printed JSON array
    pairs_path = target_dir / "dpo_dataset.json"
    with pairs_path.open('w') as sink:
        json.dump(dpo_pairs, sink, indent=2)
    print(f"✓ Saved DPO pairs to {pairs_path}")


def main():
    """
    Drive the whole red-team pipeline from configuration to saved output.

    Reads ``config.json``, builds the components and the orchestrator,
    runs the loop over the seeds in ``output/seeds.json`` with at most
    five rounds per seed, then builds DPO pairs, prints the analysis and
    saves everything under ``output``.
    """
    rule = "=" * 50
    print(rule)
    print("Red Team Pipeline Example")
    print(rule)

    # Configuration drives both logging and component construction
    cfg = load_config("config.json")
    setup_logging(cfg['global']['log_level'])

    # Build the individual components, then hand them to the orchestrator
    pool, planner, agent, safety, quality = setup_components(cfg)
    orchestrator = RedTeamOrchestrator(
        config=cfg,
        planner=planner,
        agent=agent,
        safety_judge=safety,
        quality_judge=quality,
        context_pool=pool,
    )

    # Adversarial loop over the generated seeds
    episodes = run_red_team_loop(
        orchestrator,
        load_seeds("output/seeds.json"),
        max_rounds=5,
    )

    # Post-processing: preference pairs, summary, persistence
    pairs = construct_dpo_pairs(episodes)
    analyze_results(episodes)
    save_outputs(episodes, pairs, "output")

    print(f"\n{rule}")
    print("✓ Red-team pipeline complete!")
    print(rule)


# Entry-point guard: run the pipeline only when this file is executed
# as a script, not when it is imported as a module.
if __name__ == "__main__":
    main()

Expected Output

==================================================
Red Team Pipeline Example
==================================================
Step 1: Initializing components...
✓ Context pool initialized
✓ Planner initialized
✓ Dual-role agent initialized
✓ Judges initialized

Step 2: Loading seeds...
✓ Loaded 100 seeds

Step 3: Running red-team loop...
--------------------------------------------------
Seed seed_001: ✓ Attack succeeded
  └── Violated rules: ['R12', 'R15']
Seed seed_002: ✗ Attack failed
Seed seed_003: ✓ Attack succeeded
  └── Violated rules: ['R8']
...
Processing seeds: 100%|██████████| 100/100

Step 4: Constructing DPO pairs...
✓ Generated 42 DPO pairs

Step 5: Analyzing results...
--------------------------------------------------
Total episodes: 100
Successful attacks: 42 (42.0%)
Unique rules violated: 15

Top violated rules:
  R12: 18 times
  R8: 12 times
  R15: 9 times
  R3: 7 times
  R21: 5 times

Step 6: Saving outputs...
✓ Saved episodes to output/episodes.jsonl
✓ Saved DPO pairs to output/dpo_dataset.json

==================================================
✓ Red-team pipeline complete!
==================================================

CLI Alternative

# Run red-team with default settings
specalign redteam output/seeds.json

# With custom parameters
specalign redteam output/seeds.json \
    --max-rounds 5 \
    --role-swap \
    --max-seeds 100 \
    --output output/

Key Takeaways

  1. RedTeamOrchestrator coordinates all components in the adversarial loop

  2. Context Pool stores successful attacks for strategy improvement

  3. Planner uses historical successes to generate better attack strategies

  4. Role swapping allows agents to learn from both attacker and defender perspectives

  5. DPO pairs are automatically generated from successful attacks for alignment training

Next Steps