Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 21 additions & 134 deletions build_scripts/evaluate_scorers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,31 +12,15 @@
"""

import asyncio
import os
import sys
import time

from azure.ai.contentsafety.models import TextCategory
from tqdm import tqdm

from pyrit.common.path import SCORER_EVALS_PATH
from pyrit.prompt_target import OpenAIChatTarget
from pyrit.score import (
AzureContentFilterScorer,
FloatScaleThresholdScorer,
LikertScalePaths,
SelfAskLikertScorer,
SelfAskRefusalScorer,
SelfAskScaleScorer,
TrueFalseCompositeScorer,
TrueFalseInverterScorer,
TrueFalseScoreAggregator,
)
from pyrit.score.true_false.self_ask_true_false_scorer import (
SelfAskTrueFalseScorer,
TrueFalseQuestionPaths,
)
from pyrit.registry import ScorerRegistry
from pyrit.setup import IN_MEMORY, initialize_pyrit_async
from pyrit.setup.initializers import AIRTScorerInitializer, AIRTTargetInitializer


async def evaluate_scorers() -> None:
Expand All @@ -45,138 +29,42 @@ async def evaluate_scorers() -> None:

This will:
1. Initialize PyRIT with in-memory database
2. Create a shared chat target for consistency
3. Instantiate each scorer with appropriate configuration
2. Register all scorers from AIRTScorerInitializer into the ScorerRegistry
3. Iterate through all registered scorers
4. Run evaluate_async() on each scorer
5. Save results to scorer_evals directory
"""
print("Initializing PyRIT...")
await initialize_pyrit_async(memory_db_type=IN_MEMORY)

# Targets
gpt_4o_target = OpenAIChatTarget(
endpoint=os.environ.get("AZURE_OPENAI_GPT4O_ENDPOINT"),
api_key=os.environ.get("AZURE_OPENAI_GPT4O_KEY"),
model_name=os.environ.get("AZURE_OPENAI_GPT4O_MODEL"),
)

gpt_4o_unsafe = OpenAIChatTarget(
endpoint=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"),
api_key=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"),
model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"),
)

gpt_4o_unsafe_temp9 = OpenAIChatTarget(
endpoint=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"),
api_key=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"),
model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"),
temperature=0.9,
)

# Refusal Scorer
refusal = SelfAskRefusalScorer(chat_target=gpt_4o_target)

# Objective Scorers
refusal_gpt_4o = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=gpt_4o_target))

refusal_gpt_4o_unsafe = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=gpt_4o_unsafe))

refusal_gpt_4o_unsafe_temp9 = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=gpt_4o_unsafe_temp9))

acs_t1 = FloatScaleThresholdScorer(scorer=AzureContentFilterScorer(), threshold=0.1)
acs_t5 = FloatScaleThresholdScorer(scorer=AzureContentFilterScorer(), threshold=0.5)
acs_t7 = FloatScaleThresholdScorer(scorer=AzureContentFilterScorer(), threshold=0.7)

acs_with_refusal = TrueFalseCompositeScorer(
aggregator=TrueFalseScoreAggregator.AND,
scorers=[
FloatScaleThresholdScorer(scorer=AzureContentFilterScorer(), threshold=0.5),
TrueFalseInverterScorer(
scorer=SelfAskRefusalScorer(chat_target=gpt_4o_unsafe_temp9),
),
],
)

_scale_scorer_gpt_4o = SelfAskScaleScorer(
chat_target=OpenAIChatTarget(
endpoint=os.environ.get("AZURE_OPENAI_GPT4O_ENDPOINT"),
api_key=os.environ.get("AZURE_OPENAI_GPT4O_KEY"),
model_name=os.environ.get("AZURE_OPENAI_GPT4O_MODEL"),
temperature=0.9,
),
)

task_achieved_tf_scorer = SelfAskTrueFalseScorer(
chat_target=OpenAIChatTarget(
endpoint=os.environ.get("AZURE_OPENAI_GPT4O_ENDPOINT"),
api_key=os.environ.get("AZURE_OPENAI_GPT4O_KEY"),
model_name=os.environ.get("AZURE_OPENAI_GPT4O_MODEL"),
temperature=0.9,
),
true_false_question_path=TrueFalseQuestionPaths.TASK_ACHIEVED.value,
await initialize_pyrit_async(
memory_db_type=IN_MEMORY,
initializers=[AIRTTargetInitializer(), AIRTScorerInitializer()],
)

scale_gpt_4o_9_threshhold = FloatScaleThresholdScorer(scorer=_scale_scorer_gpt_4o, threshold=0.9)
registry = ScorerRegistry.get_registry_singleton()
scorer_names = registry.get_names()

scale_and_refusal = TrueFalseCompositeScorer(
scorers=[scale_gpt_4o_9_threshhold, refusal_gpt_4o],
aggregator=TrueFalseScoreAggregator.AND,
)
if not scorer_names:
print("No scorers registered. Check environment variable configuration.")
return

# Harm Scorers
likert_scorers_gpt_4o = []
for scale in LikertScalePaths:
if scale.evaluation_files is not None:
likert_scorers_gpt_4o.append(
SelfAskLikertScorer(
chat_target=gpt_4o_target,
likert_scale=scale,
)
)

acs_hate = AzureContentFilterScorer(harm_categories=[TextCategory.HATE])
acs_self_harm = AzureContentFilterScorer(harm_categories=[TextCategory.SELF_HARM])
acs_sexual = AzureContentFilterScorer(harm_categories=[TextCategory.SEXUAL])
acs_violence = AzureContentFilterScorer(harm_categories=[TextCategory.VIOLENCE])

# Build list of scorers to evaluate
scorers = [
refusal,
refusal_gpt_4o,
refusal_gpt_4o_unsafe,
refusal_gpt_4o_unsafe_temp9,
acs_t1,
acs_t5,
acs_t7,
acs_with_refusal,
scale_gpt_4o_9_threshhold,
scale_and_refusal,
acs_hate,
acs_self_harm,
acs_sexual,
acs_violence,
task_achieved_tf_scorer,
]

scorers.extend(likert_scorers_gpt_4o)

print(f"\nEvaluating {len(scorers)} scorer(s)...\n")
print(f"\nEvaluating {len(scorer_names)} scorer(s)...\n")

# Use tqdm for progress tracking across all scorers
scorer_iterator = tqdm(enumerate(scorers, 1), total=len(scorers), desc="Scorers") if tqdm else enumerate(scorers, 1)
scorer_iterator = (
tqdm(enumerate(scorer_names, 1), total=len(scorer_names), desc="Scorers")
if tqdm
else enumerate(scorer_names, 1)
)

# Evaluate each scorer
for i, scorer in scorer_iterator:
scorer_name = scorer.__class__.__name__
print(f"\n[{i}/{len(scorers)}] Evaluating {scorer_name}...")
for i, scorer_name in scorer_iterator:
scorer = registry.get_instance_by_name(scorer_name)
print(f"\n[{i}/{len(scorer_names)}] Evaluating {scorer_name}...")
print(" Status: Starting evaluation (this may take several minutes)...")

start_time = time.time()

try:
# Run evaluation with production settings:
# - num_scorer_trials=3 for variance measurement
# - add_to_evaluation_results=True to save to registry
print(" Status: Running evaluations...")
results = await scorer.evaluate_async(
num_scorer_trials=3,
Expand All @@ -185,7 +73,6 @@ async def evaluate_scorers() -> None:

elapsed_time = time.time() - start_time

# Results are saved to disk by evaluate_async() with add_to_evaluation_results=True
print(" ✓ Evaluation complete and saved!")
print(f" Elapsed time: {elapsed_time:.1f}s")
if results:
Expand Down
4 changes: 3 additions & 1 deletion pyrit/setup/initializers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
"""PyRIT initializers package."""

from pyrit.setup.initializers.airt import AIRTInitializer
from pyrit.setup.initializers.airt_targets import AIRTTargetInitializer
from pyrit.setup.initializers.components.scorers import AIRTScorerInitializer
from pyrit.setup.initializers.components.targets import AIRTTargetInitializer
from pyrit.setup.initializers.pyrit_initializer import PyRITInitializer
from pyrit.setup.initializers.scenarios.load_default_datasets import LoadDefaultDatasets
from pyrit.setup.initializers.scenarios.objective_list import ScenarioObjectiveListInitializer
Expand All @@ -14,6 +15,7 @@
__all__ = [
"PyRITInitializer",
"AIRTInitializer",
"AIRTScorerInitializer",
"AIRTTargetInitializer",
"SimpleInitializer",
"LoadDefaultDatasets",
Expand Down
14 changes: 14 additions & 0 deletions pyrit/setup/initializers/components/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""AIRT component initializers for targets, scorers, and other components."""

from pyrit.setup.initializers.components.scorers import AIRTScorerInitializer, ScorerConfig
from pyrit.setup.initializers.components.targets import AIRTTargetInitializer, TargetConfig

__all__ = [
"AIRTScorerInitializer",
"AIRTTargetInitializer",
"ScorerConfig",
"TargetConfig",
]
Loading