Evaluation Example
Script from advanced-evaluation
Source Code
metadata = {
    "id": "code:advanced.evaluation.evaluationexample",
    "name": "Evaluation Example",
    "description": "Script from advanced-evaluation",
    "language": "python",
    "packages": [],
    "args": []
}
"""
Advanced Evaluation Example
This script demonstrates the core evaluation patterns from the advanced-evaluation skill.
It uses pseudocode that works across Python environments without specific dependencies.
"""
# =============================================================================
# DIRECT SCORING EXAMPLE
# =============================================================================
def direct_scoring_example():
"""
Direct scoring: Rate a single response against defined criteria.
Best for objective criteria like accuracy, completeness, instruction following.
"""
# Input
prompt = "Explain quantum entanglement to a high school student"
response = """
Quantum entanglement is like having two magical coins that are connected.
When you flip one and it lands on heads, the other instantly shows tails,
no matter how far apart they are. Scientists call this "spooky action at a distance."
"""
    criteria = [
        {"name": "Accuracy", "description": "Scientific correctness", "weight": 0.4},
        {"name": "Clarity", "description": "Understandable for audience", "weight": 0.3},
        {"name": "Engagement", "description": "Interesting and memorable", "weight": 0.3},
    ]

    # System prompt for the evaluator
    system_prompt = """You are an expert evaluator. Assess the response against each criterion.
For each criterion:
1. Find specific evidence in the response
2. Score according to the rubric (1-5 scale)
3. Justify your score with evidence
4. Suggest one specific improvement
Be objective and consistent. Base scores on explicit evidence."""
    # User prompt structure (the f-string body is kept unindented so the rendered prompt stays clean)
    user_prompt = f"""## Original Prompt
{prompt}
## Response to Evaluate
{response}
## Criteria
1. **Accuracy** (weight: 0.4): Scientific correctness
2. **Clarity** (weight: 0.3): Understandable for audience
3. **Engagement** (weight: 0.3): Interesting and memorable
## Output Format
Respond with valid JSON:
{{
"scores": [
{{
"criterion": "Accuracy",
"score": 4,
"evidence": ["quote or observation"],
"justification": "why this score",
"improvement": "specific suggestion"
}}
],
"summary": {{
"assessment": "overall quality summary",
"strengths": ["strength 1"],
"weaknesses": ["weakness 1"]
}}
}}"""
    # Expected output structure (hard-coded stand-in for the judge model's JSON reply)
    expected_output = {
        "scores": [
            {
                "criterion": "Accuracy",
                "score": 4,
                "evidence": ["Correctly uses analogy", "Mentions spooky action at a distance"],
                "justification": "Core concept is correct, analogy is appropriate",
                "improvement": "Could mention it's a quantum mechanical phenomenon"
            },
            {
                "criterion": "Clarity",
                "score": 5,
                "evidence": ["Simple coin analogy", "No jargon"],
                "justification": "Appropriate for high school level",
                "improvement": "None needed"
            },
            {
                "criterion": "Engagement",
                "score": 4,
                "evidence": ["Magical coins", "Spooky action quote"],
                "justification": "Memorable imagery and Einstein quote",
                "improvement": "Could add a real-world application"
            }
        ],
        "summary": {
            "assessment": "Good explanation suitable for the target audience",
            "strengths": ["Clear analogy", "Age-appropriate language"],
            "weaknesses": ["Could be more comprehensive"]
        }
    }
    # Calculate the weighted overall score,
    # e.g. (4 * 0.4 + 5 * 0.3 + 4 * 0.3) / 1.0 = 4.3
    total_weight = sum(c["weight"] for c in criteria)
    weighted_score = sum(
        s["score"] * next(c["weight"] for c in criteria if c["name"] == s["criterion"])
        for s in expected_output["scores"]
    ) / total_weight
    print(f"Weighted Score: {weighted_score:.2f}/5")
    return expected_output
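
# Illustrative sketch (an assumption, not part of the skill's API): how the prompts
# built above might be sent to a judge model. `call_llm` is a hypothetical callable
# standing in for whatever client library you use; swap in your own API call.
def run_direct_scoring(call_llm, system_prompt, user_prompt):
    """Send the evaluator prompts to a judge model and parse its JSON verdict."""
    import json

    raw = call_llm(system=system_prompt, user=user_prompt)  # assumed signature
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # Judge models occasionally wrap JSON in prose or code fences;
        # add retry or repair logic here if needed.
        return None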
# =============================================================================
# PAIRWISE COMPARISON WITH POSITION BIAS MITIGATION
# =============================================================================
def pairwise_comparison_example():
"""
Pairwise comparison: Compare two responses and select the better one.
Includes position swapping to mitigate position bias.
Best for subjective preferences like tone, style, persuasiveness.
"""
prompt = "Explain machine learning to a beginner"
response_a = """
Machine learning is a subset of artificial intelligence that enables
systems to learn and improve from experience without being explicitly
programmed. It uses statistical techniques to give computers the ability
to identify patterns in data.
"""
response_b = """
Imagine teaching a dog a new trick. You show the dog what to do, give
treats when it's right, and eventually it learns. Machine learning works
similarly - we show computers lots of examples, tell them when they're
right, and they learn to recognize patterns on their own.
"""
criteria = ["clarity", "accessibility", "accuracy"]
# System prompt emphasizing bias awareness
system_prompt = """You are an expert evaluator comparing two AI responses.
CRITICAL INSTRUCTIONS:
- Do NOT prefer responses because they are longer
- Do NOT prefer responses based on position (first vs second)
- Focus ONLY on quality according to the specified criteria
- Ties are acceptable when responses are genuinely equivalent"""
    # Build the comparison prompt for a given ordering of the two responses
    def evaluate_pass(first_response, second_response, first_label, second_label):
        user_prompt = f"""## Original Prompt
{prompt}
## Response {first_label}
{first_response}
## Response {second_label}
{second_response}
## Comparison Criteria
{', '.join(criteria)}
## Output Format
{{
"comparison": [
{{"criterion": "clarity", "winner": "A|B|TIE", "reasoning": "..."}}
],
"result": {{
"winner": "A|B|TIE",
"confidence": 0.0-1.0,
"reasoning": "overall reasoning"
}}
}}"""
        return user_prompt
    # Position bias mitigation protocol: build the prompt for each ordering and
    # (in a real run) send both to the judge. The results below are hard-coded
    # stand-ins for those two judge calls.
    pass1_prompt = evaluate_pass(response_a, response_b, "A", "B")
    print("Pass 1: A in first position")
    pass1_result = {"winner": "B", "confidence": 0.8}

    pass2_prompt = evaluate_pass(response_b, response_a, "A", "B")
    print("Pass 2: B in first position (swapped)")
    pass2_result = {"winner": "A", "confidence": 0.75}  # label "A" here refers to response B

    # Map the pass-2 verdict back to the original labels (swap A and B)
    def map_winner(winner):
        return {"A": "B", "B": "A", "TIE": "TIE"}[winner]

    pass2_mapped = map_winner(pass2_result["winner"])
    print(f"Pass 2 mapped winner: {pass2_mapped}")

    # The verdict stands only if both orderings agree
    consistent = pass1_result["winner"] == pass2_mapped
    if consistent:
        final_result = {
            "winner": pass1_result["winner"],
            "confidence": (pass1_result["confidence"] + pass2_result["confidence"]) / 2,
            "position_consistent": True
        }
    else:
        # Disagreement across orderings signals position bias; fall back to a tie
        final_result = {
            "winner": "TIE",
            "confidence": 0.5,
            "position_consistent": False,
            "bias_detected": True
        }

    print(f"\nFinal Result: {final_result}")
    return final_result
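
# Illustrative sketch (an assumption, not part of the skill): a reusable wrapper that runs
# any judge callable in both orderings and applies the same consistency rule as above.
# `judge` is a hypothetical callable returning {"winner": "A"|"B"|"TIE", "confidence": float}.
def judge_with_position_swap(judge, prompt, response_a, response_b):
    first = judge(prompt, response_a, response_b)   # response A shown first
    second = judge(prompt, response_b, response_a)  # response B shown first
    swap = {"A": "B", "B": "A", "TIE": "TIE"}
    second_mapped = swap[second["winner"]]
    if first["winner"] == second_mapped:
        return {
            "winner": first["winner"],
            "confidence": (first["confidence"] + second["confidence"]) / 2,
            "position_consistent": True,
        }
    # Disagreement across orderings suggests position bias; fall back to a tie.
    return {"winner": "TIE", "confidence": 0.5, "position_consistent": False, "bias_detected": True}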
# =============================================================================
# RUBRIC GENERATION
# =============================================================================
def rubric_generation_example():
"""
Generate a domain-specific scoring rubric.
Rubrics reduce evaluation variance by 40-60%.
"""
criterion_name = "Code Readability"
criterion_description = "How easy the code is to understand and maintain"
domain = "software engineering"
scale = "1-5"
strictness = "balanced"
system_prompt = f"""You are an expert in creating evaluation rubrics.
Create clear, actionable rubrics with distinct boundaries between levels.
Strictness: {strictness}
- lenient: Lower bar for passing scores
- balanced: Fair, typical expectations
- strict: High standards, critical evaluation"""
user_prompt = f"""Create a scoring rubric for:
**Criterion**: {criterion_name}
**Description**: {criterion_description}
**Scale**: {scale}
**Domain**: {domain}
Generate:
1. Clear descriptions for each score level
2. Specific characteristics that define each level
3. Brief example text for each level
4. General scoring guidelines
5. Edge cases with guidance"""
    # Expected rubric structure (hard-coded stand-in for the generated rubric)
    rubric = {
        "criterion": criterion_name,
        "scale": {"min": 1, "max": 5},
        "levels": [
            {
                "score": 1,
                "label": "Poor",
                "description": "Code is difficult to understand without significant effort",
                "characteristics": [
                    "No meaningful variable or function names",
                    "No comments or documentation",
                    "Deeply nested or convoluted logic"
                ],
                "example": "def f(x): return x[0]*x[1]+x[2]"
            },
            {
                "score": 3,
                "label": "Adequate",
                "description": "Code is understandable with some effort",
                "characteristics": [
                    "Most variables have meaningful names",
                    "Basic comments for complex sections",
                    "Logic is followable but could be cleaner"
                ],
                "example": "def calc_total(items): # calculate sum\n total = 0\n for i in items: total += i\n return total"
            },
            {
                "score": 5,
                "label": "Excellent",
                "description": "Code is immediately clear and maintainable",
                "characteristics": [
                    "All names are descriptive and consistent",
                    "Comprehensive documentation",
                    "Clean, modular structure"
                ],
                "example": "def calculate_total_price(items: List[Item]) -> Decimal:\n '''Calculate the total price of all items.'''\n return sum(item.price for item in items)"
            }
        ],
        "scoring_guidelines": [
            "Focus on readability, not cleverness",
            "Consider the intended audience (team skill level)",
            "Consistency matters more than style preference"
        ],
        "edge_cases": [
            {
                "situation": "Code uses domain-specific abbreviations",
                "guidance": "Score based on readability for domain experts, not a general audience"
            },
            {
                "situation": "Code is auto-generated",
                "guidance": "Apply the same standards but note it in the evaluation"
            }
        ]
    }
print("Generated Rubric:")
for level in rubric["levels"]:
print(f" {level['score']}: {level['label']} - {level['description']}")
return rubric
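
# Illustrative sketch (an assumption): rendering a generated rubric into prompt text so a
# judge model can apply it during direct scoring. Field names follow the rubric dict above.
def rubric_to_prompt(rubric):
    lines = [
        f"Scoring rubric for {rubric['criterion']} "
        f"({rubric['scale']['min']}-{rubric['scale']['max']} scale):"
    ]
    for level in rubric["levels"]:
        lines.append(f"{level['score']} ({level['label']}): {level['description']}")
        for trait in level["characteristics"]:
            lines.append(f"  - {trait}")
    lines.append("Guidelines: " + "; ".join(rubric["scoring_guidelines"]))
    return "\n".join(lines)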
# =============================================================================
# MAIN
# =============================================================================
if __name__ == "__main__":
print("=" * 60)
print("DIRECT SCORING EXAMPLE")
print("=" * 60)
direct_scoring_example()
print("\n" + "=" * 60)
print("PAIRWISE COMPARISON EXAMPLE")
print("=" * 60)
pairwise_comparison_example()
print("\n" + "=" * 60)
print("RUBRIC GENERATION EXAMPLE")
print("=" * 60)
rubric_generation_example()