Fitness Promotion Strategy

L3
ModelContextProtocolPlaywrightShopping Admin

Develop fitness product promotion campaigns by analyzing sales data, creating targeted offers, configuring promotional rules, and implementing cross-selling strategies in admin dashboard.

Created by Fanqing Meng
2025-08-17
Data ExtractionComparative AnalysisInventory ManagementContent Submission

Model Ranking

Click on the dots to view the trajectory of each task run
Model
Run Results
Pass@4
Pass^4
Avg Time
Avg Turns
Input Tokens
Output Tokens
Total Tokens
Claude
claude-4-1-opus
0
/1
--
235.5s
12.0
675,712
1,525
677,237
Claude
claude-4-sonnet
0
/4
318.2s
12.5
1,073,041
1,738
1,074,779
DeepSeek
deepseek-chat
0
/4
230.4s
13.5
309,883
1,558
311,440
Gemini
gemini-2-5-pro
0
/4
100.0s
10.3
1,045,888
3,579
1,049,466
OpenAI
gpt-5
0
/4
214.9s
12.8
753,453
7,350
760,803
Grok
grok-4
0
/4
63.7s
4.3
-
-
-
MoonshotAI
k2
0
/4
139.1s
11.3
155,821
681
156,502
OpenAI
o3
0
/4
138.6s
12.5
440,609
3,209
443,818
Qwen
qwen-3-coder
0
/4
177.2s
13.3
1,015,086
879
1,015,965

Task State

WebArena
view WebArena environment setup for this task

Instruction



Verify

*.py
Python
import asyncio
import sys
import re
import os
import json
from pathlib import Path

def get_model_response():
    """
    Get the model's response from the MCP_MESSAGES environment variable.
    Returns the last assistant message text.
    """
    messages_path = os.getenv("MCP_MESSAGES")
    print(f"MCP_MESSAGES: {messages_path}")
    if not messages_path:
        print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
        return None
    
    try:
        with open(messages_path, 'r') as f:
            messages = json.load(f)
        
        # Find the last assistant message
        for message in reversed(messages):
            if message.get('role') == 'assistant' and message.get('status') == 'completed':
                content = message.get('content', [])
                for item in content:
                    if item.get('type') == 'output_text':
                        return item.get('text', '')
        
        print("Warning: No assistant response found in messages", file=sys.stderr)
        return None
    except Exception as e:
        print(f"Error reading messages file: {str(e)}", file=sys.stderr)
        return None

def parse_answer_format(text):
    """
    Parse the <answer>...</answer> format from the agent's output.
    Returns a dictionary with the parsed values.
    """
    if not text:
        return None
    
    # Look for <answer>...</answer> pattern
    match = re.search(r'<answer>(.*?)</answer>', text, re.IGNORECASE | re.DOTALL)
    if not match:
        return None
    
    answer_content = match.group(1).strip()
    
    # Parse each line
    result = {}
    lines = answer_content.split('\n')
    
    # Skip the check for exact number of lines - just parse what we have
    # if len(lines) != 13:
    #     print(f"Error: Expected 13 lines in answer, got {len(lines)}", file=sys.stderr)
    #     return None
    
    for line in lines:
        if '|' in line:
            key, value = line.split('|', 1)
            result[key.strip()] = value.strip()
    
    return result

def load_expected_answer(label_path):
    """
    Load the expected answer from label.txt file.
    Returns a dictionary with the expected values.
    """
    try:
        with open(label_path, 'r') as f:
            lines = f.read().strip().split('\n')
        
        expected = {}
        for line in lines:
            if '|' in line:
                key, value = line.split('|', 1)
                expected[key.strip()] = value.strip()
        
        return expected
    except Exception as e:
        print(f"Error reading label file: {str(e)}", file=sys.stderr)
        return None

def compare_answers(model_answer, expected_answer):
    """
    Compare the model's answer with the expected answer.
    Returns True if all key information matches, False otherwise.
    """
    if not model_answer or not expected_answer:
        return False
    
    # Check each expected key
    mismatches = []
    for key, expected_value in expected_answer.items():
        model_value = model_answer.get(key, '')
        
        # Special handling for different types of values
        if key in ['Bestseller1', 'Bestseller2', 'Bestseller3']:
            # Check if all parts match (name:price:quantity:sku:inventory:status)
            if ':' in expected_value and ':' in model_value:
                expected_parts = expected_value.split(':')
                model_parts = model_value.split(':')
                if len(expected_parts) == 6 and len(model_parts) == 6:
                    # Compare each part
                    for i, (exp, mod) in enumerate(zip(expected_parts, model_parts)):
                        if i == 1:  # Price field
                            exp_clean = exp.replace('$', '').replace(',', '')
                            mod_clean = mod.replace('$', '').replace(',', '')
                            if exp_clean != mod_clean:
                                mismatches.append(f"{key} price: expected '{exp}', got '{mod}'")
                        elif i == 4:  # Inventory field (may have decimal places)
                            exp_float = float(exp.replace(',', ''))
                            mod_float = float(mod.replace(',', ''))
                            if abs(exp_float - mod_float) > 0.0001:
                                mismatches.append(f"{key} inventory: expected '{exp}', got '{mod}'")
                        else:
                            if exp.lower() != mod.lower():
                                mismatches.append(f"{key} part {i}: expected '{exp}', got '{mod}'")
                else:
                    mismatches.append(f"{key}: format mismatch - expected '{expected_value}', got '{model_value}'")
            else:
                if expected_value != model_value:
                    mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
        
        elif key == 'LowestInventoryProduct':
            # Check product name and inventory
            if ':' in expected_value and ':' in model_value:
                expected_name, expected_inv = expected_value.rsplit(':', 1)
                model_name, model_inv = model_value.rsplit(':', 1)
                if expected_name.lower() != model_name.lower():
                    mismatches.append(f"{key} name: expected '{expected_name}', got '{model_name}'")
                exp_float = float(expected_inv.replace(',', ''))
                mod_float = float(model_inv.replace(',', ''))
                if abs(exp_float - mod_float) > 0.0001:
                    mismatches.append(f"{key} inventory: expected '{expected_inv}', got '{model_inv}'")
            else:
                if expected_value != model_value:
                    mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
        
        elif key in ['TotalRevenue', 'MinimumPurchaseRule']:
            # For price/amount fields, normalize format
            expected_clean = expected_value.replace('$', '').replace(',', '')
            model_clean = model_value.replace('$', '').replace(',', '')
            if expected_clean != model_clean:
                mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
        
        elif key == 'BestsellerInSearch':
            # Check search term and count
            if expected_value.lower() != model_value.lower():
                mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
        
        elif key == 'PercentageDiscountRule':
            # Check rule name and percentage
            if ':' in expected_value and ':' in model_value:
                expected_name, expected_pct = expected_value.rsplit(':', 1)
                model_name, model_pct = model_value.rsplit(':', 1)
                if expected_name != model_name:
                    mismatches.append(f"{key} name: expected '{expected_name}', got '{model_name}'")
                # Normalize percentage (20% vs 20 vs 0.20)
                exp_pct_clean = expected_pct.replace('%', '').strip()
                mod_pct_clean = model_pct.replace('%', '').strip()
                if exp_pct_clean != mod_pct_clean:
                    mismatches.append(f"{key} percentage: expected '{expected_pct}', got '{model_pct}'")
            else:
                if expected_value != model_value:
                    mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
        
        elif key == 'TopCustomer':
            # Check name:email:group
            if ':' in expected_value and ':' in model_value:
                expected_parts = expected_value.split(':')
                model_parts = model_value.split(':')
                if len(expected_parts) == 3 and len(model_parts) == 3:
                    exp_name, exp_email, exp_group = expected_parts
                    mod_name, mod_email, mod_group = model_parts
                    if exp_name != mod_name:
                        mismatches.append(f"{key} name: expected '{exp_name}', got '{mod_name}'")
                    if exp_email.lower() != mod_email.lower():
                        mismatches.append(f"{key} email: expected '{exp_email}', got '{mod_email}'")
                    if exp_group.lower() != mod_group.lower():
                        mismatches.append(f"{key} group: expected '{exp_group}', got '{mod_group}'")
                else:
                    mismatches.append(f"{key}: format mismatch - expected '{expected_value}', got '{model_value}'")
            else:
                if expected_value != model_value:
                    mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
        
        elif key == 'MostRecentOrderDate':
            # Date format may vary, do flexible comparison
            if expected_value.lower() == 'none' and model_value.lower() == 'none':
                continue
            elif expected_value != model_value:
                # Could add more flexible date parsing here if needed
                mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
        
        else:
            # Exact match for other fields (counts, etc.)
            if str(model_value) != str(expected_value):
                mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
    
    if mismatches:
        print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
        for mismatch in mismatches:
            print(f"✗ {mismatch}", file=sys.stderr)
        return False
    
    print("\n=== Answer Comparison ===", file=sys.stderr)
    print("✓ All key information matches the expected answer", file=sys.stderr)
    return True

async def verify() -> bool:
    """
    Verifies that the bestseller analysis and promotion task has been completed correctly.
    First checks the model's answer against the expected label,
    then optionally verifies the actual state in the Magento Admin.
    """
    # Get the label file path
    label_path = Path(__file__).parent / "label.txt"
    
    # Load expected answer
    expected_answer = load_expected_answer(label_path)
    if not expected_answer:
        print("Error: Could not load expected answer from label.txt", file=sys.stderr)
        return False
    
    # Get model's response from MCP_MESSAGES
    model_response = get_model_response()
    if model_response:
        print("Found model response, parsing answer format...", file=sys.stderr)
        model_answer = parse_answer_format(model_response)
        
        if model_answer:
            print("\n=== Model Answer Parsed ===", file=sys.stderr)
            for key, value in model_answer.items():
                print(f"{key}: {value}", file=sys.stderr)
            
            # Compare answers
            answer_match = compare_answers(model_answer, expected_answer)
            if not answer_match:
                print("\nModel answer does not match expected answer", file=sys.stderr)
                return False
            print("\n✓ Model answer matches expected answer", file=sys.stderr)
            return True
        else:
            print("Warning: Could not parse answer format from model response", file=sys.stderr)
            return False
    else:
        print("No model response found", file=sys.stderr)
        return False

def main():
    """
    Executes the verification process and exits with a status code.
    """
    result = asyncio.run(verify())
    sys.exit(0 if result else 1)

if __name__ == "__main__":
    main()