Running Shoes Purchase

L3 · Model Context Protocol · Playwright · Shopping

Research running footwear by analyzing biomechanical features, comparing cushioning technologies, evaluating durability ratings, considering user preferences, and recommending optimal shoe selections.

Created by Yaoqi Ye
2025-08-17
Search Aggregation · Comparative Analysis

Model Ranking

| Provider | Model | Run Results | Avg Time | Avg Turns | Input Tokens | Output Tokens | Total Tokens |
|---|---|---|---|---|---|---|---|
| Grok | grok-4 | 4/4 | 235.6s | 18.8 | 1,137,015 | 5,280 | 1,142,295 |
| OpenAI | o4-mini | 1/4 | 295.6s | 15.5 | 839,781 | 10,206 | 849,987 |
| Claude | claude-opus-4-1 | 0/1 | 258.1s | 13.0 | 707,798 | 2,069 | 709,867 |
| Claude | claude-sonnet-4 | 0/4 | 166.2s | 12.3 | 575,658 | 2,321 | 577,979 |
| Claude | claude-sonnet-4-high | 0/4 | 114.1s | 11.5 | 539,142 | 2,140 | 541,282 |
| Claude | claude-sonnet-4-low | 0/4 | 137.9s | 13.8 | 796,884 | 2,501 | 799,385 |
| DeepSeek | deepseek-chat | 0/4 | 196.4s | 12.8 | 601,329 | 1,412 | 602,741 |
| Gemini | gemini-2-5-flash | 0/4 | 431.0s | 49.3 | 16,342,113 | 13,748 | 16,355,861 |
| Gemini | gemini-2-5-pro | 0/4 | 101.8s | 14.5 | 597,204 | 2,878 | 600,082 |
| Z.ai | glm-4-5 | 0/4 | 118.4s | 10.5 | 386,830 | 2,704 | 389,534 |
| OpenAI | gpt-4-1 | 0/4 | 61.3s | 9.3 | 294,120 | 362 | 294,482 |
| OpenAI | gpt-4-1-mini | 0/4 | 89.0s | 17.0 | 906,100 | 1,831 | 907,931 |
| OpenAI | gpt-4-1-nano | 0/4 | 103.3s | 22.8 | 2,718,172 | 824 | 2,718,996 |
| OpenAI | gpt-5-high | 0/4 | 558.2s | 13.5 | 546,989 | 21,307 | 568,296 |
| OpenAI | gpt-5-low | 0/4 | 323.6s | 18.8 | 955,558 | 13,039 | 968,597 |
| OpenAI | gpt-5-medium | 0/4 | 313.9s | 15.0 | 636,994 | 14,203 | 651,197 |
| OpenAI | gpt-5-mini-high | 0/4 | 200.1s | 23.3 | 1,698,637 | 11,026 | 1,709,662 |
| OpenAI | gpt-5-mini-low | 0/4 | 56.4s | 9.3 | 292,224 | 1,536 | 293,760 |
| OpenAI | gpt-5-mini-medium | 0/4 | 228.0s | 24.0 | 1,686,755 | 11,653 | 1,698,408 |
| OpenAI | gpt-5-nano-high | 0/4 | 279.7s | 32.8 | 3,836,198 | 18,762 | 3,854,960 |
| OpenAI | gpt-5-nano-low | 0/4 | 181.3s | 17.8 | 912,926 | 24,585 | 937,510 |
| OpenAI | gpt-5-nano-medium | 0/4 | 248.1s | 18.5 | 906,794 | 43,491 | 950,285 |
| OpenAI | gpt-oss-120b | 0/4 | 31.7s | 5.8 | 112,817 | 1,162 | 113,979 |
| Grok | grok-code-fast-1 | 0/4 | 65.5s | 13.0 | 628,995 | 4,667 | 633,662 |
| MoonshotAI | kimi-k2-0711 | 0/4 | 148.7s | 12.8 | 511,640 | 1,515 | 513,155 |
| MoonshotAI | kimi-k2-0905 | 0/4 | 126.8s | 13.3 | 461,275 | 1,050 | 462,324 |
| OpenAI | o3 | 0/4 | 152.8s | 13.3 | 641,693 | 4,165 | 645,858 |
| Qwen | qwen-3-coder-plus | 0/4 | 122.9s | 12.8 | 491,912 | 1,361 | 493,273 |
| Qwen | qwen-3-max | 0/4 | 118.9s | 13.5 | 656,324 | 539 | 656,863 |

Task State

WebArena
View the WebArena environment setup for this task.

Instruction

  1. Find running shoes:

    • Price between $50.00 and $60.00
    • "running shoe" must appear in the product name
    • Choose the one with the highest number of reviews
    • Select black or white color, size 10
    • Add to cart with quantity 2
  2. Record from the product page: SKU ID, price, number of reviews, review rating

  3. Record the cart subtotal

Output Format:

Plaintext
<answer>
SKUID|id
Price|$XX.XX
NumberOfReviews|XX
ReviewRating|XX%
Subtotal|$XX.XX
</answer>
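
For illustration only, a well-formed submission would look like the following; the values are hypothetical placeholders, not the expected answer for this task. Note that the subtotal reflects the quantity of 2 (2 × $54.99 = $109.98):

Plaintext
<answer>
SKUID|ABC123
Price|$54.99
NumberOfReviews|12
ReviewRating|85%
Subtotal|$109.98
</answer>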


Verify

*.py
Python
import asyncio
import sys
import re
import os
import json
from pathlib import Path


def get_model_response():
    """
    Get the model's response from the MCP_MESSAGES environment variable.
    Returns the last assistant message text.
    """
    messages_path = os.getenv("MCP_MESSAGES")
    print(f"MCP_MESSAGES: {messages_path}")
    if not messages_path:
        print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
        return None

    try:
        with open(messages_path, "r") as f:
            messages = json.load(f)

        # Find the last assistant message
        for message in reversed(messages):
            if (
                message.get("role") == "assistant"
                and message.get("status") == "completed"
                and message.get("type") == "message"
            ):
                content = message.get("content", [])
                for item in content:
                    if item.get("type") == "output_text":
                        return item.get("text", "")

        print("Warning: No assistant response found in messages", file=sys.stderr)
        return None
    except Exception as e:
        print(f"Error reading messages file: {str(e)}", file=sys.stderr)
        return None


def parse_answer_format(text):
    """
    Parse the <answer>...</answer> format from the agent's output.
    Returns a dictionary with the parsed values.
    """
    if not text:
        return None

    # Look for <answer>...</answer> pattern
    match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
    if not match:
        return None

    answer_content = match.group(1).strip()
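    # For a well-formed reply, answer_content holds five "Key|Value" lines,
    # e.g. (hypothetical values): "SKUID|ABC123\nPrice|$54.99\n...\nSubtotal|$109.98"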

    # Parse each line
    result = {}
    lines = answer_content.split("\n")

    if len(lines) != 5:
        print(f"Error: Expected 5 lines in answer, got {len(lines)}", file=sys.stderr)
        return None

    for line in lines:
        if "|" in line:
            key, value = line.split("|", 1)
            result[key.strip()] = value.strip()

    return result


def load_expected_answer(label_path):
    """
    Load the expected answer from label.txt file.
    Returns a dictionary with the expected values.
    """
    try:
        with open(label_path, "r") as f:
            lines = f.read().strip().split("\n")

        expected = {}
        for line in lines:
            if "|" in line:
                key, value = line.split("|", 1)
                expected[key.strip()] = value.strip()

        return expected
    except Exception as e:
        print(f"Error reading label file: {str(e)}", file=sys.stderr)
        return None


def compare_answers(model_answer, expected_answer):
    """
    Compare the model's answer with the expected answer.
    Returns True if all key information matches, False otherwise.
    """
    if not model_answer or not expected_answer:
        return False

    # Check each expected key
    mismatches = []
    for key, expected_value in expected_answer.items():
        model_value = model_answer.get(key, "")

        # Special handling for different types of values
        if key in ["Price", "Subtotal"]:
            # For price fields, only support $XX.XX format
            # Check if model value has correct format
            if not model_value.startswith("$"):
                mismatches.append(
                    f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'"
                )
            else:
                # Normalize and compare values
                expected_clean = expected_value.replace("$", "").replace(",", "")
                model_clean = model_value.replace("$", "").replace(",", "")
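                # e.g. "$1,142.50" normalizes to "1142.50" on both sides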
                
                # Allow small tolerance for price calculations (within $0.01)
                try:
                    expected_float = float(expected_clean)
                    model_float = float(model_clean)
                    if abs(expected_float - model_float) > 0.01:
                        mismatches.append(
                            f"{key}: expected '{expected_value}', got '{model_value}'"
                        )
                except ValueError:
                    if expected_clean != model_clean:
                        mismatches.append(
                            f"{key}: expected '{expected_value}', got '{model_value}'"
                        )

        elif key == "SKUID":
            # SKU should match exactly (case-insensitive)
            if model_value.upper() != expected_value.upper():
                mismatches.append(
                    f"{key}: expected '{expected_value}', got '{model_value}'"
                )

        elif key == "NumberOfReviews":
            # Number of reviews should match exactly
            if model_value != expected_value:
                mismatches.append(
                    f"{key}: expected '{expected_value}', got '{model_value}'"
                )

        elif key == "ReviewRating":
            # Rating should match exactly (including % sign)
            if model_value != expected_value:
                mismatches.append(
                    f"{key}: expected '{expected_value}', got '{model_value}'"
                )

        else:
            # Exact match for other fields
            if model_value != expected_value:
                mismatches.append(
                    f"{key}: expected '{expected_value}', got '{model_value}'"
                )

    if mismatches:
        print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
        for mismatch in mismatches:
            print(f"✗ {mismatch}", file=sys.stderr)
        return False

    print("\n=== Answer Comparison ===", file=sys.stderr)
    print("✓ All key information matches the expected answer", file=sys.stderr)
    return True


async def verify() -> bool:
    """
    Verifies that the running shoes shopping task has been completed correctly.
    Checks the model's answer against the expected label.
    """
    # Get the label file path
    label_path = Path(__file__).parent / "label.txt"

    # Load expected answer
    expected_answer = load_expected_answer(label_path)
    if not expected_answer:
        print("Error: Could not load expected answer from label.txt", file=sys.stderr)
        return False

    # Get model's response from MCP_MESSAGES
    model_response = get_model_response()
    if model_response:
        print("Found model response, parsing answer format...", file=sys.stderr)
        model_answer = parse_answer_format(model_response)

        if model_answer:
            print("\n=== Model Answer Parsed ===", file=sys.stderr)
            for key, value in model_answer.items():
                print(f"{key}: {value}", file=sys.stderr)

            # Compare answers
            answer_match = compare_answers(model_answer, expected_answer)
            if not answer_match:
                print("\nModel answer does not match expected answer", file=sys.stderr)
                return False
            print("\n✓ Model answer matches expected answer", file=sys.stderr)
            return True
        else:
            print(
                "Warning: Could not parse answer format from model response",
                file=sys.stderr,
            )
            return False
    else:
        print("No model response found", file=sys.stderr)
        return False


def main():
    """
    Executes the verification process and exits with a status code.
    """
    result = asyncio.run(verify())
    sys.exit(0 if result else 1)


if __name__ == "__main__":
    main()
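
To exercise the verifier locally, one can fabricate a label.txt and an MCP messages transcript, then point MCP_MESSAGES at the transcript. The sketch below is a minimal harness, assuming the script above is saved as verify.py; the file name and all sample values are assumptions for illustration, not part of the task.

Python
import json
import os
import shutil
import subprocess
import tempfile
from pathlib import Path

workdir = Path(tempfile.mkdtemp())

# Expected answer, in the same KEY|VALUE format the verifier parses.
(workdir / "label.txt").write_text(
    "SKUID|ABC123\n"
    "Price|$54.99\n"
    "NumberOfReviews|12\n"
    "ReviewRating|85%\n"
    "Subtotal|$109.98\n"
)

# A fake transcript whose last assistant message carries a matching answer.
answer = (
    "<answer>\n"
    "SKUID|ABC123\n"
    "Price|$54.99\n"
    "NumberOfReviews|12\n"
    "ReviewRating|85%\n"
    "Subtotal|$109.98\n"
    "</answer>"
)
messages = [
    {
        "role": "assistant",
        "status": "completed",
        "type": "message",
        "content": [{"type": "output_text", "text": answer}],
    }
]
(workdir / "messages.json").write_text(json.dumps(messages))

# The verifier resolves label.txt relative to its own location,
# so copy it next to the fixtures before running.
shutil.copy("verify.py", workdir / "verify.py")  # assumed file name
env = dict(os.environ, MCP_MESSAGES=str(workdir / "messages.json"))
result = subprocess.run(["python", str(workdir / "verify.py")], env=env)
print("PASS" if result.returncode == 0 else "FAIL")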