Running Shoes Purchase

L3 · Model Context Protocol · Playwright · Shopping

Research running footwear by analyzing biomechanical features, comparing cushioning technologies, evaluating durability ratings, considering user preferences, and recommending optimal shoe selections.

Created by Yaoqi Ye
2025-08-17
Search Aggregation · Comparative Analysis

Model Ranking

| Provider | Model | Run Results | Avg Time | Avg Turns | Input Tokens | Output Tokens | Total Tokens |
|---|---|---|---|---|---|---|---|
| Grok | grok-4 | 4/4 | 235.6s | 18.8 | 1,137,015 | 5,280 | 1,142,295 |
| OpenAI | o4-mini | 1/4 | 295.6s | 15.5 | 839,781 | 10,206 | 849,987 |
| Claude | claude-opus-4-1 | 0/1 | 258.1s | 13.0 | 707,798 | 2,069 | 709,867 |
| Claude | claude-sonnet-4 | 0/4 | 166.2s | 12.3 | 575,658 | 2,321 | 577,979 |
| Claude | claude-sonnet-4-high | 0/4 | 114.1s | 11.5 | 539,142 | 2,140 | 541,282 |
| Claude | claude-sonnet-4-low | 0/4 | 137.9s | 13.8 | 796,884 | 2,501 | 799,385 |
| DeepSeek | deepseek-chat | 0/4 | 196.4s | 12.8 | 601,329 | 1,412 | 602,741 |
| Gemini | gemini-2-5-flash | 0/4 | 431.0s | 49.3 | 16,342,113 | 13,748 | 16,355,861 |
| Gemini | gemini-2-5-pro | 0/4 | 101.8s | 14.5 | 597,204 | 2,878 | 600,082 |
| Z.ai | glm-4-5 | 0/4 | 118.4s | 10.5 | 386,830 | 2,704 | 389,534 |
| OpenAI | gpt-4-1 | 0/4 | 61.3s | 9.3 | 294,120 | 362 | 294,482 |
| OpenAI | gpt-4-1-mini | 0/4 | 89.0s | 17.0 | 906,100 | 1,831 | 907,931 |
| OpenAI | gpt-4-1-nano | 0/4 | 103.3s | 22.8 | 2,718,172 | 824 | 2,718,996 |
| OpenAI | gpt-5-high | 0/4 | 558.2s | 13.5 | 546,989 | 21,307 | 568,296 |
| OpenAI | gpt-5-low | 0/4 | 323.6s | 18.8 | 955,558 | 13,039 | 968,597 |
| OpenAI | gpt-5-medium | 0/4 | 313.9s | 15.0 | 636,994 | 14,203 | 651,197 |
| OpenAI | gpt-5-mini-high | 0/4 | 200.1s | 23.3 | 1,698,637 | 11,026 | 1,709,662 |
| OpenAI | gpt-5-mini-low | 0/4 | 56.4s | 9.3 | 292,224 | 1,536 | 293,760 |
| OpenAI | gpt-5-mini-medium | 0/4 | 228.0s | 24.0 | 1,686,755 | 11,653 | 1,698,408 |
| OpenAI | gpt-5-nano-high | 0/4 | 279.7s | 32.8 | 3,836,198 | 18,762 | 3,854,960 |
| OpenAI | gpt-5-nano-low | 0/4 | 181.3s | 17.8 | 912,926 | 24,585 | 937,510 |
| OpenAI | gpt-5-nano-medium | 0/4 | 248.1s | 18.5 | 906,794 | 43,491 | 950,285 |
| OpenAI | gpt-oss-120b | 0/4 | 31.7s | 5.8 | 112,817 | 1,162 | 113,979 |
| Grok | grok-code-fast-1 | 0/4 | 65.5s | 13.0 | 628,995 | 4,667 | 633,662 |
| MoonshotAI | kimi-k2-0711 | 0/4 | 148.7s | 12.8 | 511,640 | 1,515 | 513,155 |
| MoonshotAI | kimi-k2-0905 | 0/4 | 126.8s | 13.3 | 461,275 | 1,050 | 462,324 |
| OpenAI | o3 | 0/4 | 152.8s | 13.3 | 641,693 | 4,165 | 645,858 |
| Qwen | qwen-3-coder-plus | 0/4 | 122.9s | 12.8 | 491,912 | 1,361 | 493,273 |
| Qwen | qwen-3-max | 0/4 | 118.9s | 13.5 | 656,324 | 539 | 656,863 |

Task State

WebArena
View the WebArena environment setup for this task.

Instruction

  1. Find running shoes:

    • Price between $50.00 and $60.00
    • "running shoe" must appear in the product name
    • Choose the one with the highest number of reviews
    • Select black or white color, size 10
    • Add to cart with quantity 2
  2. Record from the product page: SKU ID, price, number of reviews, review rating

  3. Record the cart subtotal

Output Format:

Plaintext
<answer>
SKUID|id
Price|$XX.XX
NumberOfReviews|XX
ReviewRating|XX%
Subtotal|$XX.XX
</answer>
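
For illustration only, a well-formed submission would look like the following; the values are hypothetical placeholders, not the expected answer for this task. Note that the subtotal reflects the quantity of 2 (2 × $54.99 = $109.98):

Plaintext
<answer>
SKUID|ABC123
Price|$54.99
NumberOfReviews|12
ReviewRating|85%
Subtotal|$109.98
</answer>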


Verify

*.py
Python
import asyncio
import sys
import re
import os
import json
from pathlib import Path


def get_model_response():
    """
    Get the model's response from the MCP_MESSAGES environment variable.
    Returns the last assistant message text.
    """
    messages_path = os.getenv("MCP_MESSAGES")
    print(f"MCP_MESSAGES: {messages_path}")
    if not messages_path:
        print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
        return None

    try:
        with open(messages_path, "r") as f:
            messages = json.load(f)

        # Find the last assistant message
        for message in reversed(messages):
            if (
                message.get("role") == "assistant"
                and message.get("status") == "completed"
                and message.get("type") == "message"
            ):
                content = message.get("content", [])
                for item in content:
                    if item.get("type") == "output_text":
                        return item.get("text", "")

        print("Warning: No assistant response found in messages", file=sys.stderr)
        return None
    except Exception as e:
        print(f"Error reading messages file: {str(e)}", file=sys.stderr)
        return None


def parse_answer_format(text):
    """
    Parse the <answer>...</answer> format from the agent's output.
    Returns a dictionary with the parsed values.
    """
    if not text:
        return None

    # Look for <answer>...</answer> pattern
    match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
    if not match:
        return None

    answer_content = match.group(1).strip()
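    # For a well-formed reply, answer_content holds five "Key|Value" lines,
    # e.g. (hypothetical values): "SKUID|ABC123\nPrice|$54.99\n...\nSubtotal|$109.98"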

    # Parse each line
    result = {}
    lines = answer_content.split("\n")

    if len(lines) != 5:
        print(f"Error: Expected 5 lines in answer, got {len(lines)}", file=sys.stderr)
        return None

    for line in lines:
        if "|" in line:
            key, value = line.split("|", 1)
            result[key.strip()] = value.strip()

    return result


def load_expected_answer(label_path):
    """
    Load the expected answer from label.txt file.
    Returns a dictionary with the expected values.
    """
    try:
        with open(label_path, "r") as f:
            lines = f.read().strip().split("\n")

        expected = {}
        for line in lines:
            if "|" in line:
                key, value = line.split("|", 1)
                expected[key.strip()] = value.strip()

        return expected
    except Exception as e:
        print(f"Error reading label file: {str(e)}", file=sys.stderr)
        return None


def compare_answers(model_answer, expected_answer):
    """
    Compare the model's answer with the expected answer.
    Returns True if all key information matches, False otherwise.
    """
    if not model_answer or not expected_answer:
        return False

    # Check each expected key
    mismatches = []
    for key, expected_value in expected_answer.items():
        model_value = model_answer.get(key, "")

        # Special handling for different types of values
        if key in ["Price", "Subtotal"]:
            # For price fields, only support $XX.XX format
            # Check if model value has correct format
            if not model_value.startswith("$"):
                mismatches.append(
                    f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'"
                )
            else:
                # Normalize and compare values
                expected_clean = expected_value.replace("$", "").replace(",", "")
                model_clean = model_value.replace("$", "").replace(",", "")
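                # e.g. "$1,142.50" normalizes to "1142.50" on both sides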
                
                # Allow small tolerance for price calculations (within $0.01)
                try:
                    expected_float = float(expected_clean)
                    model_float = float(model_clean)
                    if abs(expected_float - model_float) > 0.01:
                        mismatches.append(
                            f"{key}: expected '{expected_value}', got '{model_value}'"
                        )
                except ValueError:
                    if expected_clean != model_clean:
                        mismatches.append(
                            f"{key}: expected '{expected_value}', got '{model_value}'"
                        )

        elif key == "SKUID":
            # SKU should match exactly (case-insensitive)
            if model_value.upper() != expected_value.upper():
                mismatches.append(
                    f"{key}: expected '{expected_value}', got '{model_value}'"
                )

        elif key == "NumberOfReviews":
            # Number of reviews should match exactly
            if model_value != expected_value:
                mismatches.append(
                    f"{key}: expected '{expected_value}', got '{model_value}'"
                )

        elif key == "ReviewRating":
            # Rating should match exactly (including % sign)
            if model_value != expected_value:
                mismatches.append(
                    f"{key}: expected '{expected_value}', got '{model_value}'"
                )

        else:
            # Exact match for other fields
            if model_value != expected_value:
                mismatches.append(
                    f"{key}: expected '{expected_value}', got '{model_value}'"
                )

    if mismatches:
        print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
        for mismatch in mismatches:
            print(f"✗ {mismatch}", file=sys.stderr)
        return False

    print("\n=== Answer Comparison ===", file=sys.stderr)
    print("✓ All key information matches the expected answer", file=sys.stderr)
    return True


async def verify() -> bool:
    """
    Verifies that the running shoes shopping task has been completed correctly.
    Checks the model's answer against the expected label.
    """
    # Get the label file path
    label_path = Path(__file__).parent / "label.txt"

    # Load expected answer
    expected_answer = load_expected_answer(label_path)
    if not expected_answer:
        print("Error: Could not load expected answer from label.txt", file=sys.stderr)
        return False

    # Get model's response from MCP_MESSAGES
    model_response = get_model_response()
    if model_response:
        print("Found model response, parsing answer format...", file=sys.stderr)
        model_answer = parse_answer_format(model_response)

        if model_answer:
            print("\n=== Model Answer Parsed ===", file=sys.stderr)
            for key, value in model_answer.items():
                print(f"{key}: {value}", file=sys.stderr)

            # Compare answers
            answer_match = compare_answers(model_answer, expected_answer)
            if not answer_match:
                print("\nModel answer does not match expected answer", file=sys.stderr)
                return False
            print("\n✓ Model answer matches expected answer", file=sys.stderr)
            return True
        else:
            print(
                "Warning: Could not parse answer format from model response",
                file=sys.stderr,
            )
            return False
    else:
        print("No model response found", file=sys.stderr)
        return False


def main():
    """
    Executes the verification process and exits with a status code.
    """
    result = asyncio.run(verify())
    sys.exit(0 if result else 1)


if __name__ == "__main__":
    main()
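
To exercise the verifier locally, one can fabricate a label.txt and an MCP messages transcript, then point MCP_MESSAGES at the transcript. The sketch below is a minimal harness, assuming the script above is saved as verify.py; the file name and all sample values are assumptions for illustration, not part of the task.

Python
import json
import os
import shutil
import subprocess
import tempfile
from pathlib import Path

workdir = Path(tempfile.mkdtemp())

# Expected answer, in the same KEY|VALUE format the verifier parses.
(workdir / "label.txt").write_text(
    "SKUID|ABC123\n"
    "Price|$54.99\n"
    "NumberOfReviews|12\n"
    "ReviewRating|85%\n"
    "Subtotal|$109.98\n"
)

# A fake transcript whose last assistant message carries a matching answer.
answer = (
    "<answer>\n"
    "SKUID|ABC123\n"
    "Price|$54.99\n"
    "NumberOfReviews|12\n"
    "ReviewRating|85%\n"
    "Subtotal|$109.98\n"
    "</answer>"
)
messages = [
    {
        "role": "assistant",
        "status": "completed",
        "type": "message",
        "content": [{"type": "output_text", "text": answer}],
    }
]
(workdir / "messages.json").write_text(json.dumps(messages))

# The verifier resolves label.txt relative to its own location,
# so copy it next to the fixtures before running.
shutil.copy("verify.py", workdir / "verify.py")  # assumed file name
env = dict(os.environ, MCP_MESSAGES=str(workdir / "messages.json"))
result = subprocess.run(["python", str(workdir / "verify.py")], env=env)
print("PASS" if result.returncode == 0 else "FAIL")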