Gaming Accessories Analysis

L3 · Model Context Protocol · Playwright · Shopping

Research gaming peripherals by analyzing technical specifications, comparing performance metrics, evaluating user reviews, tracking price trends, and creating detailed gaming accessory recommendations.

Created by Yaoqi Ye, 2025-08-17

Search Aggregation · Comparative Analysis · Data Extraction

Model Ranking

| Model | Runs Passed | Avg Time | Avg Turns | Input Tokens | Output Tokens | Total Tokens |
|---|---|---|---|---|---|---|
| gpt-5-high (OpenAI) | 4/4 | 1589.7s | 36.0 | 3,870,137 | 45,206 | 3,915,343 |
| gpt-5-low (OpenAI) | 4/4 | 804.1s | 42.3 | 5,084,346 | 23,323 | 5,107,669 |
| gpt-5-medium (OpenAI) | 4/4 | 737.6s | 35.8 | 3,446,330 | 27,901 | 3,474,231 |
| claude-sonnet-4 (Claude) | 2/4 | 462.3s | 36.8 | 3,419,556 | 5,556 | 3,425,112 |
| grok-code-fast-1 (Grok) | 2/4 | 142.3s | 27.8 | 2,085,498 | 7,556 | 2,093,054 |
| o4-mini (OpenAI) | 2/4 | 790.3s | 17.8 | 867,404 | 32,258 | 899,662 |
| claude-opus-4-1 (Claude) | 1/1 | 870.0s | 37.0 | 3,654,616 | 4,436 | 3,659,052 |
| qwen-3-coder-plus (Qwen) | 1/4 | 166.3s | 13.5 | 936,264 | 1,649 | 937,913 |
| claude-sonnet-4-high (Claude) | 0/4 | 342.6s | 31.0 | 2,554,794 | 4,911 | 2,559,705 |
| claude-sonnet-4-low (Claude) | 0/4 | 327.8s | 30.5 | 2,687,842 | 4,963 | 2,692,805 |
| deepseek-chat (DeepSeek) | 0/4 | 313.1s | 21.8 | 1,703,545 | 1,766 | 1,705,310 |
| gemini-2-5-flash (Gemini) | 0/4 | 173.3s | 25.0 | 4,120,751 | 8,100 | 4,128,851 |
| gemini-2-5-pro (Gemini) | 0/4 | 381.9s | 41.0 | 4,908,851 | 13,766 | 4,922,617 |
| glm-4-5 (Z.ai) | 0/4 | 238.3s | 21.8 | 1,325,016 | 4,035 | 1,329,050 |
| gpt-4-1 (OpenAI) | 0/4 | 97.4s | 15.8 | 1,115,227 | 1,211 | 1,116,437 |
| gpt-4-1-mini (OpenAI) | 0/4 | 375.4s | 66.3 | 14,031,876 | 6,327 | 14,038,202 |
| gpt-4-1-nano (OpenAI) | 0/4 | 34.8s | 10.0 | 368,905 | 471 | 369,375 |
| gpt-5-mini-high (OpenAI) | 0/4 | 671.0s | 39.0 | 5,267,341 | 46,913 | 5,314,254 |
| gpt-5-mini-low (OpenAI) | 0/4 | 60.1s | 8.3 | 220,864 | 2,409 | 223,273 |
| gpt-5-mini-medium (OpenAI) | 0/4 | 522.6s | 48.0 | 6,822,880 | 19,524 | 6,842,404 |
| gpt-5-nano-high (OpenAI) | 0/4 | 643.8s | 28.8 | 2,656,611 | 113,482 | 2,770,093 |
| gpt-5-nano-low (OpenAI) | 0/4 | 147.7s | 8.8 | 351,674 | 27,140 | 378,814 |
| gpt-5-nano-medium (OpenAI) | 0/4 | 178.8s | 16.0 | 1,084,708 | 22,173 | 1,106,880 |
| gpt-oss-120b (OpenAI) | 0/4 | 52.3s | 8.8 | 252,105 | 2,206 | 254,310 |
| grok-4 (Grok) | 0/4 | 354.1s | 30.3 | 2,446,102 | 7,607 | 2,453,710 |
| kimi-k2-0711 (MoonshotAI) | 0/4 | 388.6s | 29.5 | 2,436,225 | 2,095 | 2,438,320 |
| kimi-k2-0905 (MoonshotAI) | 0/4 | 410.3s | 37.0 | 2,621,140 | 2,929 | 2,624,069 |
| o3 (OpenAI) | 0/4 | 92.8s | 10.3 | 391,536 | 3,233 | 394,769 |
| qwen-3-max (Qwen) | 0/4 | 736.8s | 33.5 | 4,162,835 | 932 | 4,163,767 |
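
The leaderboard's Pass@4 and Pass^4 metrics can be read directly off the Runs Passed column. A minimal sketch, assuming the usual agent-benchmark definitions (Pass@4: at least one of the four runs passed; Pass^4: all four passed):

```python
# Sketch under assumed definitions; not part of the benchmark harness.
def pass_at_4(runs: list[bool]) -> bool:
    """Pass@4: the task counts as solved if any run passed."""
    return any(runs)

def pass_hat_4(runs: list[bool]) -> bool:
    """Pass^4: the task counts as solved only if every run passed."""
    return all(runs)

runs = [True, True, False, False]   # e.g. claude-sonnet-4's 2/4 record
print(pass_at_4(runs), pass_hat_4(runs))  # True False
```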

Task State

WebArena: see the WebArena environment setup for this task.

Instruction

Task Requirements:

  1. In the Video Games category, count the products with a customer rating of 70% or higher across the first 2 pages

  2. Sort products by price (ascending) and identify the cheapest product that has customer reviews

  3. Find the product with SKU 'B07D6LSCXZ' (N64 Controller) and add it to the cart with quantity 3

  4. Add the products with SKUs 'B071DR5V1K' and 'B082LZ4451' to the comparison list, then count the total number of products on the comparison page

  5. In the cart, update the N64 Controller quantity to 5 and record the subtotal for this item

  6. Proceed to checkout and fill in the shipping form:

    • Email: test.buyer@example.com
    • First Name: Alice
    • Last Name: Johnson
    • Street Address: 456 Oak Avenue
    • Country: United States
    • State/Province: California
    • City: San Francisco
    • Zip Code: 94102
    • Phone: 415-555-0123

    Then count the available shipping methods.

  7. Output your findings in this format:

```plaintext
<answer>
Products70Plus|count
CheapestReviewedSKU|sku
CheapestReviewedPrice|price
ComparisonCount|count
N64Subtotal|amount
CheckoutEmail|test.buyer@example.com
ShippingState|California
ShippingMethods|count
</answer>
```
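
For orientation, steps 2 and 3 map to a handful of browser actions. The benchmark agent performs them through MCP Playwright tools rather than a script, but an equivalent standalone sketch with Playwright's Python sync API looks roughly like this (the base URL, category path, and every selector are assumptions for a Magento-style WebArena shop, not values from the task definition):

```python
# Illustrative sketch only; URLs and selectors are assumed, not verified.
from playwright.sync_api import sync_playwright

BASE_URL = "http://localhost:7770"  # hypothetical WebArena shopping address

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()

    # Step 2: open the Video Games category and sort by price, ascending.
    page.goto(f"{BASE_URL}/video-games.html")      # assumed category path
    page.select_option("select#sorter", "price")   # assumed sort control

    # Step 3: find SKU B07D6LSCXZ via catalog search, set quantity, add to cart.
    page.goto(f"{BASE_URL}/catalogsearch/result/?q=B07D6LSCXZ")
    page.click(".product-item-link")               # open the first hit
    page.fill("input#qty", "3")
    page.click("button#product-addtocart-button")

    browser.close()
```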


Verify

```python
import asyncio
import sys
import re
import os
import json
from pathlib import Path


def get_model_response():
    """
    Get the model's response from the MCP_MESSAGES environment variable.
    Returns the last assistant message text.
    """
    messages_path = os.getenv("MCP_MESSAGES")
    print(f"MCP_MESSAGES: {messages_path}")
    if not messages_path:
        print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
        return None

    try:
        with open(messages_path, "r") as f:
            messages = json.load(f)

        # Find the last assistant message
        for message in reversed(messages):
            if (
                message.get("role") == "assistant"
                and message.get("status") == "completed"
                and message.get("type") == "message"
            ):
                content = message.get("content", [])
                for item in content:
                    if item.get("type") == "output_text":
                        return item.get("text", "")

        print("Warning: No assistant response found in messages", file=sys.stderr)
        return None
    except Exception as e:
        print(f"Error reading messages file: {str(e)}", file=sys.stderr)
        return None
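
# For reference, the MCP_MESSAGES file read above is a JSON list of message
# dicts. Its shape, inferred from the parsing logic (abridged illustration):
#   [
#     {"role": "assistant", "status": "completed", "type": "message",
#      "content": [{"type": "output_text", "text": "<answer>...</answer>"}]}
#   ]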


def parse_answer_format(text):
    """
    Parse the <answer>...</answer> format from the agent's output.
    Returns a dictionary with the parsed values.
    """
    if not text:
        return None

    # Look for <answer>...</answer> pattern
    match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
    if not match:
        return None

    answer_content = match.group(1).strip()

    # Parse each line
    result = {}
    lines = answer_content.split("\n")

    if len(lines) != 8:
        print(f"Error: Expected 8 lines in answer, got {len(lines)}", file=sys.stderr)
        return None

    for line in lines:
        if "|" in line:
            key, value = line.split("|", 1)
            result[key.strip()] = value.strip()

    return result
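
# Example (hypothetical values): for a response containing
#   <answer>
#   Products70Plus|12
#   ...          (six more Key|Value lines)
#   ShippingMethods|2
#   </answer>
# this returns {"Products70Plus": "12", ..., "ShippingMethods": "2"}.
# Anything other than exactly 8 lines between the tags is rejected.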


def load_expected_answer(label_path):
    """
    Load the expected answer from label.txt file.
    Returns a dictionary with the expected values.
    """
    try:
        with open(label_path, "r") as f:
            lines = f.read().strip().split("\n")

        expected = {}
        for line in lines:
            if "|" in line:
                key, value = line.split("|", 1)
                expected[key.strip()] = value.strip()

        return expected
    except Exception as e:
        print(f"Error reading label file: {str(e)}", file=sys.stderr)
        return None
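
# label.txt sits next to this script and uses the same "Key|Value" lines as
# the answer format. Placeholder example (angle brackets mark values that
# depend on the live catalog; these are not the real labels):
#   Products70Plus|<count>
#   CheapestReviewedSKU|<sku>
#   CheapestReviewedPrice|$<price>
#   ComparisonCount|<count>
#   N64Subtotal|$<amount>
#   CheckoutEmail|test.buyer@example.com
#   ShippingState|California
#   ShippingMethods|<count>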


def compare_answers(model_answer, expected_answer):
    """
    Compare the model's answer with the expected answer.
    Returns True if all key information matches, False otherwise.
    """
    if not model_answer or not expected_answer:
        return False

    # Check each expected key
    mismatches = []
    for key, expected_value in expected_answer.items():
        model_value = model_answer.get(key, "")

        # Special handling for different types of values
        if key in ["CheapestReviewedPrice", "N64Subtotal"]:
            # For price fields, only support $XX.XX format
            # Check if model value has correct format
            if not model_value.startswith("$"):
                mismatches.append(
                    f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'"
                )
            else:
                # Normalize and compare values
                expected_clean = expected_value.replace("$", "").replace(",", "")
                model_clean = model_value.replace("$", "").replace(",", "")
                if expected_clean != model_clean:
                    mismatches.append(
                        f"{key}: expected '{expected_value}', got '{model_value}'"
                    )

        elif key == "CheckoutEmail":
            # Email should match exactly (case-insensitive)
            if model_value.lower() != expected_value.lower():
                mismatches.append(
                    f"{key}: expected '{expected_value}', got '{model_value}'"
                )

        elif key == "Products70Plus":
            # For count fields, allow some flexibility (products might change)
            # But still check if it's a reasonable number
            try:
                model_count = int(model_value)
                expected_count = int(expected_value)
                # Allow up to 2 products difference (in case of dynamic content)
                if abs(model_count - expected_count) > 2:
                    mismatches.append(
                        f"{key}: expected around '{expected_value}', got '{model_value}'"
                    )
            except ValueError:
                mismatches.append(
                    f"{key}: expected '{expected_value}', got '{model_value}'"
                )

        else:
            # Exact match for other fields
            if model_value != expected_value:
                mismatches.append(
                    f"{key}: expected '{expected_value}', got '{model_value}'"
                )

    if mismatches:
        print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
        for mismatch in mismatches:
            print(f"✗ {mismatch}", file=sys.stderr)
        return False

    print("\n=== Answer Comparison ===", file=sys.stderr)
    print("✓ All key information matches the expected answer", file=sys.stderr)
    return True
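
# Design note: matching is deliberately uneven across fields. Price fields
# must arrive as $XX.XX and are compared after stripping "$" and ",";
# CheckoutEmail is case-insensitive; Products70Plus tolerates a +/-2
# difference to absorb catalog drift between runs; all other fields must
# match exactly.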


async def verify() -> bool:
    """
    Verifies that the gaming accessories analysis task has been completed correctly.
    Checks the model's answer against the expected label.
    """
    # Get the label file path
    label_path = Path(__file__).parent / "label.txt"

    # Load expected answer
    expected_answer = load_expected_answer(label_path)
    if not expected_answer:
        print("Error: Could not load expected answer from label.txt", file=sys.stderr)
        return False

    # Get model's response from MCP_MESSAGES
    model_response = get_model_response()
    if model_response:
        print("Found model response, parsing answer format...", file=sys.stderr)
        model_answer = parse_answer_format(model_response)

        if model_answer:
            print("\n=== Model Answer Parsed ===", file=sys.stderr)
            for key, value in model_answer.items():
                print(f"{key}: {value}", file=sys.stderr)

            # Compare answers
            answer_match = compare_answers(model_answer, expected_answer)
            if not answer_match:
                print("\nModel answer does not match expected answer", file=sys.stderr)
                return False
            print("\n✓ Model answer matches expected answer", file=sys.stderr)
            return True
        else:
            print(
                "Warning: Could not parse answer format from model response",
                file=sys.stderr,
            )
            return False
    else:
        print("No model response found", file=sys.stderr)
        return False


def main():
    """
    Executes the verification process and exits with a status code.
    """
    result = asyncio.run(verify())
    sys.exit(0 if result else 1)


if __name__ == "__main__":
    main()
```
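
To run the verifier standalone, point MCP_MESSAGES at a run's message log and keep label.txt next to the script, e.g. `MCP_MESSAGES=messages.json python verify.py`; the process exits with status 0 when every field matches and 1 otherwise.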