Advanced Product Analysis

L3
Model Context Protocol · Playwright · Shopping

Perform comprehensive product analysis, including feature comparisons, price tracking, review aggregation, and customer sentiment analysis, and generate detailed recommendation reports for informed purchasing decisions.

Created by Yaoqi Ye
2025-08-17
Data Extraction · Comparative Analysis · Content Submission

Model Ranking

| Model | Run Results | Pass@4 | Pass^4 | Avg Time | Avg Turns | Input Tokens | Output Tokens | Total Tokens |
|---|---|---|---|---|---|---|---|---|
| claude-sonnet-4 (Claude) | 4/4 | ✓ | ✓ | 301.0s | 18.5 | 1,254,381 | 3,116 | 1,257,496 |
| gpt-5-high (OpenAI) | 4/4 | ✓ | ✓ | 513.9s | 16.3 | 920,905 | 14,682 | 935,587 |
| gpt-5-low (OpenAI) | 4/4 | ✓ | ✓ | 298.2s | 18.3 | 1,016,693 | 8,540 | 1,025,232 |
| gpt-5-medium (OpenAI) | 4/4 | ✓ | ✓ | 249.8s | 16.5 | 926,443 | 9,176 | 935,619 |
| grok-4 (Grok) | 4/4 | ✓ | ✓ | 187.7s | 16.0 | 989,376 | 3,736 | 993,112 |
| grok-code-fast-1 (Grok) | 4/4 | ✓ | ✓ | 101.1s | 18.5 | 1,185,490 | 3,791 | 1,189,281 |
| kimi-k2-0905 (MoonshotAI) | 4/4 | ✓ | ✓ | 291.3s | 18.3 | 1,030,095 | 1,766 | 1,031,861 |
| qwen-3-coder-plus (Qwen) | 4/4 | ✓ | ✓ | 171.2s | 16.8 | 1,148,679 | 1,432 | 1,150,111 |
| gemini-2-5-pro (Gemini) | 2/4 | ✓ | ✗ | 175.6s | 20.0 | 1,485,602 | 5,585 | 1,491,187 |
| gpt-5-mini-high (OpenAI) | 2/4 | ✓ | ✗ | 190.9s | 26.3 | 2,038,425 | 7,870 | 2,046,295 |
| gpt-5-mini-medium (OpenAI) | 2/4 | ✓ | ✗ | 129.4s | 21.8 | 1,383,005 | 3,164 | 1,386,169 |
| gpt-5-nano-high (OpenAI) | 2/4 | ✓ | ✗ | 413.4s | 35.5 | 3,086,858 | 55,318 | 3,142,175 |
| kimi-k2-0711 (MoonshotAI) | 2/4 | ✓ | ✗ | 248.8s | 17.5 | 931,105 | 1,239 | 932,344 |
| o3 (OpenAI) | 2/4 | ✓ | ✗ | 119.1s | 14.5 | 642,870 | 2,164 | 645,035 |
| claude-opus-4-1 (Claude) | 1/1 | -- | -- | 452.7s | 20.0 | 1,438,842 | 3,138 | 1,441,980 |
| claude-sonnet-4-high (Claude) | 0/4 | ✗ | ✗ | 177.2s | 16.5 | 1,192,021 | 3,107 | 1,195,128 |
| claude-sonnet-4-low (Claude) | 0/4 | ✗ | ✗ | 175.3s | 17.0 | 1,260,664 | 3,093 | 1,263,757 |
| deepseek-chat (DeepSeek) | 0/4 | ✗ | ✗ | 381.9s | 16.3 | 853,633 | 941 | 854,574 |
| gemini-2-5-flash (Gemini) | 0/4 | ✗ | ✗ | 159.6s | 25.0 | 4,586,119 | 4,405 | 4,590,524 |
| glm-4-5 (Z.ai) | 0/4 | ✗ | ✗ | 146.8s | 14.5 | 707,325 | 2,099 | 709,423 |
| gpt-4-1 (OpenAI) | 0/4 | ✗ | ✗ | 78.4s | 13.3 | 624,332 | 570 | 624,902 |
| gpt-4-1-mini (OpenAI) | 0/4 | ✗ | ✗ | 174.5s | 39.5 | 3,930,134 | 3,097 | 3,933,231 |
| gpt-4-1-nano (OpenAI) | 0/4 | ✗ | ✗ | 46.0s | 9.5 | 426,125 | 1,419 | 427,544 |
| gpt-5-mini-low (OpenAI) | 0/4 | ✗ | ✗ | 87.5s | 16.8 | 918,785 | 994 | 919,779 |
| gpt-5-nano-low (OpenAI) | 0/4 | ✗ | ✗ | 194.2s | 20.0 | 1,169,479 | 25,060 | 1,194,539 |
| gpt-5-nano-medium (OpenAI) | 0/4 | ✗ | ✗ | 192.4s | 23.8 | 1,624,596 | 23,258 | 1,647,854 |
| gpt-oss-120b (OpenAI) | 0/4 | ✗ | ✗ | 37.9s | 6.8 | 195,362 | 1,276 | 196,638 |
| o4-mini (OpenAI) | 0/4 | ✗ | ✗ | 1297.9s | 27.5 | 2,449,085 | 30,986 | 2,480,071 |
| qwen-3-max (Qwen) | 0/4 | ✗ | ✗ | 613.1s | 33.0 | 3,978,912 | 1,041 | 3,979,954 |
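
Here, Pass@4 is marked ✓ when at least one of a model's four runs passed, and Pass^4 only when all four passed (claude-opus-4-1 shows "--" because only a single run was recorded). These are the n = k = 4 special cases of the usual estimators; below is a minimal sketch, following the pass@k estimator of Chen et al. (2021), with function names of my own:

Python
from math import comb


def pass_at_k(n: int, c: int, k: int) -> float:
    """Probability that at least one of k sampled runs passes,
    estimated from n runs of which c passed."""
    if n - c < k:
        return 1.0  # every size-k draw must contain a passing run
    return 1.0 - comb(n - c, k) / comb(n, k)


def pass_hat_k(n: int, c: int, k: int) -> float:
    """Probability that k sampled runs all pass: C(c, k) / C(n, k)."""
    if c < k:
        return 0.0
    return comb(c, k) / comb(n, k)


# With n = k = 4 these collapse to the checkmarks above,
# e.g. gemini-2-5-pro passed 2 of 4 runs:
print(pass_at_k(4, 2, 4))   # 1.0 -> Pass@4 is a check
print(pass_hat_k(4, 2, 4))  # 0.0 -> Pass^4 is a cross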

Task State

WebArena (see the WebArena environment setup for this task)

Instruction

Task Requirements:

  1. Search for products with 'Ginger' in the Product Name field and price range $50.00 to $100.00

  2. Add Q Mixers Premium Ginger Ale product to the comparison list

  3. Find Intel NUC Kit product in Electronics category and add it to the comparison list

  4. From the comparison page:

    • Record SKU numbers for both products
    • Add all products to cart
  5. Record the total cart value

  6. On the Ginger Ale product detail page, record:

    • Number of customer reviews
    • Name of the most recent reviewer (at the top of the first page)
  7. Output your findings in this format:

Plaintext
<answer>
GingerAleSKU|sku
IntelNUCSKU|sku
CartTotal|amount
ReviewCount|count
LatestReviewer|name
</answer>

Example Output:

Plaintext
<answer>
GingerAleSKU|XXXXXXXXX
IntelNUCSKU|XXXXXXXXX
CartTotal|$XXX.XX
ReviewCount|XX
LatestReviewer|name
</answer>
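
For orientation, the search in step 1 could be driven through Playwright roughly as sketched below. This assumes a stock WebArena shopping (Magento) storefront; the base URL, form path, field ids, and selectors are assumptions about that default layout, not part of the task definition:

Python
from playwright.sync_api import sync_playwright

BASE_URL = "http://localhost:7770"  # assumed WebArena shopping endpoint

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    # Magento's advanced search form (path assumed from the stock storefront)
    page.goto(f"{BASE_URL}/catalogsearch/advanced/")
    page.fill("#name", "Ginger")      # Product Name field
    page.fill("#price", "50.00")      # price from
    page.fill("#price_to", "100.00")  # price to
    page.click("button.action.search")
    # Each result links to a product page whose "Add to Compare"
    # action feeds the comparison list used in steps 2-4.
    print(page.locator("li.product-item").count(), "matching products")
    browser.close()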


Verify

*.py
Python
import asyncio
import sys
import re
import os
import json
from pathlib import Path


def get_model_response():
    """
    Get the model's response from the MCP_MESSAGES environment variable.
    Returns the last assistant message text.
    """
    messages_path = os.getenv("MCP_MESSAGES")
    print(f"MCP_MESSAGES: {messages_path}")
    if not messages_path:
        print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
        return None

    try:
        with open(messages_path, "r") as f:
            messages = json.load(f)

        # Find the last assistant message
        for message in reversed(messages):
            if (
                message.get("role") == "assistant"
                and message.get("status") == "completed"
            ):
                content = message.get("content", [])
                for item in content:
                    if item.get("type") == "output_text":
                        return item.get("text", "")

        print("Warning: No assistant response found in messages", file=sys.stderr)
        return None
    except Exception as e:
        print(f"Error reading messages file: {str(e)}", file=sys.stderr)
        return None


def parse_answer_format(text):
    """
    Parse the <answer>xxx</answer> format from the agent's output.
    Returns a dictionary with the parsed values.
    """
    if not text:
        return None

    # Look for <answer>...</answer> pattern
    match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
    if not match:
        return None

    answer_content = match.group(1).strip()

    # Parse each line
    result = {}
    lines = answer_content.split("\n")

    if len(lines) != 5:
        print(f"Error: Expected 5 lines in answer, got {len(lines)}", file=sys.stderr)
        return None

    for line in lines:
        if "|" in line:
            key, value = line.split("|", 1)
            result[key.strip()] = value.strip()

    return result


def load_expected_answer(label_path):
    """
    Load the expected answer from label.txt file.
    Returns a dictionary with the expected values.
    """
    try:
        with open(label_path, "r") as f:
            lines = f.read().strip().split("\n")

        expected = {}
        for line in lines:
            if "|" in line:
                key, value = line.split("|", 1)
                expected[key.strip()] = value.strip()

        return expected
    except Exception as e:
        print(f"Error reading label file: {str(e)}", file=sys.stderr)
        return None


def compare_answers(model_answer, expected_answer):
    """
    Compare the model's answer with the expected answer.
    Returns True if all key information matches, False otherwise.
    """
    if not model_answer or not expected_answer:
        return False

    # Check each expected key
    mismatches = []
    for key, expected_value in expected_answer.items():
        model_value = model_answer.get(key, "")

        # Special handling for different types of values
        if key in ("GingerAleSKU", "IntelNUCSKU"):
            # Both SKU fields require an exact match
            if model_value != expected_value:
                mismatches.append(
                    f"{key}: expected '{expected_value}', got '{model_value}'"
                )

        elif key == "CartTotal":
            # For price fields, only support $XX.XX format
            # Check if model value has correct format
            if not model_value.startswith("$"):
                mismatches.append(
                    f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'"
                )
            else:
                # Normalize and compare values
                expected_clean = expected_value.replace("$", "").replace(",", "")
                model_clean = model_value.replace("$", "").replace(",", "")
                if expected_clean != model_clean:
                    mismatches.append(
                        f"{key}: expected '{expected_value}', got '{model_value}'"
                    )

        elif key == "ReviewCount":
            # Check review count matches
            if model_value != expected_value:
                mismatches.append(
                    f"{key}: expected '{expected_value}', got '{model_value}'"
                )

        elif key == "LatestReviewer":
            # Check reviewer name (allow partial match for names)
            if expected_value.lower() not in model_value.lower() and model_value.lower() not in expected_value.lower():
                mismatches.append(
                    f"{key}: expected '{expected_value}', got '{model_value}'"
                )

        else:
            # Exact match for other fields
            if model_value != expected_value:
                mismatches.append(
                    f"{key}: expected '{expected_value}', got '{model_value}'"
                )

    if mismatches:
        print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
        for mismatch in mismatches:
            print(f"✗ {mismatch}", file=sys.stderr)
        return False

    print("\n=== Answer Comparison ===", file=sys.stderr)
    print("✓ All key information matches the expected answer", file=sys.stderr)
    return True


async def verify() -> bool:
    """
    Verifies that the advanced product analysis task has been completed correctly.
    First checks the model's answer against the expected label.
    """
    # Get the label file path
    label_path = Path(__file__).parent / "label.txt"

    # Load expected answer
    expected_answer = load_expected_answer(label_path)
    if not expected_answer:
        print("Error: Could not load expected answer from label.txt", file=sys.stderr)
        return False

    # Get model's response from MCP_MESSAGES
    model_response = get_model_response()
    if model_response:
        print("Found model response, parsing answer format...", file=sys.stderr)
        model_answer = parse_answer_format(model_response)

        if model_answer:
            print("\n=== Model Answer Parsed ===", file=sys.stderr)
            for key, value in model_answer.items():
                print(f"{key}: {value}", file=sys.stderr)

            # Compare answers
            answer_match = compare_answers(model_answer, expected_answer)
            if not answer_match:
                print("\nModel answer does not match expected answer", file=sys.stderr)
                return False
            print("\n✓ Model answer matches expected answer", file=sys.stderr)
            return True
        else:
            print(
                "Warning: Could not parse answer format from model response",
                file=sys.stderr,
            )
            return False
    else:
        print("No model response found", file=sys.stderr)
        return False


def main():
    """
    Executes the verification process and exits with a status code.
    """
    result = asyncio.run(verify())
    sys.exit(0 if result else 1)


if __name__ == "__main__":
    main()
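
A usage note: the harness is expected to expose the recorded conversation through the MCP_MESSAGES environment variable before running the verifier, which exits 0 on a pass and 1 otherwise. A minimal standalone invocation might look like this (verify.py and messages.json are placeholder names, not fixed by the harness):

Python
import os
import subprocess

# Point the verifier at a recorded message dump; both file names
# below are illustrative placeholders.
env = {**os.environ, "MCP_MESSAGES": "messages.json"}
result = subprocess.run(["python", "verify.py"], env=env)
print("task passed" if result.returncode == 0 else "task failed")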