Printer Keyboard Search

PlaywrightShopping

Search for and evaluate office equipment by comparing printer specifications and keyboard ergonomics, analyzing user reviews, tracking prices, and generating a detailed purchase recommendation report.

Created by Yaoqi Ye

2025-08-17

Search Aggregation, Comparative Analysis, Content Submission

Model Ranking

Click on the dots to view the trajectory of each task run

Model	Run Results	Pass@4	Pass^4	Avg Time	Avg Turns	Input Tokens	Output Tokens	Total Tokens
Model	Run Results	Pass@4	Pass^4	Avg Time	Avg Turns	Input Tokens	Output Tokens	Total Tokens
claude-opus-4-5-high	4 /4			172.7s	19.0	1,180,734	3,678	1,184,412
gpt-5-2-high	4 /4			547.0s	26.0	1,680,509	16,069	1,696,578
gpt-5-low	3 /4			468.5s	23.3	1,967,244	17,619	1,984,863
gpt-5-high	2 /4			1322.5s	34.0	3,364,435	40,260	3,404,695
gemini-2-5-pro	1 /4			113.9s	15.0	651,547	4,449	655,996
gpt-5-medium	1 /4			844.3s	33.3	3,124,476	28,520	3,152,996
gpt-5-mini-high	1 /4			828.7s	38.8	4,677,691	58,023	4,735,714
gpt-5-mini-medium	1 /4			398.1s	31.5	3,223,116	17,562	3,240,678
grok-4	1 /4			331.5s	23.5	2,142,601	7,237	2,149,839
grok-4-fast	1 /4			81.1s	14.8	918,872	3,729	922,600
claude-opus-4-1	0 /1	-	-	216.4s	10.0	474,583	1,824	476,407
claude-sonnet-4	0 /4			168.6s	13.8	742,072	2,527	744,598
claude-sonnet-4-5	0 /4			155.0s	17.8	1,206,196	3,000	1,209,196
claude-sonnet-4-high	0 /4			139.8s	13.5	723,785	2,769	726,554
claude-sonnet-4-low	0 /4			97.5s	10.0	461,677	2,085	463,763
deepseek-chat	0 /4			265.4s	17.0	942,536	1,380	943,916
deepseek-v3-1-terminus	0 /4			116.5s	8.3	392,233	939	393,172
deepseek-v3-1-terminus-thinking	0 /4			716.1s	11.3	532,892	13,533	546,426
deepseek-v3-2-chat	0 /4			331.9s	24.3	1,560,654	2,920	1,563,574
deepseek-v3-2-thinking	0 /4			327.2s	27.8	1,929,032	4,962	1,933,993
gemini-2-5-flash	0 /4			81.7s	13.5	1,128,751	4,459	1,133,209
gemini-3-pro-high	0 /4			397.4s	29.3	4,121,971	12,197	4,134,167
gemini-3-pro-low	0 /4			306.3s	24.5	2,802,659	11,074	2,813,732
glm-4-5	0 /4			165.7s	15.0	802,572	2,235	804,807
gpt-4-1	0 /4			56.7s	9.3	328,684	609	329,293
gpt-4-1-mini	0 /4			136.2s	25.5	1,884,175	2,667	1,886,842
gpt-4-1-nano	0 /4			32.9s	9.5	222,087	559	222,646
gpt-5-mini-low	0 /4			62.3s	9.5	307,720	1,787	309,507
gpt-5-nano-high	0 /4			415.7s	42.5	3,685,970	49,413	3,735,383
gpt-5-nano-low	0 /4			136.5s	17.5	812,000	13,019	825,018
gpt-5-nano-medium	0 /4			319.5s	28.0	1,875,859	50,109	1,925,968
gpt-oss-120b	0 /4			28.1s	5.5	104,375	892	105,266
grok-code-fast-1	0 /4			71.1s	13.0	647,763	5,563	653,326
kimi-k2-0711	0 /4			114.7s	9.8	333,064	1,099	334,162
kimi-k2-0905	0 /4			295.4s	14.8	532,088	1,265	533,353
o3	0 /4			141.9s	15.0	850,138	2,242	852,380
o4-mini	0 /4			485.1s	22.0	1,319,742	15,556	1,335,298
qwen-3-coder-plus	0 /4			1041.6s	28.8	4,426,528	2,090	4,428,618
qwen-3-max	0 /4			338.5s	21.3	1,790,997	633	1,791,630

Task State

WebArena

view WebArena environment setup for this task

Instruction

Search for a printer capable of reducing blue light that:
- Is pink or purple (must be stated in product details, not from image)
- Manufactured in Asia
Record the SKU ID and price.
Find a keyboard with:
- Bluetooth mode (must be stated in either the details or the title)
- Price between $50.00-$100.00
- Highest review rating among matching products
Record the SKU ID, price, number of reviews, and review rating.

Output Format:

Plaintext

<answer>
PrinterSKUID|id
PrinterPrice|$XX.XX
KeyboardSKUID|id
KeyboardPrice|$XX.XX
KeyboardReviews|XX
KeyboardRating|XX%
</answer>

Verify

Python

import asyncio
import sys
import re
import os
import json
from pathlib import Path


def get_model_response():
    """
    Get the model's response from the MCP_MESSAGES environment variable.

    MCP_MESSAGES must hold the path of a JSON file containing a list of
    message objects. The list is scanned from newest to oldest; the text of
    the first ``output_text`` content item of the most recent message with
    role "assistant", status "completed" and type "message" is returned.
    Entries that are not shaped like such a message are skipped rather than
    aborting the whole search.

    Returns:
        The message text (possibly an empty string), or None when the
        variable is unset, the file cannot be read or parsed, or no
        matching assistant message exists.
    """
    messages_path = os.getenv("MCP_MESSAGES")
    print(f"MCP_MESSAGES: {messages_path}")
    if not messages_path:
        print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
        return None

    try:
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's locale encoding.
        with open(messages_path, "r", encoding="utf-8") as f:
            messages = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error reading messages file: {str(e)}", file=sys.stderr)
        return None

    if not isinstance(messages, list):
        print(
            "Error reading messages file: expected a JSON list of messages",
            file=sys.stderr,
        )
        return None

    # Find the last assistant message
    for message in reversed(messages):
        if not isinstance(message, dict):
            # A stray non-object entry must not hide a valid earlier answer.
            continue
        if (
            message.get("role") != "assistant"
            or message.get("status") != "completed"
            or message.get("type") != "message"
        ):
            continue

        content = message.get("content")
        if not isinstance(content, list):
            continue
        for item in content:
            if isinstance(item, dict) and item.get("type") == "output_text":
                return item.get("text", "")

    print("Warning: No assistant response found in messages", file=sys.stderr)
    return None


def parse_answer_format(text):
    """
    Extract the key/value pairs from the first <answer>...</answer> block.

    The tag match is case-insensitive and may span several lines. The block
    must contain exactly six lines after stripping surrounding whitespace;
    each line containing "|" is split at the first "|" into a stripped key
    and a stripped value. Lines without "|" are ignored.

    Returns:
        A dict of the parsed pairs, or None when the text is empty, no
        answer block is present, or the block does not have six lines.
    """
    if not text:
        return None

    found = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
    if found is None:
        return None

    rows = found.group(1).strip().split("\n")
    row_count = len(rows)
    if row_count != 6:
        print(f"Error: Expected 6 lines in answer, got {row_count}", file=sys.stderr)
        return None

    # partition() yields an empty separator when the row has no "|".
    pieces = (row.partition("|") for row in rows)
    return {name.strip(): payload.strip() for name, bar, payload in pieces if bar}


def load_expected_answer(label_path):
    """
    Read the expected answer from the label file at ``label_path``.

    Every line containing "|" is split at the first "|" into a stripped key
    and a stripped value; other lines are ignored.

    Returns:
        A dict of the expected pairs, or None if reading or parsing raised
        any exception (the error is reported on stderr).
    """
    try:
        with open(label_path, "r") as handle:
            raw = handle.read()

        # partition() yields an empty separator when the row has no "|".
        pieces = (row.partition("|") for row in raw.strip().split("\n"))
        return {name.strip(): payload.strip() for name, bar, payload in pieces if bar}
    except Exception as e:
        print(f"Error reading label file: {str(e)}", file=sys.stderr)
        return None


def compare_answers(model_answer, expected_answer):
    """
    Check the model's parsed answer against the expected answer.

    Only keys present in ``expected_answer`` are checked; a key missing from
    ``model_answer`` is treated as an empty string. Price fields must start
    with "$" and are compared after removing "$" and ","; SKU fields are
    compared case-insensitively; every other field must match exactly.
    The outcome is reported on stderr.

    Returns:
        True if every expected field matches, False otherwise (including
        when either argument is empty or None).
    """
    if not (model_answer and expected_answer):
        return False

    price_fields = ("PrinterPrice", "KeyboardPrice")
    sku_fields = ("PrinterSKUID", "KeyboardSKUID")

    def bare_amount(amount):
        # Drop the currency sign and thousands separators before comparing.
        return amount.replace("$", "").replace(",", "")

    def differs(field, wanted, got):
        return f"{field}: expected '{wanted}', got '{got}'"

    problems = []
    for field, wanted in expected_answer.items():
        got = model_answer.get(field, "")

        if field in price_fields:
            # For price fields, only the $XX.XX format is supported.
            if not got.startswith("$"):
                problems.append(
                    f"{field}: incorrect format - expected '$XX.XX' format, got '{got}'"
                )
            elif bare_amount(wanted) != bare_amount(got):
                problems.append(differs(field, wanted, got))
        elif field in sku_fields:
            # SKUs are compared ignoring case.
            if got.upper() != wanted.upper():
                problems.append(differs(field, wanted, got))
        elif got != wanted:
            # Review count, rating (including the % sign) and any other
            # field must match exactly.
            problems.append(differs(field, wanted, got))

    if not problems:
        print("\n=== Answer Comparison ===", file=sys.stderr)
        print("✓ All key information matches the expected answer", file=sys.stderr)
        return True

    print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
    for problem in problems:
        print(f"✗ {problem}", file=sys.stderr)
    return False


async def verify() -> bool:
    """
    Check the model's final answer against the expected label.

    The expected answer is loaded from label.txt next to this file, the
    model's response is taken from the MCP_MESSAGES file, and the parsed
    answer is compared with the label. Progress is reported on stderr.

    Returns:
        True only when the label loads, a response is found and parsed,
        and the comparison succeeds; False otherwise.
    """
    # The label file lives next to this script.
    expected = load_expected_answer(Path(__file__).parent / "label.txt")
    if not expected:
        print("Error: Could not load expected answer from label.txt", file=sys.stderr)
        return False

    response = get_model_response()
    if not response:
        print("No model response found", file=sys.stderr)
        return False

    print("Found model response, parsing answer format...", file=sys.stderr)
    parsed = parse_answer_format(response)
    if not parsed:
        print(
            "Warning: Could not parse answer format from model response",
            file=sys.stderr,
        )
        return False

    print("\n=== Model Answer Parsed ===", file=sys.stderr)
    for field, entry in parsed.items():
        print(f"{field}: {entry}", file=sys.stderr)

    if not compare_answers(parsed, expected):
        print("\nModel answer does not match expected answer", file=sys.stderr)
        return False

    print("\n✓ Model answer matches expected answer", file=sys.stderr)
    return True


def main():
    """
    Run the verification and exit the process with its status.

    Exits with code 0 when verification succeeds and 1 otherwise.
    """
    passed = asyncio.run(verify())
    exit_code = 0 if passed else 1
    sys.exit(exit_code)


# Run the verification only when this file is executed as a script.
if __name__ == "__main__":
    main()