Gaming Accessories Analysis
Research gaming peripherals by analyzing technical specifications, comparing performance metrics, evaluating user reviews, tracking price trends, and creating detailed gaming accessory recommendations.
Model Ranking
| Model | Run Results | Pass@4 | Pass^4 | Avg Time | Avg Turns | Input Tokens | Output Tokens | Total Tokens |
|---|---|---|---|---|---|---|---|---|
| claude-opus-4-5-high | 4/4 | | | 268.4s | 29.0 | 2,302,607 | 4,574 | 2,307,181 |
| gpt-5-2-high | 4/4 | | | 567.8s | 36.5 | 2,790,294 | 14,039 | 2,804,334 |
| gpt-5-high | 4/4 | | | 1589.7s | 36.0 | 3,870,137 | 45,206 | 3,915,343 |
| gpt-5-low | 4/4 | | | 804.1s | 42.3 | 5,084,346 | 23,323 | 5,107,669 |
| gpt-5-medium | 4/4 | | | 737.6s | 35.8 | 3,446,330 | 27,901 | 3,474,231 |
| gemini-3-pro-low | 3/4 | | | 256.6s | 33.8 | 2,947,475 | 7,524 | 2,954,999 |
| grok-4-fast | 3/4 | | | 123.5s | 25.3 | 1,791,187 | 4,404 | 1,795,591 |
| claude-sonnet-4 | 2/4 | | | 462.3s | 36.8 | 3,419,556 | 5,556 | 3,425,112 |
| gemini-3-pro-high | 2/4 | | | 272.2s | 29.0 | 2,374,477 | 7,109 | 2,381,586 |
| grok-code-fast-1 | 2/4 | | | 142.3s | 27.8 | 2,085,498 | 7,556 | 2,093,054 |
| o4-mini | 2/4 | | | 790.3s | 17.8 | 867,404 | 32,258 | 899,662 |
| claude-opus-4-1 | 1/1 | - | - | 870.0s | 37.0 | 3,654,616 | 4,436 | 3,659,052 |
| qwen-3-coder-plus | 1/4 | | | 166.3s | 13.5 | 936,264 | 1,649 | 937,913 |
| claude-sonnet-4-5 | 0/4 | | | 282.9s | 29.3 | 2,493,041 | 4,289 | 2,497,330 |
| claude-sonnet-4-high | 0/4 | | | 342.6s | 31.0 | 2,554,794 | 4,911 | 2,559,705 |
| claude-sonnet-4-low | 0/4 | | | 327.8s | 30.5 | 2,687,842 | 4,963 | 2,692,805 |
| deepseek-chat | 0/4 | | | 313.1s | 21.8 | 1,703,545 | 1,766 | 1,705,310 |
| deepseek-v3-1-terminus | 0/4 | | | 458.7s | 19.8 | 1,446,186 | 2,835 | 1,449,022 |
| deepseek-v3-1-terminus-thinking | 0/4 | | | 749.7s | 11.5 | 703,486 | 14,966 | 718,452 |
| deepseek-v3-2-chat | 0/4 | | | 458.4s | 35.3 | 2,804,713 | 5,360 | 2,810,073 |
| deepseek-v3-2-thinking | 0/4 | | | 696.4s | 56.3 | 4,863,303 | 10,002 | 4,873,305 |
| gemini-2-5-flash | 0/4 | | | 173.3s | 25.0 | 4,120,751 | 8,100 | 4,128,851 |
| gemini-2-5-pro | 0/4 | | | 381.9s | 41.0 | 4,908,851 | 13,766 | 4,922,617 |
| glm-4-5 | 0/4 | | | 238.3s | 21.8 | 1,325,016 | 4,035 | 1,329,050 |
| gpt-4-1 | 0/4 | | | 97.4s | 15.8 | 1,115,227 | 1,211 | 1,116,437 |
| gpt-4-1-mini | 0/4 | | | 375.4s | 66.3 | 14,031,876 | 6,327 | 14,038,202 |
| gpt-4-1-nano | 0/4 | | | 34.8s | 10.0 | 368,905 | 471 | 369,375 |
| gpt-5-mini-high | 0/4 | | | 671.0s | 39.0 | 5,267,341 | 46,913 | 5,314,254 |
| gpt-5-mini-low | 0/4 | | | 60.1s | 8.3 | 220,864 | 2,409 | 223,273 |
| gpt-5-mini-medium | 0/4 | | | 522.6s | 48.0 | 6,822,880 | 19,524 | 6,842,404 |
| gpt-5-nano-high | 0/4 | | | 643.8s | 28.8 | 2,656,611 | 113,482 | 2,770,093 |
| gpt-5-nano-low | 0/4 | | | 147.7s | 8.8 | 351,674 | 27,140 | 378,814 |
| gpt-5-nano-medium | 0/4 | | | 178.8s | 16.0 | 1,084,708 | 22,173 | 1,106,880 |
| gpt-oss-120b | 0/4 | | | 52.3s | 8.8 | 252,105 | 2,206 | 254,310 |
| grok-4 | 0/4 | | | 354.1s | 30.3 | 2,446,102 | 7,607 | 2,453,710 |
| kimi-k2-0711 | 0/4 | | | 388.6s | 29.5 | 2,436,225 | 2,095 | 2,438,320 |
| kimi-k2-0905 | 0/4 | | | 410.3s | 37.0 | 2,621,140 | 2,929 | 2,624,069 |
| o3 | 0/4 | | | 92.8s | 10.3 | 391,536 | 3,233 | 394,769 |
| qwen-3-max | 0/4 | | | 736.8s | 33.5 | 4,162,835 | 932 | 4,163,767 |
Task State
Instruction
Task Requirements:
- In Video Games category, count products with customer rating 70% or higher in the first 2 pages
- Sort products by price (ascending) and identify the cheapest product that has customer reviews
- Find product with SKU 'B07D6LSCXZ' (N64 Controller), add to cart with quantity 3
- Add products with SKU 'B071DR5V1K' and 'B082LZ4451' to comparison list, then count total products on comparison page
- In cart, update N64 Controller quantity to 5 and record the subtotal for this item
- Proceed to checkout and fill shipping form:
  - Email: test.buyer@example.com
  - First Name: Alice
  - Last Name: Johnson
  - Street Address: 456 Oak Avenue
  - Country: United States
  - State/Province: California
  - City: San Francisco
  - Zip Code: 94102
  - Phone: 415-555-0123

  Then count available shipping methods
- Output your findings in this format:

<answer>
Products70Plus|count
CheapestReviewedSKU|sku
CheapestReviewedPrice|price
ComparisonCount|count
N64Subtotal|amount
CheckoutEmail|test.buyer@example.com
ShippingState|California
ShippingMethods|count
</answer>
Verify
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
    """
    Get the model's response from the MCP_MESSAGES environment variable.

    MCP_MESSAGES must hold the path of a JSON file containing a list of
    message objects. The messages are scanned from newest to oldest and the
    text of the first ``output_text`` content item found in a message with
    ``role == "assistant"``, ``status == "completed"`` and
    ``type == "message"`` is returned.

    Returns:
        The response text, or None when the variable is unset, the file
        cannot be read or parsed, or no matching message exists.
    """
    messages_path = os.getenv("MCP_MESSAGES")
    print(f"MCP_MESSAGES: {messages_path}")
    if not messages_path:
        print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
        return None

    try:
        # JSON is UTF-8 by specification; do not depend on the platform's
        # locale encoding when decoding the transcript.
        with open(messages_path, "r", encoding="utf-8") as f:
            messages = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error reading messages file: {str(e)}", file=sys.stderr)
        return None

    if isinstance(messages, list):
        # Find the last assistant message
        for message in reversed(messages):
            # Skip malformed entries instead of discarding the whole file.
            if not isinstance(message, dict):
                continue
            if (
                message.get("role") == "assistant"
                and message.get("status") == "completed"
                and message.get("type") == "message"
            ):
                content = message.get("content", [])
                if not isinstance(content, list):
                    continue
                for item in content:
                    if isinstance(item, dict) and item.get("type") == "output_text":
                        return item.get("text", "")

    print("Warning: No assistant response found in messages", file=sys.stderr)
    return None
def parse_answer_format(text):
    """
    Parse the <answer>...</answer> format from the agent's output.

    The first ``<answer>`` block (tag match is case-insensitive, content may
    span lines) is extracted and must contain exactly 8 newline-separated
    lines. Every line containing ``|`` is split on its first ``|`` into a
    whitespace-stripped key and value.

    Returns:
        A dictionary of the parsed key/value pairs, or None when the text is
        empty, no answer block is present, or the block does not have
        exactly 8 lines.
    """
    if not text:
        return None

    # Look for the <answer>...</answer> wrapper. The tags are required: without
    # them there is no reliable way to separate the answer from the
    # surrounding free-form text in the response.
    match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
    if not match:
        return None

    answer_content = match.group(1).strip()

    # The task defines exactly eight "Key|value" lines.
    lines = answer_content.split("\n")
    if len(lines) != 8:
        print(f"Error: Expected 8 lines in answer, got {len(lines)}", file=sys.stderr)
        return None

    # Parse each line; only the first "|" separates key from value so that
    # values may themselves contain the delimiter.
    result = {}
    for line in lines:
        if "|" in line:
            key, value = line.split("|", 1)
            result[key.strip()] = value.strip()
    return result
def load_expected_answer(label_path):
    """
    Load the expected answer from label.txt file.

    Each line of the file containing ``|`` is split on its first ``|`` into
    a whitespace-stripped key and value; lines without ``|`` are ignored.

    Args:
        label_path: Path of the label file to read.

    Returns:
        A dictionary with the expected values, or None when the file cannot
        be read (the error is reported on stderr).
    """
    try:
        # Read with an explicit encoding so the result does not depend on
        # the platform's locale settings.
        with open(label_path, "r", encoding="utf-8") as f:
            lines = f.read().strip().split("\n")
    except (OSError, TypeError, ValueError) as e:
        # OSError: missing/unreadable file; ValueError: undecodable bytes;
        # TypeError: label_path is not a path-like value.
        print(f"Error reading label file: {str(e)}", file=sys.stderr)
        return None

    expected = {}
    for line in lines:
        if "|" in line:
            key, value = line.split("|", 1)
            expected[key.strip()] = value.strip()
    return expected
def compare_answers(model_answer, expected_answer):
    """
    Check the model's parsed answer against the expected answer.

    Every key of ``expected_answer`` is looked up in ``model_answer`` (a
    missing key is treated as an empty string) and compared by these rules:

    * ``CheapestReviewedPrice`` / ``N64Subtotal``: the model value must start
      with ``$``; values are then compared with ``$`` and ``,`` removed.
    * ``CheckoutEmail``: case-insensitive equality.
    * ``Products70Plus``: both values must parse as integers and differ by
      at most 2.
    * Anything else: exact string equality.

    All mismatches are reported on stderr.

    Returns:
        True when every expected key matches, False otherwise (including
        when either argument is empty or None).
    """
    if not (model_answer and expected_answer):
        return False

    price_fields = ("CheapestReviewedPrice", "N64Subtotal")

    def without_currency_marks(amount):
        # Drop the dollar sign and thousands separators before comparing.
        return amount.replace("$", "").replace(",", "")

    problems = []
    for field, wanted in expected_answer.items():
        got = model_answer.get(field, "")
        plain_mismatch = f"{field}: expected '{wanted}', got '{got}'"

        if field in price_fields:
            # Prices are only accepted in $XX.XX form.
            if not got.startswith("$"):
                problems.append(
                    f"{field}: incorrect format - expected '$XX.XX' format, got '{got}'"
                )
            elif without_currency_marks(wanted) != without_currency_marks(got):
                problems.append(plain_mismatch)
            continue

        if field == "CheckoutEmail":
            # E-mail comparison ignores letter case.
            if got.lower() != wanted.lower():
                problems.append(plain_mismatch)
            continue

        if field == "Products70Plus":
            # The catalogue may shift slightly, so tolerate a difference of
            # up to 2 products.
            try:
                drift = abs(int(got) - int(wanted))
            except ValueError:
                problems.append(plain_mismatch)
            else:
                if drift > 2:
                    problems.append(
                        f"{field}: expected around '{wanted}', got '{got}'"
                    )
            continue

        # Every remaining field must match exactly.
        if got != wanted:
            problems.append(plain_mismatch)

    if not problems:
        print("\n=== Answer Comparison ===", file=sys.stderr)
        print("✓ All key information matches the expected answer", file=sys.stderr)
        return True

    print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
    for problem in problems:
        print(f"✗ {problem}", file=sys.stderr)
    return False
async def verify() -> bool:
    """
    Verify the gaming accessories analysis task against the stored label.

    Loads the expected answer from ``label.txt`` next to this file, fetches
    the model's response, parses it, and compares the two. Progress and
    failures are reported on stderr.

    Returns:
        True only when the parsed model answer matches the expected answer.
    """
    # The label file lives alongside this script.
    label_file = Path(__file__).parent / "label.txt"

    reference = load_expected_answer(label_file)
    if not reference:
        print("Error: Could not load expected answer from label.txt", file=sys.stderr)
        return False

    # Pull the model's final message via MCP_MESSAGES.
    reply = get_model_response()
    if not reply:
        print("No model response found", file=sys.stderr)
        return False

    print("Found model response, parsing answer format...", file=sys.stderr)
    parsed = parse_answer_format(reply)
    if not parsed:
        print(
            "Warning: Could not parse answer format from model response",
            file=sys.stderr,
        )
        return False

    print("\n=== Model Answer Parsed ===", file=sys.stderr)
    for name, entry in parsed.items():
        print(f"{name}: {entry}", file=sys.stderr)

    # Final verdict comes from the field-by-field comparison.
    if compare_answers(parsed, reference):
        print("\n✓ Model answer matches expected answer", file=sys.stderr)
        return True

    print("\nModel answer does not match expected answer", file=sys.stderr)
    return False
def main():
    """
    Run the verification and exit with status 0 on success, 1 on failure.
    """
    passed = asyncio.run(verify())
    exit_code = 0 if passed else 1
    sys.exit(exit_code)


# Only run the verification when executed as a script.
if __name__ == "__main__":
    main()