Movie Reviewer Analysis
Analyze movie review patterns by creating reviewer profile, collecting ratings data, tracking review trends, and generating analytical report on community movie preferences and discussions.
Model Ranking
| Model | Run Results | Pass@4 | Pass^4 | Avg Time | Avg Turns | Input Tokens | Output Tokens | Total Tokens |
|---|---|---|---|---|---|---|---|---|
claude-opus-4-5-high | 4 /4 | 170.6s | 16.3 | 960,643 | 4,748 | 965,391 | ||
claude-sonnet-4-5 | 1 /4 | 169.2s | 16.8 | 883,236 | 4,387 | 887,623 | ||
deepseek-v3-2-chat | 1 /4 | 381.4s | 24.0 | 1,235,665 | 7,020 | 1,242,685 | ||
kimi-k2-0905 | 1 /4 | 428.2s | 19.5 | 1,411,247 | 4,182 | 1,415,429 | ||
qwen-3-coder-plus | 1 /4 | 572.2s | 23.3 | 2,591,348 | 4,473 | 2,595,821 | ||
claude-opus-4-1 | 0 /1 | - | - | 357.3s | 18.0 | 930,347 | 3,140 | 933,487 |
claude-sonnet-4 | 0 /4 | 195.1s | 16.8 | 722,173 | 3,995 | 726,168 | ||
claude-sonnet-4-high | 0 /4 | 167.0s | 14.8 | 679,852 | 3,846 | 683,697 | ||
claude-sonnet-4-low | 0 /4 | 184.6s | 16.0 | 862,811 | 4,753 | 867,564 | ||
deepseek-chat | 0 /4 | 259.1s | 14.8 | 594,427 | 2,327 | 596,754 | ||
deepseek-v3-1-terminus | 0 /4 | 500.9s | 15.3 | 801,250 | 3,262 | 804,512 | ||
deepseek-v3-1-terminus-thinking | 0 /4 | 1245.0s | 12.8 | 498,713 | 26,397 | 525,110 | ||
deepseek-v3-2-thinking | 0 /4 | 486.9s | 28.5 | 1,607,168 | 9,134 | 1,616,302 | ||
gemini-2-5-flash | 0 /4 | 40.5s | 7.0 | 122,122 | 1,382 | 123,504 | ||
gemini-2-5-pro | 0 /4 | 266.6s | 23.5 | 1,907,176 | 13,799 | 1,920,975 | ||
gemini-3-pro-high | 0 /4 | 192.9s | 16.8 | 909,111 | 9,783 | 918,894 | ||
gemini-3-pro-low | 0 /4 | 206.8s | 18.8 | 1,030,399 | 9,799 | 1,040,198 | ||
glm-4-5 | 0 /4 | 171.9s | 13.0 | 485,049 | 4,129 | 489,177 | ||
gpt-4-1 | 0 /4 | 77.2s | 13.3 | 693,610 | 1,119 | 694,729 | ||
gpt-4-1-mini | 0 /4 | 266.7s | 43.8 | 9,159,946 | 4,906 | 9,164,852 | ||
gpt-4-1-nano | 0 /4 | 91.3s | 15.3 | 232,806 | 822 | 233,628 | ||
gpt-5-2-high | 0 /4 | 1031.5s | 27.3 | 2,552,865 | 33,680 | 2,586,545 | ||
gpt-5-high | 0 /4 | 2171.3s | 24.3 | 2,441,693 | 58,813 | 2,500,506 | ||
gpt-5-low | 0 /4 | 673.2s | 24.0 | 2,000,604 | 24,406 | 2,025,010 | ||
gpt-5-medium | 0 /4 | 800.7s | 26.5 | 3,057,684 | 30,609 | 3,088,293 | ||
gpt-5-mini-high | 0 /4 | 601.7s | 24.8 | 2,676,432 | 49,459 | 2,725,891 | ||
gpt-5-mini-low | 0 /4 | 130.7s | 12.5 | 706,799 | 7,934 | 714,733 | ||
gpt-5-mini-medium | 0 /4 | 328.8s | 26.0 | 2,925,241 | 15,216 | 2,940,456 | ||
gpt-5-nano-high | 0 /4 | 542.1s | 28.0 | 1,160,633 | 55,814 | 1,216,447 | ||
gpt-5-nano-low | 0 /4 | 242.7s | 13.5 | 149,509 | 13,699 | 163,208 | ||
gpt-5-nano-medium | 0 /4 | 372.7s | 21.8 | 545,277 | 31,092 | 576,370 | ||
gpt-oss-120b | 0 /4 | 54.3s | 9.8 | 253,138 | 2,085 | 255,223 | ||
grok-4 | 0 /4 | 270.8s | 14.8 | 720,254 | 8,848 | 729,101 | ||
grok-4-fast | 0 /4 | 79.3s | 15.5 | 714,974 | 6,462 | 721,436 | ||
grok-code-fast-1 | 0 /4 | 98.2s | 15.5 | 751,695 | 9,522 | 761,217 | ||
kimi-k2-0711 | 0 /4 | 192.0s | 17.3 | 691,503 | 2,294 | 693,798 | ||
o3 | 0 /4 | 219.8s | 16.8 | 744,129 | 7,808 | 751,937 | ||
o4-mini | 0 /4 | 1338.3s | 17.8 | 821,401 | 41,368 | 862,768 | ||
qwen-3-max | 0 /4 | 392.7s | 18.8 | 1,574,675 | 4,140 | 1,578,815 |
Task State
Instruction
I need you to analyze wonderful movie discussions on Postmill and create a summary post. This analysis will help me understand the community's engagement with movie topics.
Task Requirements:
-
Register a new account with username 'movie_reviewer_2024' and password 'movie_reviewer_2024'
-
After successful registration, navigate to the movies forum and analyze the posts on the first page:
- Count all posts that have 4-digit years in their titles
-
Identify the top 3 posts with the highest upvote counts from ALL posts on the first page and record their complete details
-
Find these specific posts and record their data:
- The 'Rittenhouse Square' poster post (record exact upvotes and comments)
-
Navigate through the first 5 pages of movies forum and count the total number of image/poster submissions across all 5 pages
-
Create a new submission in the movies forum with:
- Title: "Wonderful Movies Analysis: Community Favorites [2024]"
- Body text must be EXACTLY these lines without anything (keep the keys as-is, only replace the values after the pipe, follow the markdown format):
- Total_Year_Posts|FILL_VALUE
- Top1_Title|FILL_VALUE
- Top1_Upvotes|FILL_VALUE
- Top1_Comments|FILL_VALUE
- Top2_Title|FILL_VALUE
- Top2_Upvotes|FILL_VALUE
- Top2_Comments|FILL_VALUE
- Top3_Title|FILL_VALUE
- Top3_Upvotes|FILL_VALUE
- Top3_Comments|FILL_VALUE
- Rittenhouse_Upvotes|FILL_VALUE
- Rittenhouse_Comments|FILL_VALUE
- Total_Image_Posts_5Pages|FILL_VALUE

Verify
import asyncio
import sys
import re
import os
from pathlib import Path
from playwright.async_api import (
async_playwright,
TimeoutError as PlaywrightTimeoutError,
)
# Read the base URL from the environment variable, falling back to the
# default local address when it is unset.
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
print(f"Using base URL: {BASE_URL}")
def parse_key_value_format(text):
    """
    Extract the Key|Value fields from a submission body using regex.

    Matching is tolerant of surrounding whitespace and line breaks; title
    values are additionally collapsed to single-spaced strings.

    Args:
        text: Raw submission body text.

    Returns:
        dict: Mapping of field name to extracted string value. Fields that
        do not match are simply absent from the result.
    """
    # A numeric field is the key, a pipe, then digits.
    numeric = r"\s*\|\s*(\d+)"
    field_patterns = [("Total_Year_Posts", "Total_Year_Posts" + numeric)]
    # Each Top-N entry contributes a free-text title (bounded by the
    # following Upvotes key or end of input) plus two numeric fields.
    for rank in ("Top1", "Top2", "Top3"):
        field_patterns.append(
            (f"{rank}_Title", rf"{rank}_Title\s*\|\s*(.+?)(?=\n{rank}_Upvotes|$)")
        )
        field_patterns.append((f"{rank}_Upvotes", f"{rank}_Upvotes" + numeric))
        field_patterns.append((f"{rank}_Comments", f"{rank}_Comments" + numeric))
    for key in (
        "Rittenhouse_Upvotes",
        "Rittenhouse_Comments",
        "Total_Image_Posts_5Pages",
    ):
        field_patterns.append((key, key + numeric))

    parsed = {}
    for key, pattern in field_patterns:
        hit = re.search(pattern, text, re.DOTALL | re.MULTILINE)
        if hit is None:
            continue
        value = hit.group(1).strip()
        if "Title" in key:
            # Titles may wrap across lines; normalize internal whitespace.
            value = " ".join(value.split())
        parsed[key] = value
    return parsed
def normalize_text(text):
    """
    Normalize text for comparison by handling quote styles, the HTML
    ampersand entity, and whitespace.

    Args:
        text: Value to normalize; non-strings are converted with str().

    Returns:
        str: The normalized, stripped string.
    """
    if not isinstance(text, str):
        return str(text)
    # Map typographic (curly) quotes to their ASCII equivalents so scraped
    # page text compares equal to plain-ASCII expected values.
    # NOTE(review): the original replacement literals were garbled by an
    # encoding round-trip into no-ops; this restores the apparent intent.
    text = text.replace("\u2018", "'").replace("\u2019", "'")  # ' ' -> '
    text = text.replace("\u201c", '"').replace("\u201d", '"')  # " " -> "
    # Decode the HTML entity for ampersand (forum bodies may be escaped).
    text = text.replace("&amp;", "&")
    # Collapse all whitespace runs (including newlines/tabs) to single spaces.
    text = " ".join(text.split())
    return text.strip()
async def verify() -> bool:
    """
    Verifies that the wonderful movies analysis task has been completed correctly.

    Checks, in order:
      1. The account 'movie_reviewer_2024' is logged in (or can log in).
      2. A submission titled 'Wonderful Movies Analysis: Community
         Favorites [2024]' exists in /f/movies.
      3. The submission body contains every required Key|Value field and,
         when a sibling ``label.txt`` file is present, each value matches
         the expected answer.

    Returns:
        bool: True when every check passes, False otherwise. Progress and
        failure details are written to stderr.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()
        try:
            # Navigate to the main page.
            print("Navigating to forum...", file=sys.stderr)
            await page.goto(f"{BASE_URL}/", wait_until="networkidle")
            # Check if logged in as movie_reviewer_2024 — Postmill renders the
            # username on a header button when a session is active.
            user_button = page.locator('button:has-text("movie_reviewer_2024")')
            if not await user_button.count():
                # Not logged in: attempt a fresh login with the task credentials.
                print("Not logged in, attempting to login...", file=sys.stderr)
                await page.click('a:has-text("Log in")')
                await page.wait_for_load_state("networkidle")
                await page.fill('input[name="_username"]', "movie_reviewer_2024")
                await page.fill('input[name="_password"]', "movie_reviewer_2024")
                await page.click('button:has-text("Log in")')
                await page.wait_for_load_state("networkidle")
                # Re-probe for the username button to confirm the login worked.
                user_button = page.locator('button:has-text("movie_reviewer_2024")')
                if not await user_button.count():
                    print(
                        "Error: Login failed for movie_reviewer_2024", file=sys.stderr
                    )
                    return False
                print("Successfully logged in as movie_reviewer_2024", file=sys.stderr)
            else:
                print("Already logged in as movie_reviewer_2024", file=sys.stderr)
            # Navigate to the movies forum.
            print("Navigating to movies forum...", file=sys.stderr)
            await page.goto(
                f"{BASE_URL}/f/movies", wait_until="networkidle"
            )
            # Look for the submission with the exact required title.
            print(
                "Looking for submission 'Wonderful Movies Analysis: Community Favorites [2024]'...",
                file=sys.stderr,
            )
            post_link = page.locator(
                'a:has-text("Wonderful Movies Analysis: Community Favorites [2024]")'
            )
            if not await post_link.count():
                print(
                    "Error: Could not find submission with required title",
                    file=sys.stderr,
                )
                return False
            # Click on the submission to view its content.
            await post_link.first.click()
            await page.wait_for_load_state("networkidle")
            # Extract the submission body content. Postmill markup varies, so
            # try multiple possible selectors for the post body until one
            # yields text containing the first expected key.
            post_content = None
            selectors = [
                ".submission__body",
                ".post-body",
                ".RichText",
                '[class*="RichText"]',
                'div:has(> p:has-text("Total_Year_Posts"))',
                'div:has-text("Total_Year_Posts"):has-text("Total_Image_Posts_5Pages")',
            ]
            for selector in selectors:
                content_element = page.locator(selector)
                if await content_element.count():
                    post_content = await content_element.first.inner_text()
                    if "Total_Year_Posts" in post_content:
                        print(
                            f"Found submission content using selector: {selector}",
                            file=sys.stderr,
                        )
                        break
            if not post_content or "Total_Year_Posts" not in post_content:
                print(
                    "Error: Could not find submission body with required format",
                    file=sys.stderr,
                )
                return False
            print("Submission content found, parsing data...", file=sys.stderr)
            print(f"Raw content: {post_content[:200]}...", file=sys.stderr)
            # Parse the Key|Value format into a dict of extracted fields.
            extracted_data = parse_key_value_format(post_content)
            print(f"Extracted data: {extracted_data}", file=sys.stderr)
            # Load expected values from label.txt (optional: verification
            # degrades to format-only checks when the file is absent).
            label_path = Path(__file__).parent / "label.txt"
            if label_path.exists():
                with open(label_path, "r") as f:
                    expected_text = f.read().strip()
                expected_data = parse_key_value_format(expected_text)
                print("Loaded expected values from label.txt", file=sys.stderr)
            # Verify all required keys are present.
            required_keys = [
                "Total_Year_Posts",
                "Top1_Title",
                "Top1_Upvotes",
                "Top1_Comments",
                "Top2_Title",
                "Top2_Upvotes",
                "Top2_Comments",
                "Top3_Title",
                "Top3_Upvotes",
                "Top3_Comments",
                "Rittenhouse_Upvotes",
                "Rittenhouse_Comments",
                "Total_Image_Posts_5Pages",
            ]
            missing_keys = []
            for key in required_keys:
                if key not in extracted_data:
                    missing_keys.append(key)
            if missing_keys:
                print(
                    f"Error: Missing required keys: {', '.join(missing_keys)}",
                    file=sys.stderr,
                )
                return False
            # Validate data format and content; collect all problems rather
            # than failing on the first one.
            errors = []
            # Check Total_Year_Posts is a number and matches expected.
            try:
                total_posts = int(extracted_data["Total_Year_Posts"])
                # NOTE: 'expected_data' is only bound when label.txt existed
                # above; the locals() probe guards against a NameError.
                if "expected_data" in locals() and "Total_Year_Posts" in expected_data:
                    expected_total = int(expected_data["Total_Year_Posts"])
                    if total_posts != expected_total:
                        errors.append(
                            f"Total_Year_Posts mismatch: got {total_posts}, expected {expected_total}"
                        )
            except ValueError:
                errors.append(
                    f"Total_Year_Posts must be a number, got: {extracted_data['Total_Year_Posts']}"
                )
            # If we have expected data, compare against it field by field.
            if "expected_data" in locals():
                # Compare each field.
                for key in required_keys:
                    if key in expected_data and key in extracted_data:
                        expected_val = normalize_text(expected_data[key])
                        actual_val = normalize_text(extracted_data[key])
                        # For numeric fields, compare as integers.
                        if (
                            "Upvotes" in key
                            or "Comments" in key
                            or key == "Total_Year_Posts"
                            or key == "Total_Image_Posts_5Pages"
                        ):
                            try:
                                expected_int = int(expected_val)
                                actual_int = int(actual_val)
                                if expected_int != actual_int:
                                    errors.append(
                                        f"{key} mismatch: got {actual_int}, expected {expected_int}"
                                    )
                            except ValueError:
                                errors.append(
                                    f"{key} should be numeric: got '{actual_val}'"
                                )
                        else:
                            # For text fields, compare normalized text.
                            if expected_val != actual_val:
                                errors.append(
                                    f"{key} mismatch: got '{actual_val}', expected '{expected_val}'"
                                )
            else:
                # If no expected data, just do basic validation: every key
                # must be present and actually filled in.
                for key in required_keys:
                    if key not in extracted_data:
                        errors.append(f"Missing required key: {key}")
                    elif (
                        not extracted_data[key] or extracted_data[key] == "[FILL_VALUE]"
                    ):
                        errors.append(f"{key} was not filled in")
            if errors:
                print(
                    "Error: Validation failed with the following issues:",
                    file=sys.stderr,
                )
                for error in errors:
                    print(f"  - {error}", file=sys.stderr)
                return False
            # All checks passed — report a summary on stdout.
            print("Success: Wonderful movies analysis task completed successfully.")
            print("- Account movie_reviewer_2024 verified")
            print(
                "- Submission 'Wonderful Movies Analysis: Community Favorites [2024]' found"
            )
            print(f"- Total posts with years: {extracted_data['Total_Year_Posts']}")
            print("- Top 3 posts by upvotes identified and documented")
            print(
                f"- Rittenhouse Square data: {extracted_data['Rittenhouse_Upvotes']} upvotes, {extracted_data['Rittenhouse_Comments']} comments"
            )
            print(
                f"- Total image posts across 5 pages: {extracted_data['Total_Image_Posts_5Pages']}"
            )
            print("- All data in correct Key|Value format")
            return True
        except PlaywrightTimeoutError as e:
            print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr)
            return False
        except Exception as e:
            print(f"Error: Unexpected error - {str(e)}", file=sys.stderr)
            return False
        finally:
            # Always release the browser, even on failure paths.
            await browser.close()
def main():
    """Run the async verification and exit 0 on success, 1 on failure."""
    passed = asyncio.run(verify())
    sys.exit(0 if passed else 1)


if __name__ == "__main__":
    main()