Cloudflare Turnstile Challenge
L3
PlaywrightEval Web
Navigate websites with Cloudflare Turnstile protection, handle security challenges, bypass bot detection mechanisms, and successfully access protected content using automated browser interactions.
Created by Allison Zhan
2025-07-27
User Interaction
Model Ranking
Model | Run Results | Pass@4 | Pass^4 | Avg Time | Avg Turns | Input Tokens | Output Tokens | Total Tokens |
---|---|---|---|---|---|---|---|---|
gpt-5 | 1 /4 | | | 983.4s | 35.3 | 968,881 | 48,334 | 1,017,215 |
o3 | 1 /4 | | | 373.2s | 34.8 | 781,361 | 16,506 | 797,867 |
claude-4-1-opus | 0 /1 | - | - | 565.0s | 25.0 | 548,500 | 7,703 | 556,203 |
claude-4-sonnet | 0 /4 | | | 317.5s | 30.8 | 1,072,987 | 6,500 | 1,079,487 |
deepseek-chat | 0 /4 | | | 508.3s | 34.8 | 888,423 | 6,492 | 894,915 |
gemini-2-5-pro | 0 /4 | | | 59.0s | 6.5 | 21,491 | 1,431 | 22,922 |
grok-4 | 0 /4 | | | 118.4s | 12.3 | - | - | - |
k2 | 0 /4 | | | 350.5s | 27.3 | 411,861 | 4,807 | 416,668 |
qwen-3-coder | 0 /4 | | | 147.5s | 27.3 | 347,807 | 2,941 | 350,747 |
Task State
Instruction
Verify
Python
#!/usr/bin/env python3
"""
Simplified verification script for Playwright Cloudflare Turnstile authentication task.
This script only verifies that the model successfully reported capturing the expected
success message by checking the last assistant message in messages.json.
"""
import sys
import json
import os
# Exact success text the agent is expected to capture and report. verify()
# looks for it as a case-sensitive substring of the model's final response.
EXPECTED_SUCCESS_MESSAGE = "Authentication successful! Security challenge verified."
def get_model_response():
    """
    Return the text of the last completed assistant message, or None.

    The path of the conversation log is read from the MCP_MESSAGES environment
    variable; the file must contain a JSON list of message objects. Messages
    are scanned from newest to oldest, and the first one with
    role == 'assistant', status == 'completed' and type == 'message' that
    carries text is used. Entries that are not JSON objects are skipped
    instead of aborting the scan.

    Returns:
        str: the message content when it is a plain string, or all of its
            'text' / 'output_text' parts joined with newlines when it is a
            list of content parts.
        None: when the variable is unset, the file cannot be read or parsed,
            the payload is not a list, or no matching message with text
            exists.
    """
    messages_path = os.getenv("MCP_MESSAGES")
    print(f"MCP_MESSAGES: {messages_path}")
    if not messages_path:
        print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
        return None

    # Keep the try body limited to the I/O and parsing that can actually fail.
    try:
        # Read as UTF-8 explicitly rather than relying on the locale default.
        with open(messages_path, 'r', encoding='utf-8') as f:
            messages = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error reading messages file: {str(e)}", file=sys.stderr)
        return None

    if not isinstance(messages, list):
        print(
            "Error reading messages file: expected a JSON list, "
            f"got {type(messages).__name__}",
            file=sys.stderr,
        )
        return None

    # Walk backwards so the most recent completed assistant message wins.
    for message in reversed(messages):
        if not isinstance(message, dict):
            continue
        if (message.get('role') != 'assistant'
                or message.get('status') != 'completed'
                or message.get('type') != 'message'):
            continue

        content = message.get('content', [])
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            # Collect every text part so a multi-part reply is not truncated
            # to its first segment.
            parts = [
                item['text']
                for item in content
                if isinstance(item, dict)
                and item.get('type') in ('text', 'output_text')
                and isinstance(item.get('text'), str)
            ]
            if parts:
                return "\n".join(parts)

    print("Warning: No completed assistant message found", file=sys.stderr)
    return None
def verify():
    """
    Check whether the model's final reply reports the expected success text.

    Returns True when EXPECTED_SUCCESS_MESSAGE occurs in the reply obtained
    from get_model_response(), and False when it does not or when no reply
    is available. All diagnostics are written to stderr.
    """
    reply = get_model_response()

    # Both None and an empty string count as "no response".
    if not reply:
        print("No model response found", file=sys.stderr)
        return False

    preview = reply[:500]
    print(f"\nModel response (first 500 chars): {preview}...", file=sys.stderr)

    # Plain substring match against the expected message.
    found = EXPECTED_SUCCESS_MESSAGE in reply
    if found:
        print(f"\n✓ Success message found: '{EXPECTED_SUCCESS_MESSAGE}'", file=sys.stderr)
    else:
        print(f"\n✗ Success message NOT found: '{EXPECTED_SUCCESS_MESSAGE}'", file=sys.stderr)
    return found
def main():
    """
    Run verify() and exit the process: status 0 if it passed, 1 otherwise.
    """
    passed = verify()
    exit_code = 0 if passed else 1
    sys.exit(exit_code)


# Only run the verification when this file is executed as a script.
if __name__ == "__main__":
    main()