Extraction Table

L3
ModelContextProtocol Playwright Eval Web

Extract structured data from complex web tables, parse multi-level headers, handle dynamic content loading, transform data formats, and export comprehensive datasets.

Created by Arvin Xu
2025-08-18
Data Extraction

Model Ranking

Click on the dots to view the trajectory of each task run
Model
Run Results
Pass@4
Pass^4
Avg Time
Avg Turns
Input Tokens
Output Tokens
Total Tokens
OpenAI
gpt-5
4
/4
485.0s
20.3
652,775
21,704
674,479
OpenAI
o3
1
/4
97.7s
10.3
134,942
5,502
140,443
Claude
claude-4-1-opus
0
/1
--
151.3s
4.0
33,113
2,322
35,435
Claude
claude-4-sonnet
0
/4
108.2s
6.3
81,135
4,716
85,850
DeepSeek
deepseek-chat
0
/4
33.7s
3.0
15,364
258
15,622
Gemini
gemini-2-5-pro
0
/4
130.9s
13.3
388,505
7,930
396,435
Grok
grok-4
0
/4
163.5s
6.8
-
-
-
MoonshotAI
k2
0
/4
208.0s
8.8
147,060
3,653
150,713
Qwen
qwen-3-coder
0
/4
38.0s
7.0
152,535
890
153,425

Task State


Instruction



Verify

*.py
Python
#!/usr/bin/env python3
"""
Verification script for checking Playwright web data extraction tasks.

This script verifies whether the model successfully extracted CSV format data from web pages
by checking the last assistant message in messages.json.
"""

import csv
import json
import math
import os
import re
import sys
from io import StringIO

# Exact text of the CSV header line. validate_csv_data compares the stripped
# first line of the CSV against this string, so the single space after each
# comma is significant.
EXPECTED_HEADER_LINE = "Title, Rating, Likes, Views, Replies"
# Column names; validate_csv_data uses the length of this list as the required
# number of fields in every parsed row.
EXPECTED_HEADERS = ["Title", "Rating", "Likes", "Views", "Replies"]
# Required number of data rows, excluding the header row. Per the original
# note this mirrors the reference data.csv (not visible from this file).
EXPECTED_DATA_ROWS = 97


def get_model_response():
    """
    Return the text of the last completed assistant message, or None.

    The path of the JSON transcript is taken from the MCP_MESSAGES environment
    variable. The transcript is expected to be a JSON list of message objects.
    The list is walked from the end; the first entry with role "assistant",
    status "completed" and type "message" that yields text wins:

    * if its content is a string, that string is returned;
    * if its content is a list, the text of the first item whose type is
      "text" or "output_text" is returned.

    A matching message without usable text is skipped and the search
    continues with earlier messages. Entries that are not JSON objects are
    ignored.

    Returns:
        The message text, or None when the variable is unset, the file cannot
        be read or parsed, or no suitable message exists. Diagnostics are
        printed (path to stdout, warnings/errors to stderr).
    """
    messages_path = os.getenv("MCP_MESSAGES")
    print(f"| MCP_MESSAGES: {messages_path}")
    if not messages_path:
        print("| Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
        return None

    try:
        # JSON text is UTF-8; do not rely on the platform's default encoding,
        # which would break non-ASCII titles on non-UTF-8 locales.
        with open(messages_path, 'r', encoding='utf-8') as f:
            messages = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"| Error reading messages file: {str(e)}", file=sys.stderr)
        return None

    if not isinstance(messages, list):
        print("| Error reading messages file: top-level JSON value is not a list",
              file=sys.stderr)
        return None

    # Find the last assistant message with status completed
    for message in reversed(messages):
        if not isinstance(message, dict):
            # A stray non-object entry must not abort the whole search.
            continue
        if (message.get('role') != 'assistant' or
                message.get('status') != 'completed' or
                message.get('type') != 'message'):
            continue

        content = message.get('content', [])
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            for item in content:
                if isinstance(item, dict) and item.get('type') in ('text', 'output_text'):
                    return item.get('text', '')

    print("| Warning: No completed assistant message found", file=sys.stderr)
    return None


def extract_csv_from_response(response):
    """
    Extract the CSV payload from a model response.

    Two strategies are tried in order:

    1. Fenced code blocks. Every ``` ... ``` block is matched as a pair, and
       the last block whose tag is empty or "csv" (case-insensitive) is
       returned with surrounding whitespace stripped.
    2. Plain text. Starting at the first line that contains "Title",
       "Rating" and "Likes", lines containing a comma are collected (each
       stripped) until a blank or comma-free line follows collected data, or
       more than 100 lines have been gathered.

    Args:
        response: The model's answer as a string.

    Returns:
        The extracted CSV text (possibly an empty string), or None when
        neither a suitable code block nor a header-like line is found.
    """
    # Match *every* fenced block regardless of its tag. Consuming labelled
    # blocks (```python, ```json, ...) together with their own closing fence
    # prevents that closing fence from being mistaken for the opening fence
    # of an unlabelled block, which would swallow the prose up to the next
    # fence and hide the real ```csv block.
    fence_pattern = r'```([^\n`]*)\n(.*?)\n```'
    csv_blocks = [
        body
        for tag, body in re.findall(fence_pattern, response, re.DOTALL)
        if tag.strip().lower() in ('', 'csv')
    ]
    if csv_blocks:
        return csv_blocks[-1].strip()  # Return the last CSV block

    # If no code block found, try to find CSV data starting with the header:
    # the first line mentioning "Title", "Rating" and "Likes".
    lines = response.split('\n')
    header_index = next(
        (i for i, line in enumerate(lines)
         if "Title" in line and "Rating" in line and "Likes" in line),
        None,
    )
    if header_index is None:
        return None

    # Extract from header until empty line or non-CSV format line
    csv_lines = []
    for raw_line in lines[header_index:]:
        line = raw_line.strip()
        if not line or ',' not in line:
            if csv_lines:  # Data already collected: this line ends the table
                break
            continue  # Nothing collected yet: keep looking for the first CSV line
        csv_lines.append(line)
        if len(csv_lines) > 100:  # Prevent extracting too many rows
            break

    return '\n'.join(csv_lines)


def validate_csv_data(csv_text, *, expected_header_line=None, expected_headers=None,
                      expected_data_rows=None):
    """
    Validate the format of extracted CSV text.

    Checks, in order: total line count (header + data rows), exact header
    line, field count of every parsed row, non-empty cells, and numeric
    formatting of the Rating / Likes / Views / Replies columns (columns 1-4).
    Rating must parse as a finite float; the other three must consist of
    ASCII digits only.

    Args:
        csv_text: CSV text to validate. Leading/trailing whitespace is
            ignored for both line counting and CSV parsing.
        expected_header_line: Exact header line required; defaults to
            EXPECTED_HEADER_LINE.
        expected_headers: Column names, used for the required field count;
            defaults to EXPECTED_HEADERS.
        expected_data_rows: Required number of data rows; defaults to
            EXPECTED_DATA_ROWS.

    Returns:
        A (is_valid, message) tuple. The message describes the first failed
        check, or the success summary.
    """
    if not csv_text:
        return False, "CSV data not found"

    if expected_header_line is None:
        expected_header_line = EXPECTED_HEADER_LINE
    if expected_headers is None:
        expected_headers = EXPECTED_HEADERS
    if expected_data_rows is None:
        expected_data_rows = EXPECTED_DATA_ROWS

    try:
        # Normalise once so the line count and the csv parser see the same
        # text; otherwise surrounding blank lines pass the count check but
        # show up as empty rows in the parsed output.
        normalized = csv_text.strip()
        lines = normalized.split('\n')

        # Check total number of rows (1 header row + data rows)
        expected_total_rows = expected_data_rows + 1
        if len(lines) != expected_total_rows:
            return False, f"| CSV total row count mismatch, expected: {expected_total_rows} rows, actual: {len(lines)} rows"

        # Check header row format (must match exactly)
        header_line = lines[0].strip()
        if header_line != expected_header_line:
            return False, f"| Header format mismatch, expected: '{expected_header_line}', actual: '{header_line}'"

        # Parse CSV to validate structure
        rows = list(csv.reader(StringIO(normalized)))

        # Check column count for each row (header included)
        expected_columns = len(expected_headers)
        for i, row in enumerate(rows):
            if len(row) != expected_columns:
                return False, f"| Row {i+1} column count incorrect, expected: {expected_columns} columns, actual: {len(row)} columns"

        # Validate data row format
        valid_rows = 0
        for i, row in enumerate(rows[1:], 2):  # Skip header, start from row 2
            # Check if each column has data
            if not all(cell.strip() for cell in row):
                return False, f"| Row {i} contains empty data"

            # Check numeric column format (Rating, Likes, Views, Replies should not have quotes)
            for col_idx, col_name in [(1, "Rating"), (2, "Likes"), (3, "Views"), (4, "Replies")]:
                value = row[col_idx].strip()

                # Check for quotes (should not have any).
                # NOTE(review): csv.reader already removes quotes from a field
                # that is quoted right after the delimiter, so this only
                # catches quotes that survive parsing (e.g. preceded by a
                # space, as in `, "4.5"`). Confirm whether `,"4.5"` should
                # also be rejected.
                if value.startswith('"') and value.endswith('"'):
                    return False, f"| Row {i} {col_name} should not have quotes, actual: {value}"

                # Check numeric format
                if col_name == "Rating":
                    try:
                        rating = float(value)
                    except ValueError:
                        rating = None
                    # float() also accepts "nan" and "inf"; those are not ratings.
                    if rating is None or not math.isfinite(rating):
                        return False, f"| Row {i} {col_name} should be a number, actual: {value}"
                elif not re.fullmatch(r'[0-9]+', value):
                    # str.isdigit() would also accept non-ASCII digits such as "²".
                    return False, f"| Row {i} {col_name} should be pure digits, actual: {value}"

            valid_rows += 1

        # Validate number of data rows (can differ from the line count when a
        # quoted field spans several lines)
        if valid_rows != expected_data_rows:
            return False, f"| Valid data row count mismatch, expected: {expected_data_rows} rows, actual: {valid_rows} rows"

        return True, f"| CSV validation successful: format matches data.csv exactly, {valid_rows} valid data rows"

    except Exception as e:
        # Deliberate catch-all: any parsing failure is reported as a failed
        # validation rather than crashing the verifier.
        return False, f"| CSV format parsing error: {str(e)}"


def verify():
    """
    Run the complete check on the model's final answer.

    Stages: load the last assistant message, extract the CSV text from it,
    then validate that CSV. This function's own progress and failure
    messages are written to stderr.

    Returns:
        True if every stage succeeds, False as soon as one fails.
    """
    def report(text):
        # All diagnostics emitted here go to stderr.
        print(text, file=sys.stderr)

    # Stage 1: the model's answer
    answer = get_model_response()
    if not answer:
        report("| Model response not found")
        return False
    report(f"|\n| Model response (first 500 characters): {answer[:500]}...")

    # Stage 2: the CSV embedded in that answer
    table_text = extract_csv_from_response(answer)
    if not table_text:
        report("|\n| ✗ CSV data not found in response")
        return False
    report(f"|\n| Found CSV data (first 300 characters):\n| {table_text[:300]}...")

    # Stage 3: format validation
    passed, detail = validate_csv_data(table_text)
    if not passed:
        report(f"|\n| ✗ CSV validation failed: {detail}")
        return False

    report(f"|\n| ✓ {detail}")
    return True


def main():
    """
    Script entry point: run verify() and terminate the process with exit
    status 0 when it succeeds and 1 when it fails.
    """
    exit_code = 0 if verify() else 1
    sys.exit(exit_code)


# Run the verification only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()