Individual Comments

L3
ModelContextProtocol · Filesystem · Legal Document

Extract and analyze individual reviewer comments on legal clauses across multiple document versions to understand personal perspectives.

Created by Lingjun Chen
2025-08-15
Data Extraction · Cross Referencing · Pattern Analysis

Model Ranking

Click on the dots to view the trajectory of each task run
Model
Run Results
Pass@4
Pass^4
Avg Time
Avg Turns
Input Tokens
Output Tokens
Total Tokens
OpenAI
gpt-5
4
/4
153.2s
6.8
172,826
7,674
180,500
OpenAI
o3
4
/4
98.5s
9.5
287,857
4,595
292,452
Gemini
gemini-2-5-pro
3
/4
110.2s
9.3
307,835
9,416
317,251
Grok
grok-4
2
/4
125.1s
7.5
-
-
-
Claude
claude-4-1-opus
0
/1
--
156.2s
7.0
193,146
1,875
195,021
Claude
claude-4-sonnet
0
/4
84.9s
6.0
184,560
1,813
186,373
DeepSeek
deepseek-chat
0
/4
165.3s
8.3
320,694
1,948
322,641
MoonshotAI
k2
0
/4
95.3s
7.0
171,845
1,434
173,278
Qwen
qwen-3-coder
0
/4
137.6s
10.3
488,648
1,419
490,067

Task State

Task Initial State Files
Download ZIP package to view the complete file structure
legal_document/
└── legal_files/
    ├── Preferred_Stock_Purchase_Agreement_v0.txt
    ├── Preferred_Stock_Purchase_Agreement_v1.txt
    ├── Preferred_Stock_Purchase_Agreement_v2.txt
    ├── Preferred_Stock_Purchase_Agreement_v3.txt
    ├── Preferred_Stock_Purchase_Agreement_v4.txt
    ├── Preferred_Stock_Purchase_Agreement_v5.txt
    ├── Preferred_Stock_Purchase_Agreement_v6.txt
    ├── Preferred_Stock_Purchase_Agreement_v7.txt
    ├── Preferred_Stock_Purchase_Agreement_v8.txt
    ├── Preferred_Stock_Purchase_Agreement_v9.txt
    └── Preferred_Stock_Purchase_Agreement_v10.txt

Instruction



Verify

*.py
Python
#!/usr/bin/env python3
"""
Verification script for Legal Document Individual Comments Task
"""

import sys
from pathlib import Path
import csv
import os

def get_test_directory() -> Path:
    """Return the directory named by the FILESYSTEM_TEST_DIR environment variable.

    Raises:
        ValueError: If the variable is unset or set to an empty string.
    """
    configured = os.environ.get("FILESYSTEM_TEST_DIR", "")
    if configured:
        return Path(configured)
    raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")

def verify_output_file_exists(test_dir: Path) -> bool:
    """Report whether 'individual_comment.csv' exists directly under test_dir.

    Prints a pass/fail line and returns True only when the path exists.
    """
    found = (test_dir / "individual_comment.csv").exists()
    if found:
        print("✅ Output file 'individual_comment.csv' found")
    else:
        print("❌ File 'individual_comment.csv' not found")
    return found

def verify_csv_format(test_dir: Path) -> bool:
    """Check the row/column shape of 'individual_comment.csv' under test_dir.

    The file must contain a header row plus at least one data row, and every
    row must have exactly 7 cells. A diagnostic line is printed for the first
    problem found. Any exception raised while reading or checking is reported
    and treated as a failure.

    Returns:
        True when the shape is as expected, False otherwise.
    """
    expected_width = 7  # First column (can be anything) + 6 clauses
    csv_path = test_dir / "individual_comment.csv"

    try:
        with open(csv_path, 'r', newline='', encoding='utf-8') as handle:
            table = list(csv.reader(handle))

            if len(table) == 0:
                print("❌ CSV file is empty")
                return False

            # A lone header row carries no data to verify.
            if len(table) == 1:
                print("❌ CSV file has insufficient rows")
                return False

            header, *records = table
            if len(header) != expected_width:
                print(f"❌ Header row has incorrect number of columns: {len(header)}, expected 7")
                return False

            # Locate the first data row (1-based) whose width is wrong, if any.
            offender = next(
                ((row_no, record)
                 for row_no, record in enumerate(records, 1)
                 if len(record) != expected_width),
                None,
            )
            if offender is not None:
                row_no, record = offender
                print(f"❌ Data row {row_no} has incorrect number of columns: {len(record)}, expected 7")
                return False

            print("✅ CSV format is correct")
            return True

    except Exception as e:
        print(f"❌ Error reading CSV file: {e}")
        return False

def verify_csv_content(test_dir: Path) -> bool:
    """Verify that the CSV content matches the expected answer exactly.

    The first header cell may be anything; the remaining six header cells must
    be exactly the expected clause identifiers, in any order. Each data row is
    keyed by its first cell (the reviewer name) and its values are re-ordered
    into the expected clause order before comparison, so column order in the
    file does not matter.

    The check fails when the file is empty, a clause column is missing or
    unexpected, a reviewer name is missing, unexpected, or appears on more
    than one row, or any value differs from the expected string.

    Args:
        test_dir: Directory expected to contain 'individual_comment.csv'.

    Returns:
        True when the content matches, False otherwise. A diagnostic line is
        printed for the first problem found. Any exception raised while
        reading or checking is reported and treated as a failure.
    """
    output_file = test_dir / "individual_comment.csv"

    # Expected data based on answer.csv
    expected_data = {
        "Bill Harvey": ["0", "2", "3", "1", "1", "1"],
        "Michelle Jackson": ["0", "1", "2", "1", "1", "1"],
        "David Russel": ["2", "1", "1", "2", "1", "1"],
        "Tony Taylor": ["2", "0", "1", "2", "1", "1"]
    }

    # Expected header columns (excluding first column which can be anything)
    expected_header_columns = ["1.1", "1.3", "4.6", "4.16", "6.8", "6.16"]

    try:
        with open(output_file, 'r', newline='', encoding='utf-8') as csvfile:
            rows = list(csv.reader(csvfile))

        # Report an empty file explicitly instead of tripping over rows[0].
        if not rows:
            print("❌ CSV file is empty")
            return False

        # Verify header has correct number of columns
        header = rows[0]
        if len(header) != 7:  # First column + 6 clauses
            print(f"❌ Header row has incorrect number of columns: {len(header)}, expected 7")
            return False

        # Allow first column to be anything, so we check columns 1-6
        header_clauses = header[1:7]

        missing_clauses = [
            clause for clause in expected_header_columns
            if clause not in header_clauses
        ]
        if missing_clauses:
            print(f"❌ Missing expected clause columns: {missing_clauses}")
            return False

        extra_clauses = [
            clause for clause in header_clauses
            if clause not in expected_header_columns
        ]
        if extra_clauses:
            print(f"❌ Unexpected extra clause columns: {extra_clauses}")
            return False

        # Map each clause to its column index in a full row
        # (+1 because header_clauses skips the first column).
        column_of = {
            clause: position + 1
            for position, clause in enumerate(header_clauses)
        }

        # Parse the data rows, re-ordering values into the expected clause order.
        csv_data = {}
        duplicate_names = []
        for row in rows[1:]:
            if len(row) < 7:
                # Rows that are too short are skipped here; verify_csv_format
                # is the step that rejects rows of the wrong width.
                continue
            name = row[0]
            if name in csv_data:
                # A later row must not silently replace an earlier one.
                if name not in duplicate_names:
                    duplicate_names.append(name)
                continue
            csv_data[name] = [
                row[column_of[clause]] for clause in expected_header_columns
            ]

        if duplicate_names:
            print(f"❌ Duplicate names found: {duplicate_names}")
            return False

        missing_names = [name for name in expected_data if name not in csv_data]
        if missing_names:
            print(f"❌ Missing expected names: {missing_names}")
            return False

        extra_names = [name for name in csv_data if name not in expected_data]
        if extra_names:
            print(f"❌ Unexpected extra names: {extra_names}")
            return False

        # Check values for each person
        for name, expected_values in expected_data.items():
            actual_values = csv_data[name]
            if actual_values != expected_values:
                print(f"❌ Values mismatch for {name}:")
                print(f"   Expected: {expected_values}")
                print(f"   Got:      {actual_values}")
                return False

        print("✅ CSV content matches expected answer exactly")
        return True

    except Exception as e:
        print(f"❌ Error verifying CSV content: {e}")
        return False

def verify_data_accuracy(test_dir: Path) -> bool:
    """Check that every data value in the CSV parses as a non-negative integer.

    The header row is ignored. For each data row with at least 7 cells, the
    six cells after the first are converted with int(); rows with fewer cells
    are skipped. The first negative or non-integer cell is reported with its
    1-based row and column position. Any exception raised while reading is
    reported and treated as a failure.

    Returns:
        True when all inspected values are valid, False otherwise.
    """
    csv_path = test_dir / "individual_comment.csv"

    try:
        with open(csv_path, 'r', newline='', encoding='utf-8') as handle:
            records = list(csv.reader(handle))[1:]  # drop the header row

            for row_no, record in enumerate(records, 1):
                if len(record) < 7:
                    continue

                person, *cells = record[:7]
                for col_no, cell in enumerate(cells, 1):
                    try:
                        number = int(cell)
                    except ValueError:
                        print(f"❌ Row {row_no}, column {col_no}: non-integer value '{cell}' for {person}")
                        return False

                    if number < 0:
                        print(f"❌ Row {row_no}, column {col_no}: negative value '{cell}' for {person}")
                        return False

            print("✅ All data values are valid non-negative integers")
            return True

    except Exception as e:
        print(f"❌ Error verifying data accuracy: {e}")
        return False

def verify_file_location(test_dir: Path) -> bool:
    """Confirm 'individual_comment.csv' sits directly under test_dir.

    Only the path test_dir / 'individual_comment.csv' is checked for
    existence; subdirectories are not searched.
    """
    candidate = test_dir / "individual_comment.csv"

    if not candidate.exists():
        print("❌ File is not in the main directory")
        return False

    print("✅ File is located in the main directory")
    return True

def main():
    """Run every verification step and exit with the overall result.

    All steps are executed even after one fails, so the output lists every
    failing check. The process exits with status 0 when every step returned
    True and status 1 otherwise.
    """
    test_dir = get_test_directory()
    print("🔍 Verifying Legal Document Individual Comments Task...")

    # (label, check) pairs, executed in this order.
    checks = (
        ("Output File Exists", verify_output_file_exists),
        ("CSV Format", verify_csv_format),
        ("CSV Content", verify_csv_content),
        ("Data Accuracy", verify_data_accuracy),
        ("File Location", verify_file_location),
    )

    failures = 0
    for label, check in checks:
        print(f"\n--- {label} ---")
        if not check(test_dir):
            failures += 1

    # Final result
    print("\n" + "="*50)
    if failures:
        print("❌ Task verification: FAIL")
        sys.exit(1)

    print("✅ Legal document individual comments task completed correctly!")
    print("🎉 Task verification: PASS")
    sys.exit(0)

# Run the verification only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()