Solution Tracing
L3
Filesystem · Legal Document
Trace the evolution of clause resolutions across document versions to identify who first proposed each final accepted solution.
Created by Lingjun Chen
2025-08-15
Cross Referencing · Pattern Analysis
Model Ranking
Click on the dots to view the trajectory of each task run
Model | Run Results | Pass@4 | Pass^4 | Avg Time | Avg Turns | Input Tokens | Output Tokens | Total Tokens |
---|---|---|---|---|---|---|---|---|
grok-4 | 2 /4 | | | 135.0s | 8.5 | - | - | - |
o3 | 1 /4 | | | 100.8s | 11.0 | 476,073 | 4,338 | 480,411 |
claude-4-1-opus | 0 /1 | - | - | 143.6s | 8.0 | 294,946 | 1,614 | 296,560 |
claude-4-sonnet | 0 /4 | | | 105.8s | 6.3 | 312,807 | 1,849 | 314,656 |
deepseek-chat | 0 /4 | | | 230.8s | 11.8 | 804,553 | 1,888 | 806,441 |
gemini-2-5-pro | 0 /4 | | | 84.5s | 8.5 | 362,702 | 5,962 | 368,664 |
gpt-5 | 0 /4 | | | 94.1s | 7.3 | 277,953 | 5,102 | 283,054 |
k2 | 0 /4 | | | 156.8s | 9.0 | 360,384 | 1,455 | 361,839 |
qwen-3-coder | 0 /4 | | | 104.2s | 7.5 | 317,115 | 1,661 | 318,776 |
Task State
Task Initial State Files
Download ZIP package to view the complete file structure
legal_document/
└── legal_files/
├── Preferred_Stock_Purchase_Agreement_v0.txt
├── Preferred_Stock_Purchase_Agreement_v1.txt
├── Preferred_Stock_Purchase_Agreement_v2.txt
├── Preferred_Stock_Purchase_Agreement_v3.txt
├── Preferred_Stock_Purchase_Agreement_v4.txt
├── Preferred_Stock_Purchase_Agreement_v5.txt
├── Preferred_Stock_Purchase_Agreement_v6.txt
├── Preferred_Stock_Purchase_Agreement_v7.txt
├── Preferred_Stock_Purchase_Agreement_v8.txt
├── Preferred_Stock_Purchase_Agreement_v9.txt
└── Preferred_Stock_Purchase_Agreement_v10.txt
Instruction
Verify
Python
#!/usr/bin/env python3
"""
Verification script for Legal Document Solution Tracing Task
"""
import sys
from pathlib import Path
import csv
import os
def get_test_directory() -> Path:
    """Return the directory under test, read from ``FILESYSTEM_TEST_DIR``.

    Returns:
        The value of the ``FILESYSTEM_TEST_DIR`` environment variable as a
        ``Path``.

    Raises:
        ValueError: If the variable is unset, empty, or only whitespace.
    """
    test_root = os.environ.get("FILESYSTEM_TEST_DIR")
    # A whitespace-only value is treated like an unset one; otherwise it would
    # silently become a bogus path and every later check would fail obscurely.
    if not test_root or not test_root.strip():
        raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
    return Path(test_root)
def verify_output_file_exists(test_dir: Path) -> bool:
    """Check that ``tracing.csv`` is present as a regular file in *test_dir*.

    Args:
        test_dir: Directory expected to contain the output file.

    Returns:
        True when ``test_dir / "tracing.csv"`` is a file, False otherwise.
        A one-line diagnostic is printed in either case.
    """
    output_file = test_dir / "tracing.csv"
    # is_file() rather than exists(): a directory that happens to be named
    # tracing.csv must not count as the output file.
    if not output_file.is_file():
        print("❌ File 'tracing.csv' not found")
        return False
    print("✅ Output file 'tracing.csv' found")
    return True
def verify_csv_format(test_dir: Path) -> bool:
    """Check the shape of ``tracing.csv``: a header plus data rows, 5 columns each.

    Args:
        test_dir: Directory containing ``tracing.csv``.

    Returns:
        True when the file can be read, has at least two rows (header and one
        data row), and every row has exactly 5 columns. False otherwise, with
        a diagnostic printed for the first problem found.
    """
    output_file = test_dir / "tracing.csv"
    expected_columns = 5  # first column (can be anything) + 4 clauses

    # Only the read can raise; keep the try body limited to it.
    try:
        with open(output_file, "r", newline="", encoding="utf-8") as csvfile:
            rows = list(csv.reader(csvfile))
    except (OSError, UnicodeDecodeError, csv.Error) as e:
        print(f"❌ Error reading CSV file: {e}")
        return False

    if not rows:
        print("❌ CSV file is empty")
        return False

    # Need at least a header row and one data row.
    if len(rows) < 2:
        print("❌ CSV file has insufficient rows")
        return False

    header = rows[0]
    if len(header) != expected_columns:
        print(f"❌ Header row has incorrect number of columns: {len(header)}, expected 5")
        return False

    for i, row in enumerate(rows[1:], 1):
        if len(row) != expected_columns:
            print(f"❌ Data row {i} has incorrect number of columns: {len(row)}, expected 5")
            return False

    print("✅ CSV format is correct")
    return True
def verify_csv_content(test_dir: Path) -> bool:
    """Check that ``tracing.csv`` holds exactly the expected answer.

    The header must have 5 columns; columns 1-4 must be the clauses "4.6",
    "4.16", "6.8" and "6.16" in any order (the first header cell may be
    anything). The data rows must be exactly one ``version_number`` row and
    one ``name`` row whose values, re-ordered to the canonical clause order,
    equal the expected answer.

    Args:
        test_dir: Directory containing ``tracing.csv``.

    Returns:
        True when header and data match the expected answer, False otherwise.
        A diagnostic is printed for the first problem found.
    """
    output_file = test_dir / "tracing.csv"

    # Canonical clause order; the value lists below follow this order.
    expected_header_columns = ["4.6", "4.16", "6.8", "6.16"]
    expected_data = {
        "version_number": ["5", "6", "7", "8"],
        "name": ["Bill Harvey", "Michelle Jackson", "Michelle Jackson", "Tony Taylor"],
    }

    try:
        with open(output_file, "r", newline="", encoding="utf-8") as csvfile:
            rows = list(csv.reader(csvfile))
    except (OSError, UnicodeDecodeError, csv.Error) as e:
        print(f"❌ Error verifying CSV content: {e}")
        return False

    # Report an empty file explicitly instead of failing on rows[0].
    if not rows:
        print("❌ CSV file is empty")
        return False

    header = rows[0]
    if len(header) != 5:  # first column + 4 clauses
        print(f"❌ Header row has incorrect number of columns: {len(header)}, expected 5")
        return False

    # The first header cell may be anything; clauses live in columns 1-4.
    header_clauses = header[1:5]

    missing_clauses = [c for c in expected_header_columns if c not in header_clauses]
    if missing_clauses:
        print(f"❌ Missing expected clause columns: {missing_clauses}")
        return False

    extra_clauses = [c for c in header_clauses if c not in expected_header_columns]
    if extra_clauses:
        print(f"❌ Unexpected extra clause columns: {extra_clauses}")
        return False

    # Four cells containing all four distinct clauses form a permutation, so
    # every expected clause has exactly one column. Indices are offset by 1 to
    # address the full row (column 0 is the row label).
    clause_column = {clause: idx for idx, clause in enumerate(header_clauses, 1)}

    csv_data = {}
    duplicate_types = []
    for row in rows[1:]:
        if len(row) < 5:
            # Too short to hold every clause column; cannot be mapped.
            continue
        row_type = row[0]  # version_number or name
        if row_type in csv_data:
            # A repeated label must not silently overwrite the earlier row:
            # otherwise a wrong answer followed by a right one would pass.
            duplicate_types.append(row_type)
            continue
        csv_data[row_type] = [row[clause_column[c]] for c in expected_header_columns]

    if duplicate_types:
        print(f"❌ Duplicate row types: {duplicate_types}")
        return False

    missing_types = [t for t in expected_data if t not in csv_data]
    if missing_types:
        print(f"❌ Missing expected row types: {missing_types}")
        return False

    extra_types = [t for t in csv_data if t not in expected_data]
    if extra_types:
        print(f"❌ Unexpected extra row types: {extra_types}")
        return False

    for row_type, expected_values in expected_data.items():
        actual_values = csv_data[row_type]
        if actual_values != expected_values:
            print(f"❌ Values mismatch for {row_type}:")
            print(f" Expected: {expected_values}")
            print(f" Got: {actual_values}")
            return False

    print("✅ CSV content matches expected answer exactly")
    return True
def verify_data_accuracy(test_dir: Path) -> bool:
    """Sanity-check individual cell values in ``tracing.csv``.

    For a row labelled ``version_number``, each of columns 1-4 must parse as
    an integer in the range 5-8. For a row labelled ``name``, each of columns
    1-4 must be one of the expected names. Rows with any other label, or with
    fewer than 5 columns, are ignored by this check.

    Args:
        test_dir: Directory containing ``tracing.csv``.

    Returns:
        True when no checked value is invalid, False otherwise. A diagnostic
        is printed for the first invalid value found.
    """
    output_file = test_dir / "tracing.csv"

    # Loop-invariant lookups, built once.
    allowed_names = {"Bill Harvey", "Michelle Jackson", "Tony Taylor"}
    min_version, max_version = 5, 8

    try:
        with open(output_file, "r", newline="", encoding="utf-8") as csvfile:
            rows = list(csv.reader(csvfile))
    except (OSError, UnicodeDecodeError, csv.Error) as e:
        print(f"❌ Error verifying data accuracy: {e}")
        return False

    # Skip the header row; i is the 1-based data-row number used in messages.
    for i, row in enumerate(rows[1:], 1):
        if len(row) < 5:
            continue
        row_type = row[0]
        values = row[1:5]

        if row_type == "version_number":
            for j, value in enumerate(values, 1):
                try:
                    version = int(value)
                except ValueError:
                    print(f"❌ Row {i}, column {j}: non-integer version number '{value}'")
                    return False
                if not min_version <= version <= max_version:
                    print(f"❌ Row {i}, column {j}: version number '{value}' is out of expected range [5-8]")
                    return False
        elif row_type == "name":
            for j, value in enumerate(values, 1):
                if value not in allowed_names:
                    print(f"❌ Row {i}, column {j}: unexpected name '{value}'")
                    return False

    print("✅ All data values are accurate")
    return True
def verify_file_location(test_dir: Path) -> bool:
    """Check that ``tracing.csv`` sits directly in *test_dir*, not a subdirectory.

    Args:
        test_dir: The main directory in which the output file is expected.

    Returns:
        True when ``test_dir / "tracing.csv"`` is a regular file, False
        otherwise. A one-line diagnostic is printed in either case.
    """
    output_file = test_dir / "tracing.csv"
    # is_file() rather than exists(): a directory named tracing.csv is not
    # the output file.
    if output_file.is_file():
        print("✅ File is located in the main directory")
        return True
    print("❌ File is not in the main directory")
    return False
def main():
    """Run every verification step and exit with the overall result.

    All steps are run even after one fails, so the output lists every
    problem. The process exits with status 0 when every step passes and
    status 1 otherwise, including when the test directory is not configured.
    """
    try:
        test_dir = get_test_directory()
    except ValueError as e:
        # Report a missing configuration as a clean failure, not a traceback.
        print(f"❌ {e}")
        sys.exit(1)

    print("🔍 Verifying Legal Document Solution Tracing Task...")

    # Each step takes the test directory and returns True on success.
    verification_steps = [
        ("Output File Exists", verify_output_file_exists),
        ("CSV Format", verify_csv_format),
        ("CSV Content", verify_csv_content),
        ("Data Accuracy", verify_data_accuracy),
        ("File Location", verify_file_location),
    ]

    all_passed = True
    for step_name, verify_func in verification_steps:
        print(f"\n--- {step_name} ---")
        if not verify_func(test_dir):
            all_passed = False

    print("\n" + "=" * 50)
    if all_passed:
        print("✅ Legal document solution tracing task completed correctly!")
        print("🎉 Task verification: PASS")
        sys.exit(0)
    else:
        print("❌ Task verification: FAIL")
        sys.exit(1)
# Run the verification only when executed as a script, not on import.
if __name__ == "__main__":
    main()