Budget Computation

L3
ModelContextProtocolFilesystemDesktop Template

Analyze personal expense data extracted from desktop files to create a detailed budget summary report for financial review.

Created by Lingjun Chen
2025-08-14
Data ExtractionPattern Analysis

Model Ranking

Click on the dots to view the trajectory of each task run
Model
Run Results
Pass@4
Pass^4
Avg Time
Avg Turns
Input Tokens
Output Tokens
Total Tokens
Claude
claude-4-1-opus
0
/1
--
125.4s
7.0
35,389
2,139
37,528
Claude
claude-4-sonnet
0
/4
160.0s
7.8
37,404
2,021
39,425
DeepSeek
deepseek-chat
0
/4
149.0s
16.3
78,002
1,695
79,696
Gemini
gemini-2-5-pro
0
/4
230.4s
11.5
37,599
5,349
42,948
OpenAI
gpt-5
0
/4
98.6s
5.0
16,517
6,247
22,764
Grok
grok-4
0
/4
215.9s
6.3
-
-
-
MoonshotAI
k2
0
/4
103.9s
8.0
33,796
1,601
35,397
OpenAI
o3
0
/4
102.5s
18.8
64,097
6,510
70,607
Qwen
qwen-3-coder
0
/4
79.7s
20.5
85,546
1,659
87,206

Task State

Task Initial State Files
Download ZIP package to view the complete file structure
desktop_template/ ├── Archives/ │ ├── backup_contacts.csv │ └── tax_documents_2022.csv ├── Desktop/ │ └── contacts.csv ├── Documents/ │ ├── Personal/ │ │ └── tax_info_2023.csv │ ├── Projects/ │ │ └── budget_tracker.csv │ ├── Work/ │ │ ├── client_list.csv │ │ └── timesheet.csv │ ├── budget.csv │ └── important_dates.csv ├── Downloads/ │ ├── expenses.csv │ ├── fitness_log.csv │ └── price_comparisons.csv ├── Temp/ │ └── test_data.csv ├── book_list.txt ├── bookmark_export.txt ├── calculations.txt ├── correspondence_2023.txt ├── draft_letter.txt ├── emergency_contacts.txt ├── example.txt └── experiment_results.txt

Instruction



Verify

*.py
Python
#!/usr/bin/env python3
"""
Verification script for Budget Computation Task
"""

import sys
from pathlib import Path
import os
from collections import Counter

def get_test_directory() -> Path:
    """Get the test directory from FILESYSTEM_TEST_DIR env var."""
    test_root = os.environ.get("FILESYSTEM_TEST_DIR")
    if not test_root:
        raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
    return Path(test_root)

def verify_total_budget_file_exists(test_dir: Path) -> bool:
    """Verify that the total_budget.txt file exists."""
    budget_file = test_dir / "total_budget.txt"
    
    if not budget_file.exists():
        print("❌ File 'total_budget.txt' not found")
        return False
    
    print("✅ total_budget.txt file found")
    return True

def verify_file_format(test_dir: Path) -> bool:
    """Verify that the total_budget.txt file has proper format."""
    budget_file = test_dir / "total_budget.txt"
    
    try:
        content = budget_file.read_text()
        lines = [line.strip() for line in content.split('\n') if line.strip()]
        
        if len(lines) < 2:
            print("❌ File must contain at least 2 lines (expenses + total)")
            return False
        
        # Check that all lines except the last follow the format file_path;price
        for i, line in enumerate(lines[:-1]):
            if ';' not in line:
                print(f"❌ Line {i+1} does not contain ';' separator: {line}")
                return False
            
            parts = line.split(';')
            if len(parts) != 2:
                print(f"❌ Line {i+1} does not have exactly 2 parts: {line}")
                return False
            
            # Check if second part is a valid number
            try:
                float(parts[1])
            except ValueError:
                print(f"❌ Line {i+1} price is not a valid number: {parts[1]}")
                return False
        
        # Check if last line is a valid number (total)
        try:
            float(lines[-1])
        except ValueError:
            print(f"❌ Last line is not a valid number: {lines[-1]}")
            return False
        
        print("✅ File format is correct")
        return True
        
    except Exception as e:
        print(f"❌ Error reading or parsing file: {e}")
        return False

def verify_expense_entries(test_dir: Path) -> bool:
    """Verify that all 15 required expense entries are present."""
    budget_file = test_dir / "total_budget.txt"
    
    try:
        content = budget_file.read_text()
        lines = [line.strip() for line in content.split('\n') if line.strip()]
        
        # Should have 16 lines total (15 expenses + 1 total)
        if len(lines) != 16:
            print(f"❌ Expected 16 lines (15 expenses + 1 total), found {len(lines)}")
            return False
        
        # Check that we have exactly 15 expense entries
        expense_lines = lines[:-1]  # All lines except the last
        
        if len(expense_lines) != 15:
            print(f"❌ Expected 15 expense entries, found {len(expense_lines)}")
            return False
        
        print("✅ File contains exactly 15 expense entries")
        return True
        
    except Exception as e:
        print(f"❌ Error checking expense entries: {e}")
        return False

def verify_file_paths_and_counts(test_dir: Path) -> bool:
    """Verify that all required file paths are present with correct counts."""
    budget_file = test_dir / "total_budget.txt"
    
    try:
        content = budget_file.read_text()
        lines = [line.strip() for line in content.split('\n') if line.strip()]
        expense_lines = lines[:-1]  # All lines except the last
        
        # Extract file paths from expense lines
        file_paths = []
        for line in expense_lines:
            file_path = line.split(';')[0]
            file_paths.append(file_path)
        
        # Count occurrences of each path
        path_counts = Counter(file_paths)
        
        # Expected file paths and their counts based on answer.txt
        expected_paths = {
            'Archives/tax_documents_2022.csv': 3,
            'Documents/Personal/tax_info_2023.csv': 3,
            'Documents/budget.csv': 3,
            'Downloads/expenses.csv': 3,
            'Downloads/price_comparisons.csv': 3
        }
        
        # Helper function to check if a path contains the expected path
        def path_matches_expected(actual_path: str, expected_path: str) -> bool:
            """Check if actual path contains the expected path (allowing for prefixes like './')"""
            # Remove common prefixes like './', '../', etc.
            normalized_actual = actual_path
            while normalized_actual.startswith('./') or normalized_actual.startswith('../'):
                normalized_actual = normalized_actual[2:] if normalized_actual.startswith('./') else normalized_actual[3:]
            
            # Check if the normalized path contains the expected path
            return expected_path in normalized_actual or normalized_actual == expected_path
        
        # Check if all expected paths are present with correct counts
        for expected_path, expected_count in expected_paths.items():
            # Find matching actual paths
            matching_paths = []
            for actual_path in path_counts.keys():
                if path_matches_expected(actual_path, expected_path):
                    matching_paths.append(actual_path)
            
            if not matching_paths:
                print(f"❌ Missing expected file path: {expected_path}")
                return False
            
            # Sum up the counts from all matching paths
            total_count = sum(path_counts[path] for path in matching_paths)
            if total_count != expected_count:
                print(f"❌ Path {expected_path} has wrong count: expected {expected_count}, found {total_count}")
                print(f"   Matching paths: {matching_paths}")
                return False
        
        # Check if there are any completely unexpected paths (not matching any expected path)
        all_matching_paths = set()
        for expected_path in expected_paths.keys():
            for actual_path in path_counts.keys():
                if path_matches_expected(actual_path, expected_path):
                    all_matching_paths.add(actual_path)
        
        unexpected_paths = set(path_counts.keys()) - all_matching_paths
        if unexpected_paths:
            print(f"❌ Unexpected file paths found: {unexpected_paths}")
            return False
        
        print("✅ All expected file paths are present with correct counts")
        return True
        
    except Exception as e:
        print(f"❌ Error checking file paths: {e}")
        return False

def verify_individual_prices(test_dir: Path) -> bool:
    """Verify that all individual prices match the expected values."""
    budget_file = test_dir / "total_budget.txt"
    
    try:
        content = budget_file.read_text()
        lines = [line.strip() for line in content.split('\n') if line.strip()]
        expense_lines = lines[:-1]  # All lines except the last
        
        # Expected prices based on answer.txt
        expected_expenses = [
            ('Archives/tax_documents_2022.csv', 42000.00),
            ('Archives/tax_documents_2022.csv', 1800.00),
            ('Archives/tax_documents_2022.csv', 950.00),
            ('Documents/Personal/tax_info_2023.csv', 45000.00),
            ('Documents/Personal/tax_info_2023.csv', 2500.00),
            ('Documents/Personal/tax_info_2023.csv', 1200.00),
            ('Documents/budget.csv', 250.00),
            ('Documents/budget.csv', 180.00),
            ('Documents/budget.csv', 120.00),
            ('Downloads/expenses.csv', 45.99),
            ('Downloads/expenses.csv', 99.00),
            ('Downloads/expenses.csv', 234.50),
            ('Downloads/price_comparisons.csv', 879.99),
            ('Downloads/price_comparisons.csv', 289.99),
            ('Downloads/price_comparisons.csv', 74.99)
        ]
        
        # Helper function to check if a path contains the expected path
        def path_matches_expected(actual_path: str, expected_path: str) -> bool:
            """Check if actual path contains the expected path (allowing for prefixes like './')"""
            # Remove common prefixes like './', '../', etc.
            normalized_actual = actual_path
            while normalized_actual.startswith('./') or normalized_actual.startswith('../'):
                normalized_actual = normalized_actual[2:] if normalized_actual.startswith('./') else normalized_actual[3:]
            
            # Check if the normalized path contains the expected path
            return expected_path in normalized_actual or normalized_actual == expected_path
        
        # Parse actual expenses
        actual_expenses = []
        for line in expense_lines:
            parts = line.split(';')
            file_path = parts[0]
            price = float(parts[1])
            actual_expenses.append((file_path, price))
        
        # Create a counter for expected expenses to handle duplicates
        expected_expenses_counter = Counter(expected_expenses)
        actual_expenses_counter = Counter(actual_expenses)
        
        # Check if all expected expenses are present with correct counts
        for expected_expense, expected_count in expected_expenses_counter.items():
            expected_path, expected_price = expected_expense
            
            # Find matching actual expenses
            matching_expenses = []
            for actual_expense, actual_count in actual_expenses_counter.items():
                actual_path, actual_price = actual_expense
                if path_matches_expected(actual_path, expected_path) and abs(actual_price - expected_price) < 0.01:
                    matching_expenses.append(actual_expense)
            
            if not matching_expenses:
                print(f"❌ Missing expected expense: {expected_expense}")
                return False
            
            # Sum up the counts from all matching expenses
            total_count = sum(actual_expenses_counter[expense] for expense in matching_expenses)
            if total_count != expected_count:
                print(f"❌ Expense {expected_expense} has wrong count: expected {expected_count}, found {total_count}")
                print(f"   Matching expenses: {matching_expenses}")
                return False
        
        # Check if there are any completely unexpected expenses (not matching any expected expense)
        all_matching_expenses = set()
        for expected_expense in expected_expenses_counter.keys():
            expected_path, expected_price = expected_expense
            for actual_expense in actual_expenses_counter.keys():
                actual_path, actual_price = actual_expense
                if path_matches_expected(actual_path, expected_path) and abs(actual_price - expected_price) < 0.01:
                    all_matching_expenses.add(actual_expense)
        
        unexpected_expenses = set(actual_expenses_counter.keys()) - all_matching_expenses
        if unexpected_expenses:
            print(f"❌ Unexpected expenses found: {unexpected_expenses}")
            return False
        
        print("✅ All individual prices match expected values")
        return True
        
    except Exception as e:
        print(f"❌ Error checking individual prices: {e}")
        return False

def verify_total_price(test_dir: Path) -> bool:
    """Verify that the total price is correct."""
    budget_file = test_dir / "total_budget.txt"
    
    try:
        content = budget_file.read_text()
        lines = [line.strip() for line in content.split('\n') if line.strip()]
        
        # Get the total from the last line
        total_line = lines[-1]
        try:
            actual_total = float(total_line)
        except ValueError:
            print(f"❌ Last line is not a valid number: {total_line}")
            return False
        
        # Expected total based on answer.txt
        expected_total = 95624.46
        
        if abs(actual_total - expected_total) > 0.01:  # Allow small floating point differences
            print(f"❌ Expected total {expected_total}, found {actual_total}")
            return False
        
        print("✅ Total price is correct")
        return True
        
    except Exception as e:
        print(f"❌ Error checking total price: {e}")
        return False

def verify_total_calculation(test_dir: Path) -> bool:
    """Verify that the total matches the sum of individual expenses."""
    budget_file = test_dir / "total_budget.txt"
    
    try:
        content = budget_file.read_text()
        lines = [line.strip() for line in content.split('\n') if line.strip()]
        expense_lines = lines[:-1]  # All lines except the last
        
        # Calculate sum of individual expenses
        calculated_total = 0.0
        for line in expense_lines:
            price = float(line.split(';')[1])
            calculated_total += price
        
        # Get the stated total from the last line
        stated_total = float(lines[-1])
        
        # Check if they match (allow small floating point differences)
        if abs(calculated_total - stated_total) > 0.01:
            print(f"❌ Total calculation mismatch: calculated {calculated_total:.2f}, stated {stated_total:.2f}")
            return False
        
        print("✅ Total calculation is correct")
        return True
        
    except Exception as e:
        print(f"❌ Error verifying total calculation: {e}")
        return False

def main():
    """Main verification function."""
    test_dir = get_test_directory()
    print("🔍 Verifying Budget Computation Task...")
    
    # Define verification steps
    verification_steps = [
        ("Total Budget File Exists", verify_total_budget_file_exists),
        ("File Format", verify_file_format),
        ("Expense Entries Count", verify_expense_entries),
        ("File Paths and Counts", verify_file_paths_and_counts),
        ("Individual Prices", verify_individual_prices),
        ("Total Price", verify_total_price),
        ("Total Calculation", verify_total_calculation),
    ]
    
    # Run all verification steps
    all_passed = True
    for step_name, verify_func in verification_steps:
        print(f"\n--- {step_name} ---")
        if not verify_func(test_dir):
            all_passed = False
    
    # Final result
    print("\n" + "="*50)
    if all_passed:
        print("✅ Budget computation task completed successfully!")
        print("🎉 All verification steps passed")
        print("📊 Summary:")
        print("   - 15 expense entries found")
        print("   - 5 different file paths covered")
        print("   - All individual prices correct")
        print("   - Total price: $95,624.46")
        print("   - Calculation verified")
        sys.exit(0)
    else:
        print("❌ Budget computation task verification: FAIL")
        print("Please check the errors above and ensure all requirements are met")
        sys.exit(1)

if __name__ == "__main__":
    main()