Find Math Paper

L3
ModelContextProtocol / Filesystem / Papers

Search through academic papers to identify and locate mathematics-related content that satisfies specific mathematical criteria and research requirements.

Created by Xiangyan Liu
2025-08-12
Pattern Analysis, Data Extraction

Model Ranking

Click on the dots to view the trajectory of each task run
Model
Run Results
Pass@4
Pass^4
Avg Time
Avg Turns
Input Tokens
Output Tokens
Total Tokens
OpenAI
gpt-5
4
/4
117.4s
6.5
403,187
4,638
407,825
DeepSeek
deepseek-chat
2
/4
271.8s
26.5
624,257
2,528
626,785
Claude
claude-4-1-opus
1
/1
--
140.4s
6.0
421,671
1,378
423,049
Claude
claude-4-sonnet
1
/4
66.9s
6.3
296,938
1,339
298,277
Grok
grok-4
1
/4
103.2s
3.8
-
-
-
MoonshotAI
k2
1
/4
195.7s
13.8
581,037
1,257
582,294
Gemini
gemini-2-5-pro
0
/4
123.5s
10.3
1,413,366
6,884
1,420,249
OpenAI
o3
0
/4
538.1s
78.3
3,198,157
28,183
3,226,339
Qwen
qwen-3-coder
0
/4
306.4s
17.5
1,544,033
1,940
1,545,973

Task State

Task Initial State Files
Download ZIP package to view the complete file structure
papers/ ├── 1707.06347.html ├── 2105.04165.html ├── 2201.11903.html ├── 2303.08774.html ├── 2306.08640.html ├── 2310.02255.html ├── 2310.08446.html ├── 2312.00849.html ├── 2312.07533.html ├── 2312.11805.html ├── 2402.00253.html ├── 2402.03300.html ├── 2403.05530.html ├── 2404.13046.html ├── 2404.14367.html ├── 2404.14396.html ├── 2405.09818.html ├── 2405.13911.html ├── 2405.16473.html ├── 2405.16640.html ├── 2406.08478.html ├── 2406.16852.html ├── 2406.17294.html ├── 2407.01284.html ├── 2407.01509.html ├── 2407.21783.html ├── 2408.03326.html ├── 2408.12528.html ├── 2409.19256.html ├── 2410.05993.html ├── 2410.06166.html ├── 2410.10563.html ├── 2410.13848.html ├── 2410.17885.html ├── 2410.21276.html ├── 2411.07975.html ├── 2411.10442.html ├── 2411.11930.html ├── 2411.14432.html ├── 2412.05271.html ├── 2412.08443.html ├── 2412.10302.html ├── 2412.15115.html ├── 2412.16720.html ├── 2412.17256.html ├── 2412.18319.html ├── 2412.20631.html ├── 2501.04686.html ├── 2501.06186.html ├── 2501.12599.html ├── 2501.12948.html ├── 2501.17811.html ├── 2502.01456.html ├── 2502.09621.html ├── 2502.10391.html ├── 2502.13923.html ├── 2503.01785.html ├── 2503.06520.html ├── 2503.06749.html ├── 2503.07065.html ├── 2503.07365.html ├── 2503.07536.html ├── 2503.10291.html ├── 2503.10615.html ├── 2503.12937.html ├── 2503.13939.html ├── 2503.14476.html ├── 2503.17352.html ├── 2503.18892.html ├── 2503.19786.html ├── 2503.20783.html ├── 2503.21620.html ├── 2503.21776.html ├── 2503.22679.html ├── 2504.02587.html ├── 2504.05599.html ├── 2504.07491.html ├── 2504.07934.html ├── 2504.07954.html ├── 2504.11455.html ├── 2504.14945.html ├── 2504.16656.html ├── 2505.00703.html └── arxiv_2025.bib

Instruction



Verify

*.py
Python
#!/usr/bin/env python3
"""
Verification script for Find Math Paper Task
"""

import sys
from pathlib import Path
import os

def get_test_directory() -> Path:
    """Return the test root named by the FILESYSTEM_TEST_DIR environment variable.

    Returns:
        The variable's value wrapped in a ``Path``.

    Raises:
        ValueError: If FILESYSTEM_TEST_DIR is unset or empty.
    """
    configured_root = os.environ.get("FILESYSTEM_TEST_DIR")
    if configured_root:
        return Path(configured_root)
    raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")

def verify_answer_file_exists(test_dir: Path) -> bool:
    """Verify that answer.html exists as a file directly under ``test_dir``.

    Args:
        test_dir: Directory in which ``answer.html`` is expected.

    Returns:
        True if ``test_dir / "answer.html"`` is a file, False otherwise.
        A status line is printed in both cases.
    """
    answer_file = test_dir / "answer.html"

    # is_file() rather than exists(): a directory that happens to be named
    # answer.html must not be accepted as the renamed paper.
    if not answer_file.is_file():
        print("❌ File 'answer.html' not found")
        return False

    print("✅ answer.html found")
    return True

def verify_original_file_removed(test_dir: Path) -> bool:
    """Check that 2407.01284.html is no longer present directly under ``test_dir``.

    Args:
        test_dir: Directory that originally contained the paper.

    Returns:
        True when ``test_dir / "2407.01284.html"`` does not exist, False when
        it does. A status line is printed in both cases.
    """
    still_present = (test_dir / "2407.01284.html").exists()

    if still_present:
        print("❌ Original file 2407.01284.html still exists")
    else:
        print("✅ Original file has been renamed")

    return not still_present

def main():
    """Run every verification step and exit with the overall result.

    Resolves the test directory, runs each check against it (all checks run
    even after a failure so every status line is printed), then exits with
    status 0 when all checks passed and 1 otherwise.
    """
    test_dir = get_test_directory()
    print("🔍 Verifying Find Math Paper Task...")

    # Ordered (label, check) pairs; each check takes the test directory
    # and returns a bool.
    checks = (
        ("Answer File Exists", verify_answer_file_exists),
        ("Original File Renamed", verify_original_file_removed),
    )

    failure_count = 0
    for label, check in checks:
        print(f"\n--- {label} ---")
        passed = check(test_dir)
        if not passed:
            failure_count += 1

    # Summary banner and process exit code
    print("\n" + "="*50)
    if failure_count:
        print("❌ Task verification: FAIL")
        exit_code = 1
    else:
        print("✅ Paper correctly renamed to answer.html!")
        print("🎉 Task verification: PASS")
        exit_code = 0
    sys.exit(exit_code)

# Run the verification only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()