Config Parameter Audit

Level: L3
Tags: Model Context Protocol · GitHub · EasyR1

Investigate configuration changes causing training instability by analyzing commits and identifying related memory issues.

Created by Xiangyan Liu
2025-08-15
Repository Analysis · Issue Management

Model Ranking

Click on the dots to view the trajectory of each task run
Model
Run Results
Pass@4
Pass^4
Avg Time
Avg Turns
Input Tokens
Output Tokens
Total Tokens
OpenAI
gpt-5
1
/4
532.2s
17.5
3,546,439
14,507
3,560,946
Claude
claude-4-1-opus
0
/1
--
577.8s
18.0
3,295,742
2,949
3,298,691
Claude
claude-4-sonnet
0
/4
153.0s
8.5
1,076,750
1,480
1,078,230
DeepSeek
deepseek-chat
0
/4
96.5s
4.5
250,166
354
250,520
Gemini
gemini-2-5-pro
0
/4
54.0s
1.8
11,348
5,022
16,371
Grok
grok-4
0
/4
30.1s
-
-
-
-
MoonshotAI
k2
0
/4
325.4s
16.0
1,614,813
1,466
1,616,279
OpenAI
o3
0
/4
171.6s
11.0
1,053,736
3,837
1,057,572
Qwen
qwen-3-coder
0
/4
593.0s
10.8
2,563,003
1,244
2,564,247

Task State


Instruction



Verify

*.py
Python
import sys
import os
import json
import requests
import re
from typing import Dict, Optional, Tuple
from dotenv import load_dotenv

# Load MCP credentials (MCP_GITHUB_TOKEN, GITHUB_EVAL_ORG) from the local
# environment file before any GitHub API calls are made.
load_dotenv(".mcp_env")


def _get_github_api(
    endpoint: str, headers: Dict[str, str]
) -> Tuple[bool, Optional[Dict]]:
    """Make a GET request to the GitHub API for the EasyR1 repository.

    Args:
        endpoint: Path under the repository, e.g. "commits/<sha>" or
            "issues?state=all&per_page=100&page=1".
        headers: HTTP headers, including the Authorization token.

    Returns:
        (True, parsed JSON body) on HTTP 200; (False, None) on 404, any
        other status code, or a network error.
    """
    github_org = os.environ.get("GITHUB_EVAL_ORG")
    url = f"https://api.github.com/repos/{github_org}/EasyR1/{endpoint}"
    try:
        # requests has no default timeout; without one a stalled connection
        # would hang the verifier indefinitely.
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code == 200:
            return True, response.json()
        elif response.status_code == 404:
            return False, None
        else:
            print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
            return False, None
    except Exception as e:
        # Best-effort: report and signal failure rather than crash the run.
        print(f"Exception for {endpoint}: {e}", file=sys.stderr)
        return False, None


def _get_analysis_results(headers: Dict[str, str]) -> Optional[Dict]:
    """Fetch and parse ANALYSIS_RESULTS.json from the repository root.

    Returns the parsed dict, or None when the file is missing, empty, or
    not valid JSON.
    """
    ok, file_data = _get_github_api("contents/ANALYSIS_RESULTS.json", headers)
    if not ok:
        return None

    # The contents API returns the file body base64-encoded.
    import base64

    raw = file_data.get("content", "")
    if not raw:
        return None
    try:
        return json.loads(base64.b64decode(raw).decode("utf-8"))
    except Exception as exc:
        print(f"Error parsing JSON: {exc}", file=sys.stderr)
        return None


def _verify_commit_data(results: Dict, headers: Dict[str, str]) -> bool:
    """Verify the commit data is accurate."""
    commit_sha = results.get("target_commit_sha")

    # Validate SHA format
    if not re.match(r"^[a-f0-9]{40}$", commit_sha, re.IGNORECASE):
        print(f"Error: Invalid commit SHA format: {commit_sha}", file=sys.stderr)
        return False

    # Get commit details
    success, commit_data = _get_github_api(f"commits/{commit_sha}", headers)
    if not success:
        print(f"Error: Commit {commit_sha} not found in repository", file=sys.stderr)
        return False

    # Verify author
    expected_author = results.get("commit_author")
    actual_author = commit_data.get("author", {}).get("login")
    if expected_author != actual_author:
        print(
            f"Error: Commit author mismatch. Expected: {expected_author}, Actual: {actual_author}",
            file=sys.stderr,
        )
        return False

    # Verify date format
    commit_date = results.get("commit_date")
    if not re.match(r"^\d{4}-\d{2}-\d{2}$", commit_date):
        print(
            f"Error: Invalid date format: {commit_date}. Expected YYYY-MM-DD",
            file=sys.stderr,
        )
        return False

    return True


def _verify_parameter_changes(results: Dict, headers: Dict[str, str]) -> bool:
    """Verify the parameter changes are accurate."""
    param_changes = results.get("parameter_changes", {})

    # Check required parameters exist
    required_params = [
        "micro_batch_size_per_device_for_update",
        "micro_batch_size_per_device_for_experience",
    ]
    for param in required_params:
        if param not in param_changes:
            print(f"Error: Missing parameter change data for: {param}", file=sys.stderr)
            return False

        change_data = param_changes[param]
        if not all(key in change_data for key in ["before", "after", "line_number"]):
            print(
                f"Error: Incomplete change data for parameter: {param}", file=sys.stderr
            )
            return False

    # Verify specific expected values based on known repository state
    update_param = param_changes.get("micro_batch_size_per_device_for_update", {})
    if update_param.get("before") != 4 or update_param.get("after") != 1:
        print(
            "Error: Incorrect values for micro_batch_size_per_device_for_update",
            file=sys.stderr,
        )
        return False

    experience_param = param_changes.get(
        "micro_batch_size_per_device_for_experience", {}
    )
    if experience_param.get("before") != 16 or experience_param.get("after") != 2:
        print(
            "Error: Incorrect values for micro_batch_size_per_device_for_experience",
            file=sys.stderr,
        )
        return False

    return True


def _get_all_issues_with_keywords(headers: Dict[str, str]) -> set:
    """Find all repository issues containing any of the required keywords.

    Scans every issue (open and closed) and returns the set of issue
    numbers whose title or body mentions a memory-related keyword.

    Note: GitHub's REST "issues" endpoint also returns pull requests; those
    items carry a "pull_request" key and are skipped, since this function
    is meant to collect issues only.
    """
    required_keywords = ["oom", "memory", "batch", "显存"]
    keyword_issues = set()

    # Page through all issues (both open and closed), 100 per page.
    page = 1
    while True:
        success, issues = _get_github_api(
            f"issues?state=all&per_page=100&page={page}", headers
        )
        if not success or not issues:
            break

        for issue in issues:
            # Skip pull requests mixed into the issues listing.
            if "pull_request" in issue:
                continue

            title = issue.get("title", "").lower()
            body = (issue.get("body") or "").lower()
            issue_text = f"{title} {body}"

            # Record the issue if any keyword appears in title or body.
            if any(keyword.lower() in issue_text for keyword in required_keywords):
                keyword_issues.add(issue.get("number"))

        # A short page means we've reached the last one.
        if len(issues) < 100:
            break
        page += 1

    return keyword_issues


def _verify_issue_references(results: Dict, headers: Dict[str, str]) -> bool:
    """Verify the reported issue references contain the required keywords.

    Checks that every issue number in `related_issue_number_list` exists
    and mentions at least one keyword, and that the list exactly matches
    the set found by an independent repository-wide keyword search.
    """
    issue_number_list = results.get("related_issue_number_list")

    if not isinstance(issue_number_list, list) or len(issue_number_list) == 0:
        print(
            "Error: related_issue_number_list must be a non-empty list",
            file=sys.stderr,
        )
        return False

    # Required keywords to search for (case insensitive)
    required_keywords = ["oom", "memory", "batch", "显存"]

    # First, dynamically find all issues that contain the required keywords.
    # (Removed a leftover debug print of this set — raw internal state
    # should not be written to stdout in the verifier output.)
    expected_issues = _get_all_issues_with_keywords(headers)
    provided_issues = set(issue_number_list)

    # Verify each provided issue contains at least one of the required keywords
    for issue_number in issue_number_list:
        if not isinstance(issue_number, int) or issue_number <= 0:
            print(
                f"Error: Invalid issue number format: {issue_number}", file=sys.stderr
            )
            return False

        # Get issue details
        success, issue_data = _get_github_api(f"issues/{issue_number}", headers)
        if not success:
            print(
                f"Error: Issue #{issue_number} not found in repository", file=sys.stderr
            )
            return False

        # Check if issue title or body contains any required keywords
        title = issue_data.get("title", "").lower()
        body = (issue_data.get("body") or "").lower()
        issue_text = title + " " + body

        issue_has_keyword = any(
            keyword.lower() in issue_text for keyword in required_keywords
        )

        if not issue_has_keyword:
            print(
                f"Error: Issue #{issue_number} does not contain any required keywords: {required_keywords}",
                file=sys.stderr,
            )
            return False

    # Verify agent found exactly the same issues as our dynamic search
    if provided_issues != expected_issues:
        missing = expected_issues - provided_issues
        extra = provided_issues - expected_issues
        if missing:
            print(
                f"Error: Missing issues that contain required keywords: {missing}",
                file=sys.stderr,
            )
        if extra:
            print(
                f"Error: Extra issues that don't contain required keywords: {extra}",
                file=sys.stderr,
            )
        return False

    print(
        f"✓ Found all {len(issue_number_list)} issues containing required keywords: {issue_number_list}"
    )
    return True


def verify() -> bool:
    """
    Programmatically verify that the deep commit analysis meets the requirements.
    """
    # A GitHub token is mandatory for all API calls below.
    token = os.environ.get("MCP_GITHUB_TOKEN")
    if not token:
        print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
        return False

    headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github.v3+json",
    }

    print("Verifying deep commit analysis completion...")

    # Step 1: the analysis file must exist and parse as JSON.
    print("1. Checking ANALYSIS_RESULTS.json exists and is valid...")
    results = _get_analysis_results(headers)
    if not results:
        print("Error: ANALYSIS_RESULTS.json not found or invalid JSON", file=sys.stderr)
        return False

    print("✓ Found valid ANALYSIS_RESULTS.json")

    # Steps 2-4: (announcement, check function, success message) triples.
    # The issue-reference check prints its own success line, hence None.
    checks = [
        (
            "2. Verifying commit data accuracy...",
            _verify_commit_data,
            "✓ Commit SHA, author, and date verified",
        ),
        (
            "3. Verifying parameter changes accuracy...",
            _verify_parameter_changes,
            "✓ Parameter changes verified with correct before/after values",
        ),
        (
            "4. Verifying issue references...",
            _verify_issue_references,
            None,
        ),
    ]
    for announcement, check, success_message in checks:
        print(announcement)
        if not check(results, headers):
            return False
        if success_message:
            print(success_message)

    # Final summary of everything that was verified.
    print("\n✓ Task completed successfully!")
    print("Deep commit analysis results verified:")
    print(f"- Found target commit: {results.get('target_commit_sha')}")
    print(
        "- Verified parameter changes: micro_batch_size_per_device_for_update (4→1), micro_batch_size_per_device_for_experience (16→2)"
    )
    print(
        f"- Verified memory/performance issue correlations: {results.get('related_issue_number_list')}"
    )
    print("- All data obtained through accurate GitHub API analysis")

    return True


if __name__ == "__main__":
    # Exit code 0 on successful verification, 1 on any failure.
    ok = verify()
    sys.exit(0 if ok else 1)