Config Parameter Audit
L3 · GithubEasyR1
Investigate configuration changes causing training instability by analyzing commits and identifying related memory issues.
Created by Xiangyan Liu · 2025-08-15
Repository Analysis · Issue Management
Model Ranking
| Model | Run Results | Pass@4 | Pass^4 | Avg Time | Avg Turns | Input Tokens | Output Tokens | Total Tokens |
|---|---|---|---|---|---|---|---|---|
| gpt-5 | 1/4 | | | 532.2s | 17.5 | 3,546,439 | 14,507 | 3,560,946 |
| claude-4-1-opus | 0/1 | - | - | 577.8s | 18.0 | 3,295,742 | 2,949 | 3,298,691 |
| claude-4-sonnet | 0/4 | | | 153.0s | 8.5 | 1,076,750 | 1,480 | 1,078,230 |
| deepseek-chat | 0/4 | | | 96.5s | 4.5 | 250,166 | 354 | 250,520 |
| gemini-2-5-pro | 0/4 | | | 54.0s | 1.8 | 11,348 | 5,022 | 16,371 |
| grok-4 | 0/4 | | | 30.1s | - | - | - | - |
| k2 | 0/4 | | | 325.4s | 16.0 | 1,614,813 | 1,466 | 1,616,279 |
| o3 | 0/4 | | | 171.6s | 11.0 | 1,053,736 | 3,837 | 1,057,572 |
| qwen-3-coder | 0/4 | | | 593.0s | 10.8 | 2,563,003 | 1,244 | 2,564,247 |
Verify
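The checks in the script below imply a concrete schema for ANALYSIS_RESULTS.json, which must exist at the repository root. A minimal conforming sketch: the two before/after pairs are the exact values the script asserts, while the SHA, author, date, line numbers, and issue numbers are placeholders to be filled in from the actual repository.

```json
{
  "target_commit_sha": "0123456789abcdef0123456789abcdef01234567",
  "commit_author": "example-user",
  "commit_date": "2025-01-15",
  "parameter_changes": {
    "micro_batch_size_per_device_for_update": {
      "before": 4, "after": 1, "line_number": 0
    },
    "micro_batch_size_per_device_for_experience": {
      "before": 16, "after": 2, "line_number": 0
    }
  },
  "related_issue_number_list": [1, 2]
}
```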
```python
import base64
import json
import os
import re
import sys
from typing import Dict, Optional, Tuple

import requests
from dotenv import load_dotenv

load_dotenv(".mcp_env")
def _get_github_api(
endpoint: str, headers: Dict[str, str]
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
github_org = os.environ.get("GITHUB_EVAL_ORG")
url = f"https://api.github.com/repos/{github_org}/EasyR1/{endpoint}"
try:
        response = requests.get(url, headers=headers, timeout=30)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _get_analysis_results(headers: Dict[str, str]) -> Optional[Dict]:
"""Get ANALYSIS_RESULTS.json file content."""
success, file_data = _get_github_api("contents/ANALYSIS_RESULTS.json", headers)
if not success:
return None
    # Decode the base64-encoded file body returned by the contents API
content = file_data.get("content", "")
if content:
try:
decoded_content = base64.b64decode(content).decode("utf-8")
return json.loads(decoded_content)
except Exception as e:
print(f"Error parsing JSON: {e}", file=sys.stderr)
return None
return None
def _verify_commit_data(results: Dict, headers: Dict[str, str]) -> bool:
"""Verify the commit data is accurate."""
    commit_sha = results.get("target_commit_sha")
    # Validate SHA format (40 hex characters); also guards against a missing field
    if not commit_sha or not re.match(r"^[a-f0-9]{40}$", commit_sha, re.IGNORECASE):
        print(f"Error: Invalid commit SHA format: {commit_sha}", file=sys.stderr)
        return False
# Get commit details
success, commit_data = _get_github_api(f"commits/{commit_sha}", headers)
if not success:
print(f"Error: Commit {commit_sha} not found in repository", file=sys.stderr)
return False
# Verify author
expected_author = results.get("commit_author")
    # "author" is null when the commit isn't linked to a GitHub account
    actual_author = (commit_data.get("author") or {}).get("login")
if expected_author != actual_author:
print(
f"Error: Commit author mismatch. Expected: {expected_author}, Actual: {actual_author}",
file=sys.stderr,
)
return False
# Verify date format
    commit_date = results.get("commit_date")
    if not commit_date or not re.match(r"^\d{4}-\d{2}-\d{2}$", commit_date):
print(
f"Error: Invalid date format: {commit_date}. Expected YYYY-MM-DD",
file=sys.stderr,
)
return False
return True
def _verify_parameter_changes(results: Dict, headers: Dict[str, str]) -> bool:
"""Verify the parameter changes are accurate."""
param_changes = results.get("parameter_changes", {})
# Check required parameters exist
required_params = [
"micro_batch_size_per_device_for_update",
"micro_batch_size_per_device_for_experience",
]
for param in required_params:
if param not in param_changes:
print(f"Error: Missing parameter change data for: {param}", file=sys.stderr)
return False
change_data = param_changes[param]
if not all(key in change_data for key in ["before", "after", "line_number"]):
print(
f"Error: Incomplete change data for parameter: {param}", file=sys.stderr
)
return False
# Verify specific expected values based on known repository state
update_param = param_changes.get("micro_batch_size_per_device_for_update", {})
if update_param.get("before") != 4 or update_param.get("after") != 1:
print(
"Error: Incorrect values for micro_batch_size_per_device_for_update",
file=sys.stderr,
)
return False
experience_param = param_changes.get(
"micro_batch_size_per_device_for_experience", {}
)
if experience_param.get("before") != 16 or experience_param.get("after") != 2:
print(
"Error: Incorrect values for micro_batch_size_per_device_for_experience",
file=sys.stderr,
)
return False
return True
def _get_all_issues_with_keywords(headers: Dict[str, str]) -> set:
"""Find all issues in repository that contain the required keywords."""
    required_keywords = ["oom", "memory", "batch", "显存"]  # "显存" is Chinese for GPU memory
keyword_issues = set()
# Get all issues from repository (both open and closed)
page = 1
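    # per_page=100 is the GitHub API maximum; paginate until a page comes back short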
while True:
success, issues = _get_github_api(
f"issues?state=all&per_page=100&page={page}", headers
)
if not success or not issues:
break
        for issue in issues:
            # The REST issues endpoint also returns pull requests; skip them
            if "pull_request" in issue:
                continue
            issue_number = issue.get("number")
            title = (issue.get("title") or "").lower()
            body = (issue.get("body") or "").lower()
issue_text = title + " " + body
# Check if any keyword appears in title or body
for keyword in required_keywords:
if keyword.lower() in issue_text:
keyword_issues.add(issue_number)
break
        # Fewer than a full page means this was the last page
if len(issues) < 100:
break
page += 1
return keyword_issues
def _verify_issue_references(results: Dict, headers: Dict[str, str]) -> bool:
"""Verify the issue references contain the required keywords."""
issue_number_list = results.get("related_issue_number_list")
if not isinstance(issue_number_list, list) or len(issue_number_list) == 0:
print(
"Error: related_issue_number_list must be a non-empty list",
file=sys.stderr,
)
return False
    # Required keywords to search for (case-insensitive); "显存" is Chinese for GPU memory
    required_keywords = ["oom", "memory", "batch", "显存"]
# First, dynamically find all issues that contain the required keywords
expected_issues = _get_all_issues_with_keywords(headers)
    print(f"Issues containing required keywords: {sorted(expected_issues)}")
provided_issues = set(issue_number_list)
# Verify each provided issue contains at least one of the required keywords
for issue_number in issue_number_list:
if not isinstance(issue_number, int) or issue_number <= 0:
print(
f"Error: Invalid issue number format: {issue_number}", file=sys.stderr
)
return False
# Get issue details
success, issue_data = _get_github_api(f"issues/{issue_number}", headers)
if not success:
print(
f"Error: Issue #{issue_number} not found in repository", file=sys.stderr
)
return False
# Check if issue title or body contains any required keywords
        title = (issue_data.get("title") or "").lower()
        body = (issue_data.get("body") or "").lower()
issue_text = title + " " + body
issue_has_keyword = False
for keyword in required_keywords:
if keyword.lower() in issue_text:
issue_has_keyword = True
break
if not issue_has_keyword:
print(
f"Error: Issue #{issue_number} does not contain any required keywords: {required_keywords}",
file=sys.stderr,
)
return False
# Verify agent found exactly the same issues as our dynamic search
if provided_issues != expected_issues:
missing = expected_issues - provided_issues
extra = provided_issues - expected_issues
if missing:
print(
f"Error: Missing issues that contain required keywords: {missing}",
file=sys.stderr,
)
if extra:
print(
f"Error: Extra issues that don't contain required keywords: {extra}",
file=sys.stderr,
)
return False
print(
f"✓ Found all {len(issue_number_list)} issues containing required keywords: {issue_number_list}"
)
return True
def verify() -> bool:
"""
Programmatically verify that the deep commit analysis meets the requirements.
"""
# Get GitHub token
github_token = os.environ.get("MCP_GITHUB_TOKEN")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
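    # REST v3 headers; the classic "token" scheme (fine-grained PATs also accept "Bearer")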
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
print("Verifying deep commit analysis completion...")
# 1. Check ANALYSIS_RESULTS.json exists and is valid JSON
print("1. Checking ANALYSIS_RESULTS.json exists and is valid...")
results = _get_analysis_results(headers)
if not results:
print("Error: ANALYSIS_RESULTS.json not found or invalid JSON", file=sys.stderr)
return False
print("✓ Found valid ANALYSIS_RESULTS.json")
# 2. Verify commit data accuracy
print("2. Verifying commit data accuracy...")
if not _verify_commit_data(results, headers):
return False
print("✓ Commit SHA, author, and date verified")
# 3. Verify parameter changes accuracy
print("3. Verifying parameter changes accuracy...")
if not _verify_parameter_changes(results, headers):
return False
print("✓ Parameter changes verified with correct before/after values")
# 4. Verify issue references
print("4. Verifying issue references...")
if not _verify_issue_references(results, headers):
return False
print("\n✓ Task completed successfully!")
print("Deep commit analysis results verified:")
print(f"- Found target commit: {results.get('target_commit_sha')}")
print(
"- Verified parameter changes: micro_batch_size_per_device_for_update (4→1), micro_batch_size_per_device_for_experience (16→2)"
)
print(
f"- Verified memory/performance issue correlations: {results.get('related_issue_number_list')}"
)
print("- All data obtained through accurate GitHub API analysis")
return True
if __name__ == "__main__":
    success = verify()
    sys.exit(0 if success else 1)
```
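The script exits 0 on success and 1 on any failed check, so it can run directly in CI. A hypothetical local setup, assuming the script is saved as verify.py next to a .mcp_env file supplying the two variables it reads (the variable names come from the script; the values below are placeholders):

```
# .mcp_env — placeholder values only
MCP_GITHUB_TOKEN=ghp_your_token_here
GITHUB_EVAL_ORG=your-eval-org
```

Run it with `python verify.py`; per-step progress goes to stdout and error details to stderr.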