Model Leaderboard
Updated at 12/15/2025 06:36:08
• claude-opus-4-1 ran only the pass@1 evaluation (a single run), so it has no pass@4 or pass^4 results
MCP Server Analysis
Detailed performance analysis by MCP server, reporting pass@1, pass@4, and pass^4 metrics (k = 4 runs per task)
Note: view the JSON below for detailed grade information.
JSON
{
"generated_at": "2025-12-15T06:36:08.102520",
"k": 4,
"overall": {
"deepseek-v3-1-terminus-thinking": {
"total_tasks": 127,
"total_agent_execution_time": 373135.9043517113,
"total_input_tokens": 172323078,
"total_output_tokens": 7437158,
"total_tokens": 179760236,
"total_turns": 6603,
"avg_agent_execution_time": 734.5195,
"avg_input_tokens": 339218.6575,
"avg_output_tokens": 14640.0748,
"avg_total_tokens": 353858.7323,
"avg_turns": 12.998,
"per_run_input_tokens": 43080769.5,
"per_run_output_tokens": 1859289.5,
"per_run_cost": 10.5158,
"actual_model_name": "deepseek-v3.1-terminus-thinking",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2126,
"std": 0.0329
},
"pass@4": 0.3701,
"pass^4": 0.0551
},
"deepseek-chat": {
"total_tasks": 127,
"total_agent_execution_time": 137112.82717490196,
"total_input_tokens": 250467772,
"total_output_tokens": 1425635,
"total_tokens": 251893407,
"total_turns": 9872,
"avg_agent_execution_time": 269.9071,
"avg_input_tokens": 493046.7953,
"avg_output_tokens": 2806.3681,
"avg_total_tokens": 495853.1634,
"avg_turns": 19.4331,
"per_run_input_tokens": 62616943,
"per_run_output_tokens": 356408.75,
"per_run_cost": 35.664255,
"actual_model_name": "deepseek-v3.1-non-think",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1673,
"std": 0.0141
},
"pass@4": 0.2835,
"pass^4": 0.0787
},
"gpt-4-1": {
"total_tasks": 127,
"total_agent_execution_time": 30320.14462161064,
"total_input_tokens": 164083441,
"total_output_tokens": 787047,
"total_tokens": 164870488,
"total_turns": 5052,
"avg_agent_execution_time": 59.6853,
"avg_input_tokens": 322998.8996,
"avg_output_tokens": 1549.3051,
"avg_total_tokens": 324548.2047,
"avg_turns": 9.9449,
"per_run_input_tokens": 41020860.25,
"per_run_output_tokens": 196761.75,
"per_run_cost": 83.615814,
"actual_model_name": "gpt-4.1-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.0807,
"std": 0.0065
},
"pass@4": 0.126,
"pass^4": 0.0315
},
"claude-opus-4-1": {
"total_tasks": 127,
"total_agent_execution_time": 45950.07896590233,
"total_input_tokens": 74430756,
"total_output_tokens": 653210,
"total_tokens": 75083966,
"total_turns": 2214,
"avg_agent_execution_time": 361.8116,
"avg_input_tokens": 586068.9449,
"avg_output_tokens": 5143.3858,
"avg_total_tokens": 591212.3307,
"avg_turns": 17.4331,
"per_run_input_tokens": 74430756,
"per_run_output_tokens": 653210,
"per_run_cost": 1165.45209,
"actual_model_name": "claude-opus-4-1-20250805",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2992,
"std": 0
}
},
"glm-4-5": {
"total_tasks": 127,
"total_agent_execution_time": 84480.56672096252,
"total_input_tokens": 213186013,
"total_output_tokens": 2078679,
"total_tokens": 215264692,
"total_turns": 9214,
"avg_agent_execution_time": 166.3003,
"avg_input_tokens": 419657.5059,
"avg_output_tokens": 4091.8878,
"avg_total_tokens": 423749.3937,
"avg_turns": 18.1378,
"per_run_input_tokens": 53296503.25,
"per_run_output_tokens": 519669.75,
"per_run_cost": 18.27381,
"actual_model_name": "glm-4.5",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1555,
"std": 0.0116
},
"pass@4": 0.2441,
"pass^4": 0.063
},
"o3": {
"total_tasks": 127,
"total_agent_execution_time": 86067.44522094727,
"total_input_tokens": 210426740,
"total_output_tokens": 4365349,
"total_tokens": 214792089,
"total_turns": 8368,
"avg_agent_execution_time": 169.4241,
"avg_input_tokens": 414225.8661,
"avg_output_tokens": 8593.2067,
"avg_total_tokens": 422819.0728,
"avg_turns": 16.4724,
"per_run_input_tokens": 52606685,
"per_run_output_tokens": 1091337.25,
"per_run_cost": 113.944068,
"actual_model_name": "o3-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2539,
"std": 0.0204
},
"pass@4": 0.4331,
"pass^4": 0.126
},
"claude-sonnet-4-5": {
"total_tasks": 127,
"total_agent_execution_time": 87988.92542219162,
"total_input_tokens": 359671392,
"total_output_tokens": 3158899,
"total_tokens": 362830291,
"total_turns": 8516,
"avg_agent_execution_time": 173.2065,
"avg_input_tokens": 708014.5512,
"avg_output_tokens": 6218.3051,
"avg_total_tokens": 714232.8563,
"avg_turns": 16.7638,
"per_run_input_tokens": 89917848,
"per_run_output_tokens": 789724.75,
"per_run_cost": 281.599415,
"actual_model_name": "claude-sonnet-4-5-20250929",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.3209,
"std": 0.0232
},
"pass@4": 0.4646,
"pass^4": 0.1654
},
"claude-sonnet-4-low": {
"total_tasks": 127,
"total_agent_execution_time": 101288.96435308456,
"total_input_tokens": 602044071,
"total_output_tokens": 2510844,
"total_tokens": 604554915,
"total_turns": 9867,
"avg_agent_execution_time": 199.3877,
"avg_input_tokens": 1185126.124,
"avg_output_tokens": 4942.6063,
"avg_total_tokens": 1190068.7303,
"avg_turns": 19.4232,
"per_run_input_tokens": 150511017.75,
"per_run_output_tokens": 627711,
"per_run_cost": 460.948718,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2736,
"std": 0.017
},
"pass@4": 0.3937,
"pass^4": 0.1811
},
"gemini-2-5-pro": {
"total_tasks": 127,
"total_agent_execution_time": 60629.834964990616,
"total_input_tokens": 238581228,
"total_output_tokens": 3564083,
"total_tokens": 242145311,
"total_turns": 5561,
"avg_agent_execution_time": 119.3501,
"avg_input_tokens": 469648.0866,
"avg_output_tokens": 7015.9114,
"avg_total_tokens": 476663.998,
"avg_turns": 10.9469,
"per_run_input_tokens": 59645307,
"per_run_output_tokens": 891020.75,
"per_run_cost": 162.478579,
"actual_model_name": "gemini-2.5-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1575,
"std": 0.0056
},
"pass@4": 0.2992,
"pass^4": 0.0472
},
"qwen-3-coder-plus": {
"total_tasks": 127,
"total_agent_execution_time": 139339.28879857063,
"total_input_tokens": 722104724,
"total_output_tokens": 1784161,
"total_tokens": 723888885,
"total_turns": 12063,
"avg_agent_execution_time": 274.2899,
"avg_input_tokens": 1421465.9921,
"avg_output_tokens": 3512.128,
"avg_total_tokens": 1424978.1201,
"avg_turns": 23.7461,
"per_run_input_tokens": 180526181,
"per_run_output_tokens": 446040.25,
"per_run_cost": 36.462068,
"actual_model_name": "qwen3-coder-480b-a35b-instruct",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.248,
"std": 0.0205
},
"pass@4": 0.4094,
"pass^4": 0.126
},
"gpt-5-mini-high": {
"total_tasks": 127,
"total_agent_execution_time": 177766.2423055172,
"total_input_tokens": 528738725,
"total_output_tokens": 14617006,
"total_tokens": 543355731,
"total_turns": 9743,
"avg_agent_execution_time": 349.9335,
"avg_input_tokens": 1040824.2618,
"avg_output_tokens": 28773.6339,
"avg_total_tokens": 1069597.8957,
"avg_turns": 19.1791,
"per_run_input_tokens": 132184681.25,
"per_run_output_tokens": 3654251.5,
"per_run_cost": 40.354673,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.3032,
"std": 0.0172
},
"pass@4": 0.4646,
"pass^4": 0.1654
},
"gpt-5-nano-low": {
"total_tasks": 127,
"total_agent_execution_time": 48947.8431391716,
"total_input_tokens": 148338105,
"total_output_tokens": 6497039,
"total_tokens": 154835144,
"total_turns": 6102,
"avg_agent_execution_time": 96.354,
"avg_input_tokens": 292004.1437,
"avg_output_tokens": 12789.4469,
"avg_total_tokens": 304793.5906,
"avg_turns": 12.0118,
"per_run_input_tokens": 37084526.25,
"per_run_output_tokens": 1624259.75,
"per_run_cost": 2.50393,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0433,
"std": 0.0118
},
"pass@4": 0.1024,
"pass^4": 0.0079
},
"gpt-5-medium": {
"total_tasks": 127,
"total_agent_execution_time": 242931.40538144112,
"total_input_tokens": 318853695,
"total_output_tokens": 11128874,
"total_tokens": 329982569,
"total_turns": 7475,
"avg_agent_execution_time": 478.2114,
"avg_input_tokens": 627664.7539,
"avg_output_tokens": 21907.2323,
"avg_total_tokens": 649571.9862,
"avg_turns": 14.7146,
"per_run_input_tokens": 79713423.75,
"per_run_output_tokens": 2782218.5,
"per_run_cost": 127.463965,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.5256,
"std": 0.0129
},
"pass@4": 0.685,
"pass^4": 0.3386
},
"claude-sonnet-4": {
"total_tasks": 127,
"total_agent_execution_time": 110881.38604903221,
"total_input_tokens": 324800674,
"total_output_tokens": 2349295,
"total_tokens": 327149969,
"total_turns": 9388,
"avg_agent_execution_time": 218.2704,
"avg_input_tokens": 639371.4055,
"avg_output_tokens": 4624.5965,
"avg_total_tokens": 643996.002,
"avg_turns": 18.4803,
"per_run_input_tokens": 81200168.5,
"per_run_output_tokens": 587323.75,
"per_run_cost": 252.410362,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2815,
"std": 0.0257
},
"pass@4": 0.4488,
"pass^4": 0.126
},
"gpt-5-low": {
"total_tasks": 127,
"total_agent_execution_time": 195995.03595471382,
"total_input_tokens": 325914298,
"total_output_tokens": 9607215,
"total_tokens": 335521513,
"total_turns": 7211,
"avg_agent_execution_time": 385.817,
"avg_input_tokens": 641563.5787,
"avg_output_tokens": 18911.8406,
"avg_total_tokens": 660475.4193,
"avg_turns": 14.1949,
"per_run_input_tokens": 81478574.5,
"per_run_output_tokens": 2401803.75,
"per_run_cost": 125.866256,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.4685,
"std": 0.0287
},
"pass@4": 0.6299,
"pass^4": 0.2677
},
"gpt-5-mini-medium": {
"total_tasks": 127,
"total_agent_execution_time": 81238.54882597923,
"total_input_tokens": 374509740,
"total_output_tokens": 5235729,
"total_tokens": 379745469,
"total_turns": 7961,
"avg_agent_execution_time": 159.9184,
"avg_input_tokens": 737223.8976,
"avg_output_tokens": 10306.5531,
"avg_total_tokens": 747530.4508,
"avg_turns": 15.6713,
"per_run_input_tokens": 93627435,
"per_run_output_tokens": 1308932.25,
"per_run_cost": 26.024723,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2736,
"std": 0.0312
},
"pass@4": 0.4567,
"pass^4": 0.0945
},
"gpt-4-1-nano": {
"total_tasks": 127,
"total_agent_execution_time": 19860.039894342422,
"total_input_tokens": 98229466,
"total_output_tokens": 831039,
"total_tokens": 99060505,
"total_turns": 5477,
"avg_agent_execution_time": 39.0946,
"avg_input_tokens": 193365.0906,
"avg_output_tokens": 1635.9035,
"avg_total_tokens": 195000.9941,
"avg_turns": 10.7815,
"per_run_input_tokens": 24557366.5,
"per_run_output_tokens": 207759.75,
"per_run_cost": 2.538841,
"actual_model_name": "gpt-4.1-nano-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0
},
"grok-code-fast-1": {
"total_tasks": 127,
"total_agent_execution_time": 79566.39907479286,
"total_input_tokens": 299975054,
"total_output_tokens": 2870082,
"total_tokens": 303880002,
"total_turns": 9356,
"avg_agent_execution_time": 156.6268,
"avg_input_tokens": 590502.0748,
"avg_output_tokens": 5649.7677,
"avg_total_tokens": 598188.9803,
"avg_turns": 18.4173,
"per_run_input_tokens": 74993763.5,
"per_run_output_tokens": 717520.5,
"per_run_cost": 16.075033,
"actual_model_name": "grok-code-fast-1",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2047,
"std": 0.0339
},
"pass@4": 0.3071,
"pass^4": 0.0945
},
"deepseek-v3-2-thinking": {
"total_tasks": 127,
"total_agent_execution_time": 202188.34230470657,
"total_input_tokens": 456794076,
"total_output_tokens": 4461272,
"total_tokens": 461255348,
"total_turns": 13346,
"avg_agent_execution_time": 398.0085,
"avg_input_tokens": 899200.937,
"avg_output_tokens": 8782.0315,
"avg_total_tokens": 907982.9685,
"avg_turns": 26.2717,
"per_run_input_tokens": 114198519,
"per_run_output_tokens": 1115318,
"per_run_cost": 31.279727,
"actual_model_name": "deepseek-v3.2-reasoner",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.3681,
"std": 0.0179
},
"pass@4": 0.5118,
"pass^4": 0.2126
},
"gemini-2-5-flash": {
"total_tasks": 127,
"total_agent_execution_time": 58353.47530055046,
"total_input_tokens": 520239496,
"total_output_tokens": 4469819,
"total_tokens": 524709348,
"total_turns": 5795,
"avg_agent_execution_time": 114.869,
"avg_input_tokens": 1024093.4961,
"avg_output_tokens": 8798.8563,
"avg_total_tokens": 1032892.4173,
"avg_turns": 11.4075,
"per_run_input_tokens": 130059874,
"per_run_output_tokens": 1117454.75,
"per_run_cost": 41.811599,
"actual_model_name": "gemini-2.5-flash",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0906,
"std": 0.0068
},
"pass@4": 0.1811,
"pass^4": 0.0394
},
"gpt-5-nano-high": {
"total_tasks": 127,
"total_agent_execution_time": 157217.0769355297,
"total_input_tokens": 411756625,
"total_output_tokens": 26452351,
"total_tokens": 438208976,
"total_turns": 10031,
"avg_agent_execution_time": 309.4824,
"avg_input_tokens": 810544.5374,
"avg_output_tokens": 52071.5571,
"avg_total_tokens": 862616.0945,
"avg_turns": 19.7461,
"per_run_input_tokens": 102939156.25,
"per_run_output_tokens": 6613087.75,
"per_run_cost": 7.792193,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0512,
"std": 0.0205
},
"pass@4": 0.1417,
"pass^4": 0
},
"gpt-5-mini-low": {
"total_tasks": 127,
"total_agent_execution_time": 32084.921304941177,
"total_input_tokens": 112819524,
"total_output_tokens": 1609354,
"total_tokens": 114428878,
"total_turns": 4226,
"avg_agent_execution_time": 63.1593,
"avg_input_tokens": 222085.6772,
"avg_output_tokens": 3168.0197,
"avg_total_tokens": 225253.6969,
"avg_turns": 8.3189,
"per_run_input_tokens": 28204881,
"per_run_output_tokens": 402338.5,
"per_run_cost": 7.855897,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0827,
"std": 0.0131
},
"pass@4": 0.189,
"pass^4": 0.0079
},
"deepseek-v3-1-terminus": {
"total_tasks": 127,
"total_agent_execution_time": 124399.72146391869,
"total_input_tokens": 236208725,
"total_output_tokens": 1266817,
"total_tokens": 237475542,
"total_turns": 7753,
"avg_agent_execution_time": 244.8813,
"avg_input_tokens": 464977.8051,
"avg_output_tokens": 2493.7343,
"avg_total_tokens": 467471.5394,
"avg_turns": 15.2618,
"per_run_input_tokens": 59052181.25,
"per_run_output_tokens": 316704.25,
"per_run_cost": 12.651154,
"actual_model_name": "deepseek-v3.1-terminus",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1654,
"std": 0.0507
},
"pass@4": 0.2992,
"pass^4": 0.0394
},
"qwen-3-max": {
"total_tasks": 127,
"total_agent_execution_time": 108499.87574696541,
"total_input_tokens": 525760160,
"total_output_tokens": 1516865,
"total_tokens": 527277025,
"total_turns": 12114,
"avg_agent_execution_time": 213.5824,
"avg_input_tokens": 1034960.9449,
"avg_output_tokens": 2985.9547,
"avg_total_tokens": 1037946.8996,
"avg_turns": 23.8465,
"per_run_input_tokens": 131440040,
"per_run_output_tokens": 379216.25,
"per_run_cost": 160.003345,
"actual_model_name": "qwen3-max-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1772,
"std": 0.0131
},
"pass@4": 0.2283,
"pass^4": 0.1102
},
"claude-opus-4-5-high": {
"total_tasks": 127,
"total_agent_execution_time": 110168.67978286743,
"total_input_tokens": 346186842,
"total_output_tokens": 5350914,
"total_tokens": 351537756,
"total_turns": 8229,
"avg_agent_execution_time": 216.8675,
"avg_input_tokens": 681470.1614,
"avg_output_tokens": 10533.2953,
"avg_total_tokens": 692003.4567,
"avg_turns": 16.1988,
"per_run_input_tokens": 86546710.5,
"per_run_output_tokens": 1337728.5,
"per_run_cost": 466.176765,
"actual_model_name": "claude-opus-4-5-20251101",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.4232,
"std": 0.0196
},
"pass@4": 0.5354,
"pass^4": 0.3386
},
"deepseek-v3-2-chat": {
"total_tasks": 127,
"total_agent_execution_time": 151568.43167614937,
"total_input_tokens": 389614196,
"total_output_tokens": 2764224,
"total_tokens": 392378420,
"total_turns": 12317,
"avg_agent_execution_time": 298.3631,
"avg_input_tokens": 766957.0787,
"avg_output_tokens": 5441.3858,
"avg_total_tokens": 772398.4646,
"avg_turns": 24.2461,
"per_run_input_tokens": 97403549,
"per_run_output_tokens": 691056,
"per_run_cost": 26.575381,
"actual_model_name": "deepseek-v3.2-chat",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2972,
"std": 0.0151
},
"pass@4": 0.4646,
"pass^4": 0.1339
},
"kimi-k2-0711": {
"total_tasks": 127,
"total_agent_execution_time": 109108.96008372307,
"total_input_tokens": 236755502,
"total_output_tokens": 1502196,
"total_tokens": 238257698,
"total_turns": 9504,
"avg_agent_execution_time": 214.7814,
"avg_input_tokens": 466054.1378,
"avg_output_tokens": 2957.0787,
"avg_total_tokens": 469011.2165,
"avg_turns": 18.7087,
"per_run_input_tokens": 59188875.5,
"per_run_output_tokens": 375549,
"per_run_cost": 36.452198,
"actual_model_name": "kimi-k2-0711-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1909,
"std": 0.0161
},
"pass@4": 0.315,
"pass^4": 0.1181
},
"gemini-3-pro-high": {
"total_tasks": 127,
"total_agent_execution_time": 112964.39260983467,
"total_input_tokens": 504014234,
"total_output_tokens": 4526799,
"total_tokens": 508541033,
"total_turns": 7972,
"avg_agent_execution_time": 222.3709,
"avg_input_tokens": 992154.0039,
"avg_output_tokens": 8911.0217,
"avg_total_tokens": 1001065.0256,
"avg_turns": 15.6929,
"per_run_input_tokens": 126003558.5,
"per_run_output_tokens": 1131699.75,
"per_run_cost": 265.587514,
"actual_model_name": "gemini-3-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.5394,
"std": 0.0039
},
"pass@4": 0.6693,
"pass^4": 0.378
},
"gpt-5-high": {
"total_tasks": 127,
"total_agent_execution_time": 522861.792570591,
"total_input_tokens": 343123887,
"total_output_tokens": 18664477,
"total_tokens": 361788364,
"total_turns": 7952,
"avg_agent_execution_time": 1029.2555,
"avg_input_tokens": 675440.7224,
"avg_output_tokens": 36741.0965,
"avg_total_tokens": 712181.8189,
"avg_turns": 15.6535,
"per_run_input_tokens": 85780971.75,
"per_run_output_tokens": 4666119.25,
"per_run_cost": 153.887407,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.5157,
"std": 0.0252
},
"pass@4": 0.6614,
"pass^4": 0.3307
},
"kimi-k2-0905": {
"total_tasks": 127,
"total_agent_execution_time": 250829.567029953,
"total_input_tokens": 473201616,
"total_output_tokens": 2545700,
"total_tokens": 475747316,
"total_turns": 13691,
"avg_agent_execution_time": 493.759,
"avg_input_tokens": 931499.2441,
"avg_output_tokens": 5011.2205,
"avg_total_tokens": 936510.4646,
"avg_turns": 26.9508,
"per_run_input_tokens": 118300404,
"per_run_output_tokens": 636425,
"per_run_cost": 72.571305,
"actual_model_name": "kimi-k2-0905-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2185,
"std": 0.0116
},
"pass@4": 0.315,
"pass^4": 0.126
},
"gpt-5-nano-medium": {
"total_tasks": 127,
"total_agent_execution_time": 79832.2317507267,
"total_input_tokens": 227578931,
"total_output_tokens": 12104517,
"total_tokens": 239683448,
"total_turns": 7366,
"avg_agent_execution_time": 157.1501,
"avg_input_tokens": 447990.0217,
"avg_output_tokens": 23827.7894,
"avg_total_tokens": 471817.811,
"avg_turns": 14.5,
"per_run_input_tokens": 56894732.75,
"per_run_output_tokens": 3026129.25,
"per_run_cost": 4.055188,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.063,
"std": 0.0201
},
"pass@4": 0.1181,
"pass^4": 0.0157
},
"gpt-5-2-high": {
"total_tasks": 127,
"total_agent_execution_time": 372110.88462781906,
"total_input_tokens": 459587889,
"total_output_tokens": 14114690,
"total_tokens": 473702579,
"total_turns": 9978,
"avg_agent_execution_time": 732.5017,
"avg_input_tokens": 904700.5689,
"avg_output_tokens": 27784.8228,
"avg_total_tokens": 932485.3917,
"avg_turns": 19.6417,
"per_run_input_tokens": 114896972.25,
"per_run_output_tokens": 3528672.5,
"per_run_cost": 250.471116,
"actual_model_name": "gpt-5.2-2025-12-11",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.5748,
"std": 0.0111
},
"pass@4": 0.6693,
"pass^4": 0.4488
},
"grok-4": {
"total_tasks": 127,
"total_agent_execution_time": 162467.43107247353,
"total_input_tokens": 321821722,
"total_output_tokens": 4277197,
"total_tokens": 326544746,
"total_turns": 8254,
"avg_agent_execution_time": 319.8178,
"avg_input_tokens": 633507.3268,
"avg_output_tokens": 8419.6791,
"avg_total_tokens": 642804.6181,
"avg_turns": 16.248,
"per_run_input_tokens": 80455430.5,
"per_run_output_tokens": 1069299.25,
"per_run_cost": 257.40578,
"actual_model_name": "grok-4-0709",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.3169,
"std": 0.0291
},
"pass@4": 0.4488,
"pass^4": 0.1811
},
"claude-sonnet-4-high": {
"total_tasks": 127,
"total_agent_execution_time": 94260.38231873512,
"total_input_tokens": 576975325,
"total_output_tokens": 2558316,
"total_tokens": 579533641,
"total_turns": 9884,
"avg_agent_execution_time": 185.5519,
"avg_input_tokens": 1135778.1988,
"avg_output_tokens": 5036.0551,
"avg_total_tokens": 1140814.2539,
"avg_turns": 19.4567,
"per_run_input_tokens": 144243831.25,
"per_run_output_tokens": 639579,
"per_run_cost": 442.325179,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2835,
"std": 0.0236
},
"pass@4": 0.4094,
"pass^4": 0.1811
},
"gemini-3-pro-low": {
"total_tasks": 127,
"total_agent_execution_time": 106390.38415026665,
"total_input_tokens": 487353055,
"total_output_tokens": 4490563,
"total_tokens": 491843618,
"total_turns": 7933,
"avg_agent_execution_time": 209.4299,
"avg_input_tokens": 959356.4075,
"avg_output_tokens": 8839.6909,
"avg_total_tokens": 968196.0984,
"avg_turns": 15.6161,
"per_run_input_tokens": 121838263.75,
"per_run_output_tokens": 1122640.75,
"per_run_cost": 257.148216,
"actual_model_name": "gemini-3-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.5079,
"std": 0.0205
},
"pass@4": 0.6772,
"pass^4": 0.3071
},
"gpt-oss-120b": {
"total_tasks": 127,
"total_agent_execution_time": 13901.512543916702,
"total_input_tokens": 32768093,
"total_output_tokens": 697653,
"total_tokens": 33465746,
"total_turns": 2741,
"avg_agent_execution_time": 27.3652,
"avg_input_tokens": 64504.1201,
"avg_output_tokens": 1373.3327,
"avg_total_tokens": 65877.4528,
"avg_turns": 5.3957,
"per_run_input_tokens": 8192023.25,
"per_run_output_tokens": 174413.25,
"per_run_cost": 0.638661,
"actual_model_name": "gpt-oss-120b",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0472,
"std": 0.0096
},
"pass@4": 0.1339,
"pass^4": 0
},
"grok-4-fast": {
"total_tasks": 127,
"total_agent_execution_time": 55840.35218334198,
"total_input_tokens": 346217495,
"total_output_tokens": 4517025,
"total_tokens": 350734520,
"total_turns": 7873,
"avg_agent_execution_time": 109.922,
"avg_input_tokens": 681530.502,
"avg_output_tokens": 8891.7815,
"avg_total_tokens": 690422.2835,
"avg_turns": 15.498,
"per_run_input_tokens": 86554373.75,
"per_run_output_tokens": 1129256.25,
"per_run_cost": 17.875503,
"actual_model_name": "grok-4-fast",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2402,
"std": 0.0307
},
"pass@4": 0.3858,
"pass^4": 0.126
},
"o4-mini": {
"total_tasks": 127,
"total_agent_execution_time": 164245.99844169617,
"total_input_tokens": 199695107,
"total_output_tokens": 7910153,
"total_tokens": 207605260,
"total_turns": 7418,
"avg_agent_execution_time": 323.3189,
"avg_input_tokens": 393100.6043,
"avg_output_tokens": 15571.1673,
"avg_total_tokens": 408671.7717,
"avg_turns": 14.6024,
"per_run_input_tokens": 49923776.75,
"per_run_output_tokens": 1977538.25,
"per_run_cost": 63.617323,
"actual_model_name": "o4-mini-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1732,
"std": 0.023
},
"pass@4": 0.315,
"pass^4": 0.063
},
"gpt-4-1-mini": {
"total_tasks": 127,
"total_agent_execution_time": 43548.9137635231,
"total_input_tokens": 595733455,
"total_output_tokens": 964338,
"total_tokens": 596697793,
"total_turns": 8326,
"avg_agent_execution_time": 85.7262,
"avg_input_tokens": 1172703.6516,
"avg_output_tokens": 1898.3031,
"avg_total_tokens": 1174601.9547,
"avg_turns": 16.3898,
"per_run_input_tokens": 148933363.75,
"per_run_output_tokens": 241084.5,
"per_run_cost": 59.959081,
"actual_model_name": "gpt-4.1-mini-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.0394,
"std": 0.0096
},
"pass@4": 0.0709,
"pass^4": 0.0157
}
},
"filesystem": {
"deepseek-v3-1-terminus-thinking": {
"total_tasks": 30,
"total_agent_execution_time": 92813.15895938873,
"total_input_tokens": 12850849,
"total_output_tokens": 1807599,
"total_tokens": 14658448,
"total_turns": 1181,
"avg_agent_execution_time": 773.443,
"avg_input_tokens": 107090.4083,
"avg_output_tokens": 15063.325,
"avg_total_tokens": 122153.7333,
"avg_turns": 9.8417,
"per_run_input_tokens": 3212712.25,
"per_run_output_tokens": 451899.75,
"per_run_cost": 1.03167,
"actual_model_name": "deepseek-v3.1-terminus-thinking",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2417,
"std": 0.101
},
"pass@4": 0.4333,
"pass^4": 0.0667
},
"deepseek-chat": {
"total_tasks": 30,
"total_agent_execution_time": 33809.433807849884,
"total_input_tokens": 50559981,
"total_output_tokens": 406171,
"total_tokens": 50966152,
"total_turns": 2860,
"avg_agent_execution_time": 281.7453,
"avg_input_tokens": 421333.175,
"avg_output_tokens": 3384.7583,
"avg_total_tokens": 424717.9333,
"avg_turns": 23.8333,
"per_run_input_tokens": 12639995.25,
"per_run_output_tokens": 101542.75,
"per_run_cost": 7.248989,
"actual_model_name": "deepseek-v3.1-non-think",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1583,
"std": 0.0144
},
"pass@4": 0.2667,
"pass^4": 0.0667
},
"gpt-4-1": {
"total_tasks": 30,
"total_agent_execution_time": 4918.759680747986,
"total_input_tokens": 17274493,
"total_output_tokens": 217587,
"total_tokens": 17492080,
"total_turns": 1114,
"avg_agent_execution_time": 40.9897,
"avg_input_tokens": 143954.1083,
"avg_output_tokens": 1813.225,
"avg_total_tokens": 145767.3333,
"avg_turns": 9.2833,
"per_run_input_tokens": 4318623.25,
"per_run_output_tokens": 54396.75,
"per_run_cost": 9.07242,
"actual_model_name": "gpt-4.1-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.125,
"std": 0.0144
},
"pass@4": 0.2,
"pass^4": 0.0333
},
"claude-opus-4-1": {
"total_tasks": 30,
"total_agent_execution_time": 8033.295060634613,
"total_input_tokens": 8164979,
"total_output_tokens": 130994,
"total_tokens": 8295973,
"total_turns": 491,
"avg_agent_execution_time": 267.7765,
"avg_input_tokens": 272165.9667,
"avg_output_tokens": 4366.4667,
"avg_total_tokens": 276532.4333,
"avg_turns": 16.3667,
"per_run_input_tokens": 8164979,
"per_run_output_tokens": 130994,
"per_run_cost": 132.299235,
"actual_model_name": "claude-opus-4-1-20250805",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.3333,
"std": 0
}
},
"glm-4-5": {
"total_tasks": 30,
"total_agent_execution_time": 15624.920702934265,
"total_input_tokens": 23273606,
"total_output_tokens": 470863,
"total_tokens": 23744469,
"total_turns": 1967,
"avg_agent_execution_time": 130.2077,
"avg_input_tokens": 193946.7167,
"avg_output_tokens": 3923.8583,
"avg_total_tokens": 197870.575,
"avg_turns": 16.3917,
"per_run_input_tokens": 5818401.5,
"per_run_output_tokens": 117715.75,
"per_run_cost": 2.075457,
"actual_model_name": "glm-4.5",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.075,
"std": 0.0144
},
"pass@4": 0.1333,
"pass^4": 0.0333
},
"o3": {
"total_tasks": 30,
"total_agent_execution_time": 33353.133106946945,
"total_input_tokens": 82757119,
"total_output_tokens": 2135261,
"total_tokens": 84892380,
"total_turns": 3455,
"avg_agent_execution_time": 277.9428,
"avg_input_tokens": 689642.6583,
"avg_output_tokens": 17793.8417,
"avg_total_tokens": 707436.5,
"avg_turns": 28.7917,
"per_run_input_tokens": 20689279.75,
"per_run_output_tokens": 533815.25,
"per_run_cost": 45.649082,
"actual_model_name": "o3-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.3583,
"std": 0.0276
},
"pass@4": 0.5,
"pass^4": 0.2667
},
"claude-sonnet-4-5": {
"total_tasks": 30,
"total_agent_execution_time": 11409.54042005539,
"total_input_tokens": 32245290,
"total_output_tokens": 569223,
"total_tokens": 32814513,
"total_turns": 1328,
"avg_agent_execution_time": 95.0795,
"avg_input_tokens": 268710.75,
"avg_output_tokens": 4743.525,
"avg_total_tokens": 273454.275,
"avg_turns": 11.0667,
"per_run_input_tokens": 8061322.5,
"per_run_output_tokens": 142305.75,
"per_run_cost": 26.318554,
"actual_model_name": "claude-sonnet-4-5-20250929",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.325,
"std": 0.0493
},
"pass@4": 0.4333,
"pass^4": 0.1333
},
"claude-sonnet-4-low": {
"total_tasks": 30,
"total_agent_execution_time": 21124.522393226624,
"total_input_tokens": 124738934,
"total_output_tokens": 646826,
"total_tokens": 125385760,
"total_turns": 2247,
"avg_agent_execution_time": 176.0377,
"avg_input_tokens": 1039491.1167,
"avg_output_tokens": 5390.2167,
"avg_total_tokens": 1044881.3333,
"avg_turns": 18.725,
"per_run_input_tokens": 31184733.5,
"per_run_output_tokens": 161706.5,
"per_run_cost": 95.979798,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2333,
"std": 0.0471
},
"pass@4": 0.3667,
"pass^4": 0.1333
},
"gemini-2-5-pro": {
"total_tasks": 30,
"total_agent_execution_time": 15134.337651252747,
"total_input_tokens": 25795900,
"total_output_tokens": 929436,
"total_tokens": 26725336,
"total_turns": 1722,
"avg_agent_execution_time": 126.1195,
"avg_input_tokens": 214965.8333,
"avg_output_tokens": 7745.3,
"avg_total_tokens": 222711.1333,
"avg_turns": 14.35,
"per_run_input_tokens": 6448975,
"per_run_output_tokens": 232359,
"per_run_cost": 19.607823,
"actual_model_name": "gemini-2.5-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2417,
"std": 0.0363
},
"pass@4": 0.4333,
"pass^4": 0.1
},
"qwen-3-coder-plus": {
"total_tasks": 30,
"total_agent_execution_time": 18861.470023155212,
"total_input_tokens": 116688838,
"total_output_tokens": 497514,
"total_tokens": 117186352,
"total_turns": 3387,
"avg_agent_execution_time": 157.1789,
"avg_input_tokens": 972406.9833,
"avg_output_tokens": 4145.95,
"avg_total_tokens": 976552.9333,
"avg_turns": 28.225,
"per_run_input_tokens": 29172209.5,
"per_run_output_tokens": 124378.5,
"per_run_cost": 5.933945,
"actual_model_name": "qwen3-coder-480b-a35b-instruct",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1333,
"std": 0.0667
},
"pass@4": 0.2667,
"pass^4": 0.0333
},
"gpt-5-mini-high": {
"total_tasks": 30,
"total_agent_execution_time": 34673.4765253067,
"total_input_tokens": 64240780,
"total_output_tokens": 3091722,
"total_tokens": 67332502,
"total_turns": 2210,
"avg_agent_execution_time": 288.9456,
"avg_input_tokens": 535339.8333,
"avg_output_tokens": 25764.35,
"avg_total_tokens": 561104.1833,
"avg_turns": 18.4167,
"per_run_input_tokens": 16060195,
"per_run_output_tokens": 772930.5,
"per_run_cost": 5.56091,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.35,
"std": 0.0764
},
"pass@4": 0.4667,
"pass^4": 0.2333
},
"gpt-5-nano-low": {
"total_tasks": 30,
"total_agent_execution_time": 15497.93159866333,
"total_input_tokens": 77157814,
"total_output_tokens": 2280787,
"total_tokens": 79438601,
"total_turns": 2755,
"avg_agent_execution_time": 129.1494,
"avg_input_tokens": 642981.7833,
"avg_output_tokens": 19006.5583,
"avg_total_tokens": 661988.3417,
"avg_turns": 22.9583,
"per_run_input_tokens": 19289453.5,
"per_run_output_tokens": 570196.75,
"per_run_cost": 1.192551,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.125,
"std": 0.0363
},
"pass@4": 0.3,
"pass^4": 0.0333
},
"gpt-5-medium": {
"total_tasks": 30,
"total_agent_execution_time": 37567.695655584335,
"total_input_tokens": 25915698,
"total_output_tokens": 2084979,
"total_tokens": 28000677,
"total_turns": 1207,
"avg_agent_execution_time": 313.0641,
"avg_input_tokens": 215964.15,
"avg_output_tokens": 17374.825,
"avg_total_tokens": 233338.975,
"avg_turns": 10.0583,
"per_run_input_tokens": 6478924.5,
"per_run_output_tokens": 521244.75,
"per_run_cost": 13.311103,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.575,
"std": 0.0363
},
"pass@4": 0.7667,
"pass^4": 0.3667
},
"claude-sonnet-4": {
"total_tasks": 30,
"total_agent_execution_time": 23177.59760403633,
"total_input_tokens": 36265545,
"total_output_tokens": 479856,
"total_tokens": 36745401,
"total_turns": 1922,
"avg_agent_execution_time": 193.1466,
"avg_input_tokens": 302212.875,
"avg_output_tokens": 3998.8,
"avg_total_tokens": 306211.675,
"avg_turns": 16.0167,
"per_run_input_tokens": 9066386.25,
"per_run_output_tokens": 119964,
"per_run_cost": 28.998619,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.275,
"std": 0.0276
},
"pass@4": 0.5,
"pass^4": 0.0667
},
"gpt-5-low": {
"total_tasks": 30,
"total_agent_execution_time": 33070.53626990318,
"total_input_tokens": 34189054,
"total_output_tokens": 1920255,
"total_tokens": 36109309,
"total_turns": 1295,
"avg_agent_execution_time": 275.5878,
"avg_input_tokens": 284908.7833,
"avg_output_tokens": 16002.125,
"avg_total_tokens": 300910.9083,
"avg_turns": 10.7917,
"per_run_input_tokens": 8547263.5,
"per_run_output_tokens": 480063.75,
"per_run_cost": 15.484717,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.5417,
"std": 0.0682
},
"pass@4": 0.7333,
"pass^4": 0.3333
},
"gpt-5-mini-medium": {
"total_tasks": 30,
"total_agent_execution_time": 20912.210697889328,
"total_input_tokens": 47800838,
"total_output_tokens": 1509938,
"total_tokens": 49310776,
"total_turns": 1781,
"avg_agent_execution_time": 174.2684,
"avg_input_tokens": 398340.3167,
"avg_output_tokens": 12582.8167,
"avg_total_tokens": 410923.1333,
"avg_turns": 14.8417,
"per_run_input_tokens": 11950209.5,
"per_run_output_tokens": 377484.5,
"per_run_cost": 3.742521,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.3333,
"std": 0.0624
},
"pass@4": 0.5333,
"pass^4": 0.1
},
"gpt-4-1-nano": {
"total_tasks": 30,
"total_agent_execution_time": 3375.341311454773,
"total_input_tokens": 14038124,
"total_output_tokens": 158107,
"total_tokens": 14196231,
"total_turns": 1460,
"avg_agent_execution_time": 28.1278,
"avg_input_tokens": 116984.3667,
"avg_output_tokens": 1317.5583,
"avg_total_tokens": 118301.925,
"avg_turns": 12.1667,
"per_run_input_tokens": 3509531,
"per_run_output_tokens": 39526.75,
"per_run_cost": 0.366764,
"actual_model_name": "gpt-4.1-nano-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0
},
"grok-code-fast-1": {
"total_tasks": 30,
"total_agent_execution_time": 9060.72327876091,
"total_input_tokens": 33168391,
"total_output_tokens": 283156,
"total_tokens": 34075918,
"total_turns": 1966,
"avg_agent_execution_time": 75.506,
"avg_input_tokens": 276403.2583,
"avg_output_tokens": 2359.6333,
"avg_total_tokens": 283965.9833,
"avg_turns": 16.3833,
"per_run_input_tokens": 8292097.75,
"per_run_output_tokens": 70789,
"per_run_cost": 1.764603,
"actual_model_name": "grok-code-fast-1",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2333,
"std": 0.0745
},
"pass@4": 0.4,
"pass^4": 0.1
},
"deepseek-v3-2-thinking": {
"total_tasks": 30,
"total_agent_execution_time": 49623.92368507385,
"total_input_tokens": 79279555,
"total_output_tokens": 1229598,
"total_tokens": 80509153,
"total_turns": 3126,
"avg_agent_execution_time": 413.5327,
"avg_input_tokens": 660662.9583,
"avg_output_tokens": 10246.65,
"avg_total_tokens": 670909.6083,
"avg_turns": 26.05,
"per_run_input_tokens": 19819888.75,
"per_run_output_tokens": 307399.5,
"per_run_cost": 5.47433,
"actual_model_name": "deepseek-v3.2-reasoner",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.3667,
"std": 0.0408
},
"pass@4": 0.4667,
"pass^4": 0.2333
},
"gemini-2-5-flash": {
"total_tasks": 30,
"total_agent_execution_time": 7456.854316711426,
"total_input_tokens": 8117152,
"total_output_tokens": 908300,
"total_tokens": 9025452,
"total_turns": 780,
"avg_agent_execution_time": 62.1405,
"avg_input_tokens": 67642.9333,
"avg_output_tokens": 7569.1667,
"avg_total_tokens": 75212.1,
"avg_turns": 6.5,
"per_run_input_tokens": 2029288,
"per_run_output_tokens": 227075,
"per_run_cost": 1.176474,
"actual_model_name": "gemini-2.5-flash",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0833,
"std": 0.0167
},
"pass@4": 0.1333,
"pass^4": 0.0667
},
"gpt-5-nano-high": {
"total_tasks": 30,
"total_agent_execution_time": 24791.744941711426,
"total_input_tokens": 84154565,
"total_output_tokens": 4132124,
"total_tokens": 88286689,
"total_turns": 3235,
"avg_agent_execution_time": 206.5979,
"avg_input_tokens": 701288.0417,
"avg_output_tokens": 34434.3667,
"avg_total_tokens": 735722.4083,
"avg_turns": 26.9583,
"per_run_input_tokens": 21038641.25,
"per_run_output_tokens": 1033031,
"per_run_cost": 1.465144,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0583,
"std": 0.0493
},
"pass@4": 0.1667,
"pass^4": 0
},
"gpt-5-mini-low": {
"total_tasks": 30,
"total_agent_execution_time": 8122.387328386307,
"total_input_tokens": 14746988,
"total_output_tokens": 420637,
"total_tokens": 15167625,
"total_turns": 965,
"avg_agent_execution_time": 67.6866,
"avg_input_tokens": 122891.5667,
"avg_output_tokens": 3505.3083,
"avg_total_tokens": 126396.875,
"avg_turns": 8.0417,
"per_run_input_tokens": 3686747,
"per_run_output_tokens": 105159.25,
"per_run_cost": 1.132005,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.125,
"std": 0.0493
},
"pass@4": 0.3333,
"pass^4": 0.0333
},
"deepseek-v3-1-terminus": {
"total_tasks": 30,
"total_agent_execution_time": 21577.44312930107,
"total_input_tokens": 23277761,
"total_output_tokens": 288361,
"total_tokens": 23566122,
"total_turns": 1554,
"avg_agent_execution_time": 179.812,
"avg_input_tokens": 193981.3417,
"avg_output_tokens": 2403.0083,
"avg_total_tokens": 196384.35,
"avg_turns": 12.95,
"per_run_input_tokens": 5819440.25,
"per_run_output_tokens": 72090.25,
"per_run_cost": 1.279034,
"actual_model_name": "deepseek-v3.1-terminus",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1083,
"std": 0.0493
},
"pass@4": 0.2,
"pass^4": 0.0333
},
"qwen-3-max": {
"total_tasks": 30,
"total_agent_execution_time": 16070.112683296204,
"total_input_tokens": 46747185,
"total_output_tokens": 344106,
"total_tokens": 47091291,
"total_turns": 2313,
"avg_agent_execution_time": 133.9176,
"avg_input_tokens": 389559.875,
"avg_output_tokens": 2867.55,
"avg_total_tokens": 392427.425,
"avg_turns": 19.275,
"per_run_input_tokens": 11686796.25,
"per_run_output_tokens": 86026.5,
"per_run_cost": 14.540315,
"actual_model_name": "qwen3-max-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1083,
"std": 0.0144
},
"pass@4": 0.1333,
"pass^4": 0.1
},
"claude-opus-4-5-high": {
"total_tasks": 30,
"total_agent_execution_time": 10544.068356275558,
"total_input_tokens": 25957802,
"total_output_tokens": 604686,
"total_tokens": 26562488,
"total_turns": 1195,
"avg_agent_execution_time": 87.8672,
"avg_input_tokens": 216315.0167,
"avg_output_tokens": 5039.05,
"avg_total_tokens": 221354.0667,
"avg_turns": 9.9583,
"per_run_input_tokens": 6489450.5,
"per_run_output_tokens": 151171.5,
"per_run_cost": 36.22654,
"actual_model_name": "claude-opus-4-5-20251101",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.4,
"std": 0.0236
},
"pass@4": 0.5,
"pass^4": 0.3333
},
"deepseek-v3-2-chat": {
"total_tasks": 30,
"total_agent_execution_time": 37346.05409193039,
"total_input_tokens": 73324533,
"total_output_tokens": 794151,
"total_tokens": 74118684,
"total_turns": 3057,
"avg_agent_execution_time": 311.2171,
"avg_input_tokens": 611037.775,
"avg_output_tokens": 6617.925,
"avg_total_tokens": 617655.7,
"avg_turns": 25.475,
"per_run_input_tokens": 18331133.25,
"per_run_output_tokens": 198537.75,
"per_run_cost": 5.028821,
"actual_model_name": "deepseek-v3.2-chat",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.25,
"std": 0.0553
},
"pass@4": 0.4333,
"pass^4": 0.0667
},
"kimi-k2-0711": {
"total_tasks": 30,
"total_agent_execution_time": 26702.112154960632,
"total_input_tokens": 44744753,
"total_output_tokens": 439086,
"total_tokens": 45183839,
"total_turns": 2452,
"avg_agent_execution_time": 222.5176,
"avg_input_tokens": 372872.9417,
"avg_output_tokens": 3659.05,
"avg_total_tokens": 376531.9917,
"avg_turns": 20.4333,
"per_run_input_tokens": 11186188.25,
"per_run_output_tokens": 109771.5,
"per_run_cost": 6.986142,
"actual_model_name": "kimi-k2-0711-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2,
"std": 0.0236
},
"pass@4": 0.3,
"pass^4": 0.1333
},
"gemini-3-pro-high": {
"total_tasks": 30,
"total_agent_execution_time": 27516.157546043396,
"total_input_tokens": 75333649,
"total_output_tokens": 1338171,
"total_tokens": 76671820,
"total_turns": 2037,
"avg_agent_execution_time": 229.3013,
"avg_input_tokens": 627780.4083,
"avg_output_tokens": 11151.425,
"avg_total_tokens": 638931.8333,
"avg_turns": 16.975,
"per_run_input_tokens": 18833412.25,
"per_run_output_tokens": 334542.75,
"per_run_cost": 41.681337,
"actual_model_name": "gemini-3-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.5917,
"std": 0.0363
},
"pass@4": 0.8,
"pass^4": 0.4
},
"gpt-5-high": {
"total_tasks": 30,
"total_agent_execution_time": 99365.79502940178,
"total_input_tokens": 31734063,
"total_output_tokens": 3026203,
"total_tokens": 34760266,
"total_turns": 1206,
"avg_agent_execution_time": 828.0483,
"avg_input_tokens": 264450.525,
"avg_output_tokens": 25218.3583,
"avg_total_tokens": 289668.8833,
"avg_turns": 10.05,
"per_run_input_tokens": 7933515.75,
"per_run_output_tokens": 756550.75,
"per_run_cost": 17.482402,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.525,
"std": 0.0363
},
"pass@4": 0.7,
"pass^4": 0.3667
},
"kimi-k2-0905": {
"total_tasks": 30,
"total_agent_execution_time": 45185.467045784,
"total_input_tokens": 83615017,
"total_output_tokens": 536182,
"total_tokens": 84151199,
"total_turns": 3152,
"avg_agent_execution_time": 376.5456,
"avg_input_tokens": 696791.8083,
"avg_output_tokens": 4468.1833,
"avg_total_tokens": 701259.9917,
"avg_turns": 26.2667,
"per_run_input_tokens": 20903754.25,
"per_run_output_tokens": 134045.5,
"per_run_cost": 12.877366,
"actual_model_name": "kimi-k2-0905-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1417,
"std": 0.0144
},
"pass@4": 0.2333,
"pass^4": 0.0667
},
"gpt-5-nano-medium": {
"total_tasks": 30,
"total_agent_execution_time": 15545.906517505646,
"total_input_tokens": 55529317,
"total_output_tokens": 2343137,
"total_tokens": 57872454,
"total_turns": 2490,
"avg_agent_execution_time": 129.5492,
"avg_input_tokens": 462744.3083,
"avg_output_tokens": 19526.1417,
"avg_total_tokens": 482270.45,
"avg_turns": 20.75,
"per_run_input_tokens": 13882329.25,
"per_run_output_tokens": 585784.25,
"per_run_cost": 0.92843,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0667,
"std": 0.0527
},
"pass@4": 0.1667,
"pass^4": 0
},
"gpt-5-2-high": {
"total_tasks": 30,
"total_agent_execution_time": 59995.830164432526,
"total_input_tokens": 71557884,
"total_output_tokens": 2696703,
"total_tokens": 74254587,
"total_turns": 1967,
"avg_agent_execution_time": 499.9653,
"avg_input_tokens": 596315.7,
"avg_output_tokens": 22472.525,
"avg_total_tokens": 618788.225,
"avg_turns": 16.3917,
"per_run_input_tokens": 17889471,
"per_run_output_tokens": 674175.75,
"per_run_cost": 40.745035,
"actual_model_name": "gpt-5.2-2025-12-11",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.6083,
"std": 0.0276
},
"pass@4": 0.7,
"pass^4": 0.4667
},
"grok-4": {
"total_tasks": 30,
"total_agent_execution_time": 30793.946169614792,
"total_input_tokens": 29679677,
"total_output_tokens": 1284375,
"total_tokens": 30964052,
"total_turns": 1296,
"avg_agent_execution_time": 256.6162,
"avg_input_tokens": 247330.6417,
"avg_output_tokens": 10703.125,
"avg_total_tokens": 258033.7667,
"avg_turns": 10.8,
"per_run_input_tokens": 7419919.25,
"per_run_output_tokens": 321093.75,
"per_run_cost": 27.076164,
"actual_model_name": "grok-4-0709",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.5083,
"std": 0.064
},
"pass@4": 0.7333,
"pass^4": 0.2667
},
"claude-sonnet-4-high": {
"total_tasks": 30,
"total_agent_execution_time": 17209.377204418182,
"total_input_tokens": 106723679,
"total_output_tokens": 670070,
"total_tokens": 107393749,
"total_turns": 2248,
"avg_agent_execution_time": 143.4115,
"avg_input_tokens": 889363.9917,
"avg_output_tokens": 5583.9167,
"avg_total_tokens": 894947.9083,
"avg_turns": 18.7333,
"per_run_input_tokens": 26680919.75,
"per_run_output_tokens": 167517.5,
"per_run_cost": 82.555522,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2333,
"std": 0.0408
},
"pass@4": 0.3667,
"pass^4": 0.1
},
"gemini-3-pro-low": {
"total_tasks": 30,
"total_agent_execution_time": 25109.726654052734,
"total_input_tokens": 70740136,
"total_output_tokens": 1311110,
"total_tokens": 72051246,
"total_turns": 2005,
"avg_agent_execution_time": 209.2477,
"avg_input_tokens": 589501.1333,
"avg_output_tokens": 10925.9167,
"avg_total_tokens": 600427.05,
"avg_turns": 16.7083,
"per_run_input_tokens": 17685034,
"per_run_output_tokens": 327777.5,
"per_run_cost": 39.303398,
"actual_model_name": "gemini-3-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.5667,
"std": 0.0408
},
"pass@4": 0.8,
"pass^4": 0.3333
},
"gpt-oss-120b": {
"total_tasks": 30,
"total_agent_execution_time": 2195.19043135643,
"total_input_tokens": 2370211,
"total_output_tokens": 129682,
"total_tokens": 2499893,
"total_turns": 554,
"avg_agent_execution_time": 18.2933,
"avg_input_tokens": 19751.7583,
"avg_output_tokens": 1080.6833,
"avg_total_tokens": 20832.4417,
"avg_turns": 4.6167,
"per_run_input_tokens": 592552.75,
"per_run_output_tokens": 32420.5,
"per_run_cost": 0.051742,
"actual_model_name": "gpt-oss-120b",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0583,
"std": 0.0433
},
"pass@4": 0.1667,
"pass^4": 0
},
"grok-4-fast": {
"total_tasks": 30,
"total_agent_execution_time": 8968.955112218857,
"total_input_tokens": 63006836,
"total_output_tokens": 788211,
"total_tokens": 63795047,
"total_turns": 1539,
"avg_agent_execution_time": 74.7413,
"avg_input_tokens": 525056.9667,
"avg_output_tokens": 6568.425,
"avg_total_tokens": 531625.3917,
"avg_turns": 12.825,
"per_run_input_tokens": 15751709,
"per_run_output_tokens": 197052.75,
"per_run_cost": 3.248868,
"actual_model_name": "grok-4-fast",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2917,
"std": 0.0722
},
"pass@4": 0.5333,
"pass^4": 0.1667
},
"o4-mini": {
"total_tasks": 30,
"total_agent_execution_time": 31632.499047517776,
"total_input_tokens": 35200644,
"total_output_tokens": 1906496,
"total_tokens": 37107140,
"total_turns": 2506,
"avg_agent_execution_time": 263.6042,
"avg_input_tokens": 293338.7,
"avg_output_tokens": 15887.4667,
"avg_total_tokens": 309226.1667,
"avg_turns": 20.8833,
"per_run_input_tokens": 8800161,
"per_run_output_tokens": 476624,
"per_run_cost": 11.777323,
"actual_model_name": "o4-mini-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.25,
"std": 0.0289
},
"pass@4": 0.3667,
"pass^4": 0.1333
},
"gpt-4-1-mini": {
"total_tasks": 30,
"total_agent_execution_time": 6175.697661399841,
"total_input_tokens": 23538288,
"total_output_tokens": 195484,
"total_tokens": 23733772,
"total_turns": 1860,
"avg_agent_execution_time": 51.4641,
"avg_input_tokens": 196152.4,
"avg_output_tokens": 1629.0333,
"avg_total_tokens": 197781.4333,
"avg_turns": 15.5,
"per_run_input_tokens": 5884572,
"per_run_output_tokens": 48871,
"per_run_cost": 2.432022,
"actual_model_name": "gpt-4.1-mini-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.0333,
"std": 0
},
"pass@4": 0.0333,
"pass^4": 0.0333
}
},
"github": {
"deepseek-v3-1-terminus-thinking": {
"total_tasks": 23,
"total_agent_execution_time": 64614.89937853813,
"total_input_tokens": 34232908,
"total_output_tokens": 1397648,
"total_tokens": 35630556,
"total_turns": 798,
"avg_agent_execution_time": 702.3359,
"avg_input_tokens": 372096.8261,
"avg_output_tokens": 15191.8261,
"avg_total_tokens": 387288.6522,
"avg_turns": 8.6739,
"per_run_input_tokens": 8558227,
"per_run_output_tokens": 349412,
"per_run_cost": 2.073263,
"actual_model_name": "deepseek-v3.1-terminus-thinking",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1087,
"std": 0.0486
},
"pass@4": 0.2174,
"pass^4": 0
},
"deepseek-chat": {
"total_tasks": 23,
"total_agent_execution_time": 17843.541527748108,
"total_input_tokens": 33336977,
"total_output_tokens": 206404,
"total_tokens": 33543381,
"total_turns": 870,
"avg_agent_execution_time": 193.9515,
"avg_input_tokens": 362358.4457,
"avg_output_tokens": 2243.5217,
"avg_total_tokens": 364601.9674,
"avg_turns": 9.4565,
"per_run_input_tokens": 8334244.25,
"per_run_output_tokens": 51601,
"per_run_cost": 4.753866,
"actual_model_name": "deepseek-v3.1-non-think",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.0978,
"std": 0.0188
},
"pass@4": 0.1304,
"pass^4": 0.087
},
"gpt-4-1": {
"total_tasks": 23,
"total_agent_execution_time": 8299.499776601791,
"total_input_tokens": 41020732,
"total_output_tokens": 228765,
"total_tokens": 41249497,
"total_turns": 915,
"avg_agent_execution_time": 90.212,
"avg_input_tokens": 445877.5217,
"avg_output_tokens": 2486.5761,
"avg_total_tokens": 448364.0978,
"avg_turns": 9.9457,
"per_run_input_tokens": 10255183,
"per_run_output_tokens": 57191.25,
"per_run_cost": 20.967896,
"actual_model_name": "gpt-4.1-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.0761,
"std": 0.0188
},
"pass@4": 0.087,
"pass^4": 0.0435
},
"claude-opus-4-1": {
"total_tasks": 23,
"total_agent_execution_time": 8974.225908994675,
"total_input_tokens": 14274491,
"total_output_tokens": 134233,
"total_tokens": 14408724,
"total_turns": 248,
"avg_agent_execution_time": 390.1837,
"avg_input_tokens": 620630.0435,
"avg_output_tokens": 5836.2174,
"avg_total_tokens": 626466.2609,
"avg_turns": 10.7826,
"per_run_input_tokens": 14274491,
"per_run_output_tokens": 134233,
"per_run_cost": 224.18484,
"actual_model_name": "claude-opus-4-1-20250805",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2174,
"std": 0
}
},
"glm-4-5": {
"total_tasks": 23,
"total_agent_execution_time": 14126.248623132706,
"total_input_tokens": 44343752,
"total_output_tokens": 336055,
"total_tokens": 44679807,
"total_turns": 1097,
"avg_agent_execution_time": 153.5462,
"avg_input_tokens": 481997.3043,
"avg_output_tokens": 3652.7717,
"avg_total_tokens": 485650.0761,
"avg_turns": 11.9239,
"per_run_input_tokens": 11085938,
"per_run_output_tokens": 84013.75,
"per_run_cost": 3.769258,
"actual_model_name": "glm-4.5",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2283,
"std": 0.0643
},
"pass@4": 0.3478,
"pass^4": 0.1304
},
"o3": {
"total_tasks": 23,
"total_agent_execution_time": 11774.837658882141,
"total_input_tokens": 41508489,
"total_output_tokens": 327279,
"total_tokens": 41835768,
"total_turns": 846,
"avg_agent_execution_time": 127.9874,
"avg_input_tokens": 451179.2283,
"avg_output_tokens": 3557.3804,
"avg_total_tokens": 454736.6087,
"avg_turns": 9.1957,
"per_run_input_tokens": 10377122.25,
"per_run_output_tokens": 81819.75,
"per_run_cost": 21.408803,
"actual_model_name": "o3-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1413,
"std": 0.0361
},
"pass@4": 0.2174,
"pass^4": 0.0435
},
"claude-sonnet-4-5": {
"total_tasks": 23,
"total_agent_execution_time": 20242.79474067688,
"total_input_tokens": 87531964,
"total_output_tokens": 827074,
"total_tokens": 88359038,
"total_turns": 1495,
"avg_agent_execution_time": 220.0304,
"avg_input_tokens": 951434.3913,
"avg_output_tokens": 8989.9348,
"avg_total_tokens": 960424.3261,
"avg_turns": 16.25,
"per_run_input_tokens": 21882991,
"per_run_output_tokens": 206768.5,
"per_run_cost": 68.750501,
"actual_model_name": "claude-sonnet-4-5-20250929",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2935,
"std": 0.089
},
"pass@4": 0.4348,
"pass^4": 0.1739
},
"claude-sonnet-4-low": {
"total_tasks": 23,
"total_agent_execution_time": 15970.095800638199,
"total_input_tokens": 75027894,
"total_output_tokens": 441257,
"total_tokens": 75469151,
"total_turns": 1417,
"avg_agent_execution_time": 173.588,
"avg_input_tokens": 815520.587,
"avg_output_tokens": 4796.2717,
"avg_total_tokens": 820316.8587,
"avg_turns": 15.4022,
"per_run_input_tokens": 18756973.5,
"per_run_output_tokens": 110314.25,
"per_run_cost": 57.925634,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.25,
"std": 0.0361
},
"pass@4": 0.3478,
"pass^4": 0.2174
},
"gemini-2-5-pro": {
"total_tasks": 23,
"total_agent_execution_time": 8400.268857002258,
"total_input_tokens": 15955471,
"total_output_tokens": 529482,
"total_tokens": 16484953,
"total_turns": 501,
"avg_agent_execution_time": 91.3073,
"avg_input_tokens": 173429.0326,
"avg_output_tokens": 5755.2391,
"avg_total_tokens": 179184.2717,
"avg_turns": 5.4457,
"per_run_input_tokens": 3988867.75,
"per_run_output_tokens": 132370.5,
"per_run_cost": 11.957727,
"actual_model_name": "gemini-2.5-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0978,
"std": 0.0188
},
"pass@4": 0.2174,
"pass^4": 0
},
"qwen-3-coder-plus": {
"total_tasks": 23,
"total_agent_execution_time": 29474.414255142212,
"total_input_tokens": 182817170,
"total_output_tokens": 308729,
"total_tokens": 183125899,
"total_turns": 1759,
"avg_agent_execution_time": 320.3741,
"avg_input_tokens": 1987143.1522,
"avg_output_tokens": 3355.75,
"avg_total_tokens": 1990498.9022,
"avg_turns": 19.1196,
"per_run_input_tokens": 45704292.5,
"per_run_output_tokens": 77182.25,
"per_run_cost": 9.202604,
"actual_model_name": "qwen3-coder-480b-a35b-instruct",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1957,
"std": 0.0652
},
"pass@4": 0.3478,
"pass^4": 0.1304
},
"gpt-5-mini-high": {
"total_tasks": 23,
"total_agent_execution_time": 31101.44998884201,
"total_input_tokens": 91999132,
"total_output_tokens": 2339485,
"total_tokens": 94338617,
"total_turns": 1666,
"avg_agent_execution_time": 338.0592,
"avg_input_tokens": 999990.5652,
"avg_output_tokens": 25429.1848,
"avg_total_tokens": 1025419.75,
"avg_turns": 18.1087,
"per_run_input_tokens": 22999783,
"per_run_output_tokens": 584871.25,
"per_run_cost": 6.919688,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1957,
"std": 0.0217
},
"pass@4": 0.3478,
"pass^4": 0.087
},
"gpt-5-nano-low": {
"total_tasks": 23,
"total_agent_execution_time": 5304.219719648361,
"total_input_tokens": 20887893,
"total_output_tokens": 305082,
"total_tokens": 21192975,
"total_turns": 849,
"avg_agent_execution_time": 57.6546,
"avg_input_tokens": 227042.3152,
"avg_output_tokens": 3316.1087,
"avg_total_tokens": 230358.4239,
"avg_turns": 9.2283,
"per_run_input_tokens": 5221973.25,
"per_run_output_tokens": 76270.5,
"per_run_cost": 0.291607,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0
},
"gpt-5-medium": {
"total_tasks": 23,
"total_agent_execution_time": 42011.101893901825,
"total_input_tokens": 60694917,
"total_output_tokens": 1892024,
"total_tokens": 62586941,
"total_turns": 1318,
"avg_agent_execution_time": 456.6424,
"avg_input_tokens": 659727.3587,
"avg_output_tokens": 20565.4783,
"avg_total_tokens": 680292.837,
"avg_turns": 14.3261,
"per_run_input_tokens": 15173729.25,
"per_run_output_tokens": 473006,
"per_run_cost": 23.697222,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.4783,
"std": 0.0813
},
"pass@4": 0.6522,
"pass^4": 0.1739
},
"claude-sonnet-4": {
"total_tasks": 23,
"total_agent_execution_time": 18077.839505195618,
"total_input_tokens": 64106636,
"total_output_tokens": 408552,
"total_tokens": 64515188,
"total_turns": 1027,
"avg_agent_execution_time": 196.4983,
"avg_input_tokens": 696811.2609,
"avg_output_tokens": 4440.7826,
"avg_total_tokens": 701252.0435,
"avg_turns": 11.163,
"per_run_input_tokens": 16026659,
"per_run_output_tokens": 102138,
"per_run_cost": 49.612047,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.163,
"std": 0.0565
},
"pass@4": 0.3043,
"pass^4": 0.087
},
"gpt-5-low": {
"total_tasks": 23,
"total_agent_execution_time": 24684.910781621933,
"total_input_tokens": 62630950,
"total_output_tokens": 1095043,
"total_tokens": 63725993,
"total_turns": 930,
"avg_agent_execution_time": 268.3142,
"avg_input_tokens": 680771.1957,
"avg_output_tokens": 11902.6413,
"avg_total_tokens": 692673.837,
"avg_turns": 10.1087,
"per_run_input_tokens": 15657737.5,
"per_run_output_tokens": 273760.75,
"per_run_cost": 22.309779,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2717,
"std": 0.0188
},
"pass@4": 0.3913,
"pass^4": 0.1739
},
"gpt-5-mini-medium": {
"total_tasks": 23,
"total_agent_execution_time": 11734.965313196182,
"total_input_tokens": 56551055,
"total_output_tokens": 709791,
"total_tokens": 57260846,
"total_turns": 1281,
"avg_agent_execution_time": 127.554,
"avg_input_tokens": 614685.3804,
"avg_output_tokens": 7715.1196,
"avg_total_tokens": 622400.5,
"avg_turns": 13.9239,
"per_run_input_tokens": 14137763.75,
"per_run_output_tokens": 177447.75,
"per_run_cost": 3.889336,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1848,
"std": 0.0776
},
"pass@4": 0.3478,
"pass^4": 0.0435
},
"gpt-4-1-nano": {
"total_tasks": 23,
"total_agent_execution_time": 4764.2498252391815,
"total_input_tokens": 28783356,
"total_output_tokens": 238484,
"total_tokens": 29021840,
"total_turns": 853,
"avg_agent_execution_time": 51.7853,
"avg_input_tokens": 312862.5652,
"avg_output_tokens": 2592.2174,
"avg_total_tokens": 315454.7826,
"avg_turns": 9.2717,
"per_run_input_tokens": 7195839,
"per_run_output_tokens": 59621,
"per_run_cost": 0.743432,
"actual_model_name": "gpt-4.1-nano-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0
},
"grok-code-fast-1": {
"total_tasks": 23,
"total_agent_execution_time": 16823.461669445038,
"total_input_tokens": 69130173,
"total_output_tokens": 598154,
"total_tokens": 69728327,
"total_turns": 1642,
"avg_agent_execution_time": 182.8637,
"avg_input_tokens": 751414.9239,
"avg_output_tokens": 6501.6739,
"avg_total_tokens": 757916.5978,
"avg_turns": 17.8478,
"per_run_input_tokens": 17282543.25,
"per_run_output_tokens": 149538.5,
"per_run_cost": 3.680816,
"actual_model_name": "grok-code-fast-1",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.087,
"std": 0.0532
},
"pass@4": 0.1739,
"pass^4": 0.0435
},
"deepseek-v3-2-thinking": {
"total_tasks": 23,
"total_agent_execution_time": 37883.39964008331,
"total_input_tokens": 90321478,
"total_output_tokens": 843337,
"total_tokens": 91164815,
"total_turns": 2189,
"avg_agent_execution_time": 411.7761,
"avg_input_tokens": 981755.1957,
"avg_output_tokens": 9166.7065,
"avg_total_tokens": 990921.9022,
"avg_turns": 23.7935,
"per_run_input_tokens": 22580369.5,
"per_run_output_tokens": 210834.25,
"per_run_cost": 6.181033,
"actual_model_name": "deepseek-v3.2-reasoner",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2065,
"std": 0.0188
},
"pass@4": 0.4348,
"pass^4": 0
},
"gemini-2-5-flash": {
"total_tasks": 23,
"total_agent_execution_time": 18992.60165667534,
"total_input_tokens": 101847435,
"total_output_tokens": 1168664,
"total_tokens": 103016099,
"total_turns": 962,
"avg_agent_execution_time": 206.4413,
"avg_input_tokens": 1107037.337,
"avg_output_tokens": 12702.8696,
"avg_total_tokens": 1119740.2065,
"avg_turns": 10.4565,
"per_run_input_tokens": 25461858.75,
"per_run_output_tokens": 292166,
"per_run_cost": 8.368973,
"actual_model_name": "gemini-2.5-flash",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1522,
"std": 0.0217
},
"pass@4": 0.2174,
"pass^4": 0.087
},
"gpt-5-nano-high": {
"total_tasks": 23,
"total_agent_execution_time": 29166.180356264114,
"total_input_tokens": 112335319,
"total_output_tokens": 4512798,
"total_tokens": 116848117,
"total_turns": 1680,
"avg_agent_execution_time": 317.0237,
"avg_input_tokens": 1221036.0761,
"avg_output_tokens": 49052.1522,
"avg_total_tokens": 1270088.2283,
"avg_turns": 18.2609,
"per_run_input_tokens": 28083829.75,
"per_run_output_tokens": 1128199.5,
"per_run_cost": 1.855471,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.087,
"std": 0.0307
},
"pass@4": 0.1739,
"pass^4": 0
},
"gpt-5-mini-low": {
"total_tasks": 23,
"total_agent_execution_time": 5807.832483053207,
"total_input_tokens": 40078426,
"total_output_tokens": 179124,
"total_tokens": 40257550,
"total_turns": 868,
"avg_agent_execution_time": 63.1286,
"avg_input_tokens": 435635.0652,
"avg_output_tokens": 1947,
"avg_total_tokens": 437582.0652,
"avg_turns": 9.4348,
"per_run_input_tokens": 10019606.5,
"per_run_output_tokens": 44781,
"per_run_cost": 2.594464,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.087,
"std": 0.0307
},
"pass@4": 0.1304,
"pass^4": 0
},
"deepseek-v3-1-terminus": {
"total_tasks": 23,
"total_agent_execution_time": 21333.615911722183,
"total_input_tokens": 41122717,
"total_output_tokens": 183454,
"total_tokens": 41306171,
"total_turns": 835,
"avg_agent_execution_time": 231.8871,
"avg_input_tokens": 446986.0543,
"avg_output_tokens": 1994.0652,
"avg_total_tokens": 448980.1196,
"avg_turns": 9.0761,
"per_run_input_tokens": 10280679.25,
"per_run_output_tokens": 45863.5,
"per_run_cost": 2.195175,
"actual_model_name": "deepseek-v3.1-terminus",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0543,
"std": 0.0713
},
"pass@4": 0.1739,
"pass^4": 0
},
"qwen-3-max": {
"total_tasks": 23,
"total_agent_execution_time": 16698.327759742737,
"total_input_tokens": 124028099,
"total_output_tokens": 234790,
"total_tokens": 124262889,
"total_turns": 2456,
"avg_agent_execution_time": 181.5036,
"avg_input_tokens": 1348131.5109,
"avg_output_tokens": 2552.0652,
"avg_total_tokens": 1350683.5761,
"avg_turns": 26.6957,
"per_run_input_tokens": 31007024.75,
"per_run_output_tokens": 58697.5,
"per_run_cost": 37.560615,
"actual_model_name": "qwen3-max-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1413,
"std": 0.0361
},
"pass@4": 0.1739,
"pass^4": 0.0435
},
"claude-opus-4-5-high": {
"total_tasks": 23,
"total_agent_execution_time": 51087.96004152298,
"total_input_tokens": 98581979,
"total_output_tokens": 3222083,
"total_tokens": 101804062,
"total_turns": 1869,
"avg_agent_execution_time": 555.3039,
"avg_input_tokens": 1071543.25,
"avg_output_tokens": 35022.6413,
"avg_total_tokens": 1106565.8913,
"avg_turns": 20.3152,
"per_run_input_tokens": 24645494.75,
"per_run_output_tokens": 805520.75,
"per_run_cost": 143.365493,
"actual_model_name": "claude-opus-4-5-20251101",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.3696,
"std": 0.0721
},
"pass@4": 0.5217,
"pass^4": 0.2174
},
"deepseek-v3-2-chat": {
"total_tasks": 23,
"total_agent_execution_time": 26869.662212133408,
"total_input_tokens": 80055610,
"total_output_tokens": 511059,
"total_tokens": 80566669,
"total_turns": 1924,
"avg_agent_execution_time": 292.0615,
"avg_input_tokens": 870169.6739,
"avg_output_tokens": 5554.9891,
"avg_total_tokens": 875724.663,
"avg_turns": 20.913,
"per_run_input_tokens": 20013902.5,
"per_run_output_tokens": 127764.75,
"per_run_cost": 5.45486,
"actual_model_name": "deepseek-v3.2-chat",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1739,
"std": 0.0532
},
"pass@4": 0.3913,
"pass^4": 0
},
"kimi-k2-0711": {
"total_tasks": 23,
"total_agent_execution_time": 18861.51504421234,
"total_input_tokens": 34909027,
"total_output_tokens": 209917,
"total_tokens": 35118944,
"total_turns": 817,
"avg_agent_execution_time": 205.0165,
"avg_input_tokens": 379445.9457,
"avg_output_tokens": 2281.7065,
"avg_total_tokens": 381727.6522,
"avg_turns": 8.8804,
"per_run_input_tokens": 8727256.75,
"per_run_output_tokens": 52479.25,
"per_run_cost": 5.367552,
"actual_model_name": "kimi-k2-0711-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1087,
"std": 0.0217
},
"pass@4": 0.1304,
"pass^4": 0.0435
},
"gemini-3-pro-high": {
"total_tasks": 23,
"total_agent_execution_time": 19272.317316293716,
"total_input_tokens": 57777857,
"total_output_tokens": 689951,
"total_tokens": 58467808,
"total_turns": 1242,
"avg_agent_execution_time": 209.4817,
"avg_input_tokens": 628020.1848,
"avg_output_tokens": 7499.4674,
"avg_total_tokens": 635519.6522,
"avg_turns": 13.5,
"per_run_input_tokens": 14444464.25,
"per_run_output_tokens": 172487.75,
"per_run_cost": 30.958782,
"actual_model_name": "gemini-3-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.4674,
"std": 0.0643
},
"pass@4": 0.6522,
"pass^4": 0.3043
},
"gpt-5-high": {
"total_tasks": 23,
"total_agent_execution_time": 99681.28649711609,
"total_input_tokens": 83755768,
"total_output_tokens": 3423829,
"total_tokens": 87179597,
"total_turns": 1533,
"avg_agent_execution_time": 1083.4922,
"avg_input_tokens": 910388.7826,
"avg_output_tokens": 37215.5326,
"avg_total_tokens": 947604.3152,
"avg_turns": 16.663,
"per_run_input_tokens": 20938942,
"per_run_output_tokens": 855957.25,
"per_run_cost": 34.73325,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.5,
"std": 0.0217
},
"pass@4": 0.6087,
"pass^4": 0.3478
},
"kimi-k2-0905": {
"total_tasks": 23,
"total_agent_execution_time": 71829.9812734127,
"total_input_tokens": 91599488,
"total_output_tokens": 758701,
"total_tokens": 92358189,
"total_turns": 2179,
"avg_agent_execution_time": 780.7607,
"avg_input_tokens": 995646.6087,
"avg_output_tokens": 8246.75,
"avg_total_tokens": 1003893.3587,
"avg_turns": 23.6848,
"per_run_input_tokens": 22899872,
"per_run_output_tokens": 189675.25,
"per_run_cost": 14.214111,
"actual_model_name": "kimi-k2-0905-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.163,
"std": 0.0188
},
"pass@4": 0.2609,
"pass^4": 0.087
},
"gpt-5-nano-medium": {
"total_tasks": 23,
"total_agent_execution_time": 17233.95824933052,
"total_input_tokens": 69148890,
"total_output_tokens": 2462505,
"total_tokens": 71611395,
"total_turns": 1394,
"avg_agent_execution_time": 187.3256,
"avg_input_tokens": 751618.3696,
"avg_output_tokens": 26766.3587,
"avg_total_tokens": 778384.7283,
"avg_turns": 15.1522,
"per_run_input_tokens": 17287222.5,
"per_run_output_tokens": 615626.25,
"per_run_cost": 1.110612,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0761,
"std": 0.0188
},
"pass@4": 0.1304,
"pass^4": 0
},
"gpt-5-2-high": {
"total_tasks": 23,
"total_agent_execution_time": 65731.91835689545,
"total_input_tokens": 79194968,
"total_output_tokens": 2482375,
"total_tokens": 81677343,
"total_turns": 1680,
"avg_agent_execution_time": 714.4774,
"avg_input_tokens": 860814.8696,
"avg_output_tokens": 26982.337,
"avg_total_tokens": 887797.2065,
"avg_turns": 18.2609,
"per_run_input_tokens": 19798742,
"per_run_output_tokens": 620593.75,
"per_run_cost": 43.336111,
"actual_model_name": "gpt-5.2-2025-12-11",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.4783,
"std": 0.0532
},
"pass@4": 0.6087,
"pass^4": 0.3478
},
"grok-4": {
"total_tasks": 23,
"total_agent_execution_time": 24744.31317639351,
"total_input_tokens": 74013900,
"total_output_tokens": 177230,
"total_tokens": 74636957,
"total_turns": 1194,
"avg_agent_execution_time": 268.9599,
"avg_input_tokens": 804498.913,
"avg_output_tokens": 1926.413,
"avg_total_tokens": 811271.2717,
"avg_turns": 12.9783,
"per_run_input_tokens": 18503475,
"per_run_output_tokens": 44307.5,
"per_run_cost": 56.175038,
"actual_model_name": "grok-4-0709",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1413,
"std": 0.0361
},
"pass@4": 0.2174,
"pass^4": 0.087
},
"claude-sonnet-4-high": {
"total_tasks": 23,
"total_agent_execution_time": 15643.119762897491,
"total_input_tokens": 75490051,
"total_output_tokens": 471509,
"total_tokens": 75961560,
"total_turns": 1415,
"avg_agent_execution_time": 170.0339,
"avg_input_tokens": 820544.0326,
"avg_output_tokens": 5125.0978,
"avg_total_tokens": 825669.1304,
"avg_turns": 15.3804,
"per_run_input_tokens": 18872512.75,
"per_run_output_tokens": 117877.25,
"per_run_cost": 58.385697,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2826,
"std": 0.0217
},
"pass@4": 0.4348,
"pass^4": 0.2174
},
"gemini-3-pro-low": {
"total_tasks": 23,
"total_agent_execution_time": 18905.529168605804,
"total_input_tokens": 60400565,
"total_output_tokens": 784896,
"total_tokens": 61185461,
"total_turns": 1277,
"avg_agent_execution_time": 205.4949,
"avg_input_tokens": 656527.8804,
"avg_output_tokens": 8531.4783,
"avg_total_tokens": 665059.3587,
"avg_turns": 13.8804,
"per_run_input_tokens": 15100141.25,
"per_run_output_tokens": 196224,
"per_run_cost": 32.554971,
"actual_model_name": "gemini-3-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.4565,
"std": 0.1171
},
"pass@4": 0.6522,
"pass^4": 0.2609
},
"gpt-oss-120b": {
"total_tasks": 23,
"total_agent_execution_time": 2209.1974110603333,
"total_input_tokens": 7019815,
"total_output_tokens": 129561,
"total_tokens": 7149376,
"total_turns": 425,
"avg_agent_execution_time": 24.013,
"avg_input_tokens": 76302.337,
"avg_output_tokens": 1408.2717,
"avg_total_tokens": 77710.6087,
"avg_turns": 4.6196,
"per_run_input_tokens": 1754953.75,
"per_run_output_tokens": 32390.25,
"per_run_cost": 0.135426,
"actual_model_name": "gpt-oss-120b",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0435,
"std": 0.0307
},
"pass@4": 0.087,
"pass^4": 0
},
"grok-4-fast": {
"total_tasks": 23,
"total_agent_execution_time": 13161.74501156807,
"total_input_tokens": 52963591,
"total_output_tokens": 992874,
"total_tokens": 53956465,
"total_turns": 1047,
"avg_agent_execution_time": 143.0624,
"avg_input_tokens": 575691.2065,
"avg_output_tokens": 10792.1087,
"avg_total_tokens": 586483.3152,
"avg_turns": 11.3804,
"per_run_input_tokens": 13240897.75,
"per_run_output_tokens": 248218.5,
"per_run_cost": 2.772289,
"actual_model_name": "grok-4-fast",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1304,
"std": 0.0307
},
"pass@4": 0.2174,
"pass^4": 0
},
"o4-mini": {
"total_tasks": 23,
"total_agent_execution_time": 22887.657103300095,
"total_input_tokens": 46932136,
"total_output_tokens": 803804,
"total_tokens": 47735940,
"total_turns": 1005,
"avg_agent_execution_time": 248.7789,
"avg_input_tokens": 510131.913,
"avg_output_tokens": 8737,
"avg_total_tokens": 518868.913,
"avg_turns": 10.9239,
"per_run_input_tokens": 11733034,
"per_run_output_tokens": 200951,
"per_run_cost": 13.790522,
"actual_model_name": "o4-mini-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1413,
"std": 0.0643
},
"pass@4": 0.2609,
"pass^4": 0.0435
},
"gpt-4-1-mini": {
"total_tasks": 23,
"total_agent_execution_time": 7644.50294303894,
"total_input_tokens": 42936149,
"total_output_tokens": 139240,
"total_tokens": 43075389,
"total_turns": 1104,
"avg_agent_execution_time": 83.0924,
"avg_input_tokens": 466697.2717,
"avg_output_tokens": 1513.4783,
"avg_total_tokens": 468210.75,
"avg_turns": 12,
"per_run_input_tokens": 10734037.25,
"per_run_output_tokens": 34810,
"per_run_cost": 4.349311,
"actual_model_name": "gpt-4.1-mini-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.0652,
"std": 0.0652
},
"pass@4": 0.1739,
"pass^4": 0
}
},
"notion": {
"deepseek-v3-1-terminus-thinking": {
"total_tasks": 28,
"total_agent_execution_time": 102976.53085327148,
"total_input_tokens": 49897002,
"total_output_tokens": 2182951,
"total_tokens": 52079953,
"total_turns": 1664,
"avg_agent_execution_time": 919.4333,
"avg_input_tokens": 445508.9464,
"avg_output_tokens": 19490.6339,
"avg_total_tokens": 464999.5804,
"avg_turns": 14.8571,
"per_run_input_tokens": 12474250.5,
"per_run_output_tokens": 545737.75,
"per_run_cost": 3.050725,
"actual_model_name": "deepseek-v3.1-terminus-thinking",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2232,
"std": 0.0296
},
"pass@4": 0.3929,
"pass^4": 0.0357
},
"deepseek-chat": {
"total_tasks": 28,
"total_agent_execution_time": 26761.268040657043,
"total_input_tokens": 56375571,
"total_output_tokens": 246037,
"total_tokens": 56621608,
"total_turns": 2009,
"avg_agent_execution_time": 238.9399,
"avg_input_tokens": 503353.3125,
"avg_output_tokens": 2196.7589,
"avg_total_tokens": 505550.0714,
"avg_turns": 17.9375,
"per_run_input_tokens": 14093892.75,
"per_run_output_tokens": 61509.25,
"per_run_cost": 7.995915,
"actual_model_name": "deepseek-v3.1-non-think",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.125,
"std": 0.0309
},
"pass@4": 0.2857,
"pass^4": 0
},
"gpt-4-1": {
"total_tasks": 28,
"total_agent_execution_time": 5461.270235300064,
"total_input_tokens": 15181887,
"total_output_tokens": 153733,
"total_tokens": 15335620,
"total_turns": 961,
"avg_agent_execution_time": 48.7613,
"avg_input_tokens": 135552.5625,
"avg_output_tokens": 1372.6161,
"avg_total_tokens": 136925.1786,
"avg_turns": 8.5804,
"per_run_input_tokens": 3795471.75,
"per_run_output_tokens": 38433.25,
"per_run_cost": 7.898409,
"actual_model_name": "gpt-4.1-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.0625,
"std": 0.0155
},
"pass@4": 0.1429,
"pass^4": 0
},
"claude-opus-4-1": {
"total_tasks": 28,
"total_agent_execution_time": 8238.807499885559,
"total_input_tokens": 17865599,
"total_output_tokens": 110144,
"total_tokens": 17975743,
"total_turns": 477,
"avg_agent_execution_time": 294.2431,
"avg_input_tokens": 638057.1071,
"avg_output_tokens": 3933.7143,
"avg_total_tokens": 641990.8214,
"avg_turns": 17.0357,
"per_run_input_tokens": 17865599,
"per_run_output_tokens": 110144,
"per_run_cost": 276.244785,
"actual_model_name": "claude-opus-4-1-20250805",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.3571,
"std": 0
}
},
"glm-4-5": {
"total_tasks": 28,
"total_agent_execution_time": 24882.952006816864,
"total_input_tokens": 70108796,
"total_output_tokens": 564356,
"total_tokens": 70673152,
"total_turns": 2481,
"avg_agent_execution_time": 222.1692,
"avg_input_tokens": 625971.3929,
"avg_output_tokens": 5038.8929,
"avg_total_tokens": 631010.2857,
"avg_turns": 22.1518,
"per_run_input_tokens": 17527199,
"per_run_output_tokens": 141089,
"per_run_cost": 5.970213,
"actual_model_name": "glm-4.5",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2143,
"std": 0.0253
},
"pass@4": 0.3214,
"pass^4": 0.1071
},
"o3": {
"total_tasks": 28,
"total_agent_execution_time": 19195.927572011948,
"total_input_tokens": 25191983,
"total_output_tokens": 1060378,
"total_tokens": 26252361,
"total_turns": 1537,
"avg_agent_execution_time": 171.3922,
"avg_input_tokens": 224928.4196,
"avg_output_tokens": 9467.6607,
"avg_total_tokens": 234396.0804,
"avg_turns": 13.7232,
"per_run_input_tokens": 6297995.75,
"per_run_output_tokens": 265094.5,
"per_run_cost": 14.716748,
"actual_model_name": "o3-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2411,
"std": 0.0389
},
"pass@4": 0.4643,
"pass^4": 0.0714
},
"claude-sonnet-4-5": {
"total_tasks": 28,
"total_agent_execution_time": 18484.7717359066,
"total_input_tokens": 75120675,
"total_output_tokens": 545767,
"total_tokens": 75666442,
"total_turns": 1615,
"avg_agent_execution_time": 165.0426,
"avg_input_tokens": 670720.3125,
"avg_output_tokens": 4872.9196,
"avg_total_tokens": 675593.2321,
"avg_turns": 14.4196,
"per_run_input_tokens": 18780168.75,
"per_run_output_tokens": 136441.75,
"per_run_cost": 58.387133,
"actual_model_name": "claude-sonnet-4-5-20250929",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.25,
"std": 0.0758
},
"pass@4": 0.4643,
"pass^4": 0.0357
},
"claude-sonnet-4-low": {
"total_tasks": 28,
"total_agent_execution_time": 24645.00325536728,
"total_input_tokens": 138082856,
"total_output_tokens": 479628,
"total_tokens": 138562484,
"total_turns": 2207,
"avg_agent_execution_time": 220.0447,
"avg_input_tokens": 1232882.6429,
"avg_output_tokens": 4282.3929,
"avg_total_tokens": 1237165.0357,
"avg_turns": 19.7054,
"per_run_input_tokens": 34520714,
"per_run_output_tokens": 119907,
"per_run_cost": 105.360747,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2232,
"std": 0.0296
},
"pass@4": 0.3214,
"pass^4": 0.0714
},
"gemini-2-5-pro": {
"total_tasks": 28,
"total_agent_execution_time": 11470.97702550888,
"total_input_tokens": 23847150,
"total_output_tokens": 798708,
"total_tokens": 24645858,
"total_turns": 797,
"avg_agent_execution_time": 102.4194,
"avg_input_tokens": 212920.9821,
"avg_output_tokens": 7131.3214,
"avg_total_tokens": 220052.3036,
"avg_turns": 7.1161,
"per_run_input_tokens": 5961787.5,
"per_run_output_tokens": 199677,
"per_run_cost": 17.899624,
"actual_model_name": "gemini-2.5-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0446,
"std": 0.0296
},
"pass@4": 0.0714,
"pass^4": 0
},
"qwen-3-coder-plus": {
"total_tasks": 28,
"total_agent_execution_time": 11163.093998670578,
"total_input_tokens": 89233452,
"total_output_tokens": 308390,
"total_tokens": 89541842,
"total_turns": 2360,
"avg_agent_execution_time": 99.6705,
"avg_input_tokens": 796727.25,
"avg_output_tokens": 2753.4821,
"avg_total_tokens": 799480.7321,
"avg_turns": 21.0714,
"per_run_input_tokens": 22308363,
"per_run_output_tokens": 77097.5,
"per_run_cost": 4.523351,
"actual_model_name": "qwen3-coder-480b-a35b-instruct",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1964,
"std": 0.0644
},
"pass@4": 0.3929,
"pass^4": 0.0714
},
"gpt-5-mini-high": {
"total_tasks": 28,
"total_agent_execution_time": 58533.93042230606,
"total_input_tokens": 130525675,
"total_output_tokens": 4916907,
"total_tokens": 135442582,
"total_turns": 2118,
"avg_agent_execution_time": 522.6244,
"avg_input_tokens": 1165407.8125,
"avg_output_tokens": 43900.9554,
"avg_total_tokens": 1209308.7679,
"avg_turns": 18.9107,
"per_run_input_tokens": 32631418.75,
"per_run_output_tokens": 1229226.75,
"per_run_cost": 10.616308,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2054,
"std": 0.1297
},
"pass@4": 0.4286,
"pass^4": 0.0714
},
"gpt-5-nano-low": {
"total_tasks": 28,
"total_agent_execution_time": 7620.024949550629,
"total_input_tokens": 5304447,
"total_output_tokens": 1217553,
"total_tokens": 6522000,
"total_turns": 510,
"avg_agent_execution_time": 68.0359,
"avg_input_tokens": 47361.1339,
"avg_output_tokens": 10871.0089,
"avg_total_tokens": 58232.1429,
"avg_turns": 4.5536,
"per_run_input_tokens": 1326111.75,
"per_run_output_tokens": 304388.25,
"per_run_cost": 0.188061,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0
},
"gpt-5-medium": {
"total_tasks": 28,
"total_agent_execution_time": 74123.09180688858,
"total_input_tokens": 42004983,
"total_output_tokens": 3541977,
"total_tokens": 45546960,
"total_turns": 1449,
"avg_agent_execution_time": 661.8133,
"avg_input_tokens": 375044.4911,
"avg_output_tokens": 31624.7946,
"avg_total_tokens": 406669.2857,
"avg_turns": 12.9375,
"per_run_input_tokens": 10501245.75,
"per_run_output_tokens": 885494.25,
"per_run_cost": 21.9815,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.4196,
"std": 0.0296
},
"pass@4": 0.5,
"pass^4": 0.3214
},
"claude-sonnet-4": {
"total_tasks": 28,
"total_agent_execution_time": 21642.930950641632,
"total_input_tokens": 72423701,
"total_output_tokens": 474776,
"total_tokens": 72898477,
"total_turns": 2208,
"avg_agent_execution_time": 193.2405,
"avg_input_tokens": 646640.1875,
"avg_output_tokens": 4239.0714,
"avg_total_tokens": 650879.2589,
"avg_turns": 19.7143,
"per_run_input_tokens": 18105925.25,
"per_run_output_tokens": 118694,
"per_run_cost": 56.098186,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2143,
"std": 0.0505
},
"pass@4": 0.3929,
"pass^4": 0.0714
},
"gpt-5-low": {
"total_tasks": 28,
"total_agent_execution_time": 62677.94048953056,
"total_input_tokens": 47638046,
"total_output_tokens": 3351023,
"total_tokens": 50989069,
"total_turns": 1514,
"avg_agent_execution_time": 559.6245,
"avg_input_tokens": 425339.6964,
"avg_output_tokens": 29919.8482,
"avg_total_tokens": 455259.5446,
"avg_turns": 13.5179,
"per_run_input_tokens": 11909511.5,
"per_run_output_tokens": 837755.75,
"per_run_cost": 23.264447,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.3661,
"std": 0.0773
},
"pass@4": 0.5357,
"pass^4": 0.1429
},
"gpt-5-mini-medium": {
"total_tasks": 28,
"total_agent_execution_time": 18890.31924724579,
"total_input_tokens": 78970140,
"total_output_tokens": 1382436,
"total_tokens": 80352576,
"total_turns": 1635,
"avg_agent_execution_time": 168.6636,
"avg_input_tokens": 705090.5357,
"avg_output_tokens": 12343.1786,
"avg_total_tokens": 717433.7143,
"avg_turns": 14.5982,
"per_run_input_tokens": 19742535,
"per_run_output_tokens": 345609,
"per_run_cost": 5.626852,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1607,
"std": 0.0592
},
"pass@4": 0.3214,
"pass^4": 0.0357
},
"gpt-4-1-nano": {
"total_tasks": 28,
"total_agent_execution_time": 3603.7674894332886,
"total_input_tokens": 10458841,
"total_output_tokens": 156924,
"total_tokens": 10615765,
"total_turns": 1080,
"avg_agent_execution_time": 32.1765,
"avg_input_tokens": 93382.5089,
"avg_output_tokens": 1401.1071,
"avg_total_tokens": 94783.6161,
"avg_turns": 9.6429,
"per_run_input_tokens": 2614710.25,
"per_run_output_tokens": 39231,
"per_run_cost": 0.277163,
"actual_model_name": "gpt-4.1-nano-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0
},
"grok-code-fast-1": {
"total_tasks": 28,
"total_agent_execution_time": 37424.073429346085,
"total_input_tokens": 62886800,
"total_output_tokens": 812899,
"total_tokens": 64110194,
"total_turns": 2270,
"avg_agent_execution_time": 334.1435,
"avg_input_tokens": 561489.2857,
"avg_output_tokens": 7258.0268,
"avg_total_tokens": 572412.4464,
"avg_turns": 20.2679,
"per_run_input_tokens": 15721700,
"per_run_output_tokens": 203224.75,
"per_run_cost": 3.449177,
"actual_model_name": "grok-code-fast-1",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0268,
"std": 0.0155
},
"pass@4": 0.0357,
"pass^4": 0
},
"deepseek-v3-2-thinking": {
"total_tasks": 28,
"total_agent_execution_time": 45707.08622217178,
"total_input_tokens": 95232729,
"total_output_tokens": 1015978,
"total_tokens": 96248707,
"total_turns": 2525,
"avg_agent_execution_time": 408.099,
"avg_input_tokens": 850292.2232,
"avg_output_tokens": 9071.2321,
"avg_total_tokens": 859363.4554,
"avg_turns": 22.5446,
"per_run_input_tokens": 23808182.25,
"per_run_output_tokens": 253994.5,
"per_run_cost": 6.529807,
"actual_model_name": "deepseek-v3.2-reasoner",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.4554,
"std": 0.0464
},
"pass@4": 0.5714,
"pass^4": 0.3214
},
"gemini-2-5-flash": {
"total_tasks": 28,
"total_agent_execution_time": 6247.662793159485,
"total_input_tokens": 22511623,
"total_output_tokens": 737164,
"total_tokens": 23248787,
"total_turns": 684,
"avg_agent_execution_time": 55.7827,
"avg_input_tokens": 200996.6339,
"avg_output_tokens": 6581.8214,
"avg_total_tokens": 207578.4554,
"avg_turns": 6.1071,
"per_run_input_tokens": 5627905.75,
"per_run_output_tokens": 184291,
"per_run_cost": 2.149099,
"actual_model_name": "gemini-2.5-flash",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0625,
"std": 0.0464
},
"pass@4": 0.2143,
"pass^4": 0
},
"gpt-5-nano-high": {
"total_tasks": 28,
"total_agent_execution_time": 44915.13718724251,
"total_input_tokens": 45483673,
"total_output_tokens": 8632428,
"total_tokens": 54116101,
"total_turns": 1113,
"avg_agent_execution_time": 401.028,
"avg_input_tokens": 406104.2232,
"avg_output_tokens": 77075.25,
"avg_total_tokens": 483179.4732,
"avg_turns": 9.9375,
"per_run_input_tokens": 11370918.25,
"per_run_output_tokens": 2158107,
"per_run_cost": 1.431789,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0089,
"std": 0.0155
},
"pass@4": 0.0357,
"pass^4": 0
},
"gpt-5-mini-low": {
"total_tasks": 28,
"total_agent_execution_time": 7015.923507452011,
"total_input_tokens": 13704613,
"total_output_tokens": 430549,
"total_tokens": 14135162,
"total_turns": 847,
"avg_agent_execution_time": 62.6422,
"avg_input_tokens": 122362.6161,
"avg_output_tokens": 3844.1875,
"avg_total_tokens": 126206.8036,
"avg_turns": 7.5625,
"per_run_input_tokens": 3426153.25,
"per_run_output_tokens": 107637.25,
"per_run_cost": 1.071813,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0536,
"std": 0.0536
},
"pass@4": 0.1429,
"pass^4": 0
},
"deepseek-v3-1-terminus": {
"total_tasks": 28,
"total_agent_execution_time": 28280.00402736664,
"total_input_tokens": 80265896,
"total_output_tokens": 312143,
"total_tokens": 80578039,
"total_turns": 2153,
"avg_agent_execution_time": 252.5,
"avg_input_tokens": 716659.7857,
"avg_output_tokens": 2786.9911,
"avg_total_tokens": 719446.7768,
"avg_turns": 19.2232,
"per_run_input_tokens": 20066474,
"per_run_output_tokens": 78035.75,
"per_run_cost": 4.275608,
"actual_model_name": "deepseek-v3.1-terminus",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2232,
"std": 0.0813
},
"pass@4": 0.3929,
"pass^4": 0.0714
},
"qwen-3-max": {
"total_tasks": 28,
"total_agent_execution_time": 20556.586127758026,
"total_input_tokens": 109079387,
"total_output_tokens": 409681,
"total_tokens": 109489068,
"total_turns": 2976,
"avg_agent_execution_time": 183.5409,
"avg_input_tokens": 973923.0982,
"avg_output_tokens": 3657.8661,
"avg_total_tokens": 977580.9643,
"avg_turns": 26.5714,
"per_run_input_tokens": 27269846.75,
"per_run_output_tokens": 102420.25,
"per_run_cost": 33.338338,
"actual_model_name": "qwen3-max-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1696,
"std": 0.0464
},
"pass@4": 0.25,
"pass^4": 0.0357
},
"claude-opus-4-5-high": {
"total_tasks": 28,
"total_agent_execution_time": 16656.720289707184,
"total_input_tokens": 60501543,
"total_output_tokens": 552678,
"total_tokens": 61054221,
"total_turns": 1444,
"avg_agent_execution_time": 148.7207,
"avg_input_tokens": 540192.3482,
"avg_output_tokens": 4934.625,
"avg_total_tokens": 545126.9732,
"avg_turns": 12.8929,
"per_run_input_tokens": 15125385.75,
"per_run_output_tokens": 138169.5,
"per_run_cost": 79.081166,
"actual_model_name": "claude-opus-4-5-20251101",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.3839,
"std": 0.0389
},
"pass@4": 0.4643,
"pass^4": 0.3214
},
"deepseek-v3-2-chat": {
"total_tasks": 28,
"total_agent_execution_time": 29662.059406280518,
"total_input_tokens": 89033075,
"total_output_tokens": 528433,
"total_tokens": 89561508,
"total_turns": 2554,
"avg_agent_execution_time": 264.8398,
"avg_input_tokens": 794938.1696,
"avg_output_tokens": 4718.1518,
"avg_total_tokens": 799656.3214,
"avg_turns": 22.8036,
"per_run_input_tokens": 22258268.75,
"per_run_output_tokens": 132108.25,
"per_run_cost": 6.062576,
"actual_model_name": "deepseek-v3.2-chat",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.3214,
"std": 0.0565
},
"pass@4": 0.4643,
"pass^4": 0.1429
},
"kimi-k2-0711": {
"total_tasks": 28,
"total_agent_execution_time": 20542.138272047043,
"total_input_tokens": 61248998,
"total_output_tokens": 298118,
"total_tokens": 61547116,
"total_turns": 2205,
"avg_agent_execution_time": 183.4119,
"avg_input_tokens": 546866.0536,
"avg_output_tokens": 2661.7679,
"avg_total_tokens": 549527.8214,
"avg_turns": 19.6875,
"per_run_input_tokens": 15312249.5,
"per_run_output_tokens": 74529.5,
"per_run_cost": 9.373673,
"actual_model_name": "kimi-k2-0711-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1429,
"std": 0.0437
},
"pass@4": 0.3214,
"pass^4": 0.0714
},
"gemini-3-pro-high": {
"total_tasks": 28,
"total_agent_execution_time": 17766.135670661926,
"total_input_tokens": 33007447,
"total_output_tokens": 868069,
"total_tokens": 33875516,
"total_turns": 1123,
"avg_agent_execution_time": 158.6262,
"avg_input_tokens": 294709.3482,
"avg_output_tokens": 7750.6161,
"avg_total_tokens": 302459.9643,
"avg_turns": 10.0268,
"per_run_input_tokens": 8251861.75,
"per_run_output_tokens": 217017.25,
"per_run_cost": 19.10793,
"actual_model_name": "gemini-3-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.4732,
"std": 0.0296
},
"pass@4": 0.5714,
"pass^4": 0.2857
},
"gpt-5-high": {
"total_tasks": 28,
"total_agent_execution_time": 130073.95933294296,
"total_input_tokens": 44878717,
"total_output_tokens": 5982277,
"total_tokens": 50860994,
"total_turns": 1522,
"avg_agent_execution_time": 1161.3746,
"avg_input_tokens": 400702.8304,
"avg_output_tokens": 53413.1875,
"avg_total_tokens": 454116.0179,
"avg_turns": 13.5893,
"per_run_input_tokens": 11219679.25,
"per_run_output_tokens": 1495569.25,
"per_run_cost": 28.980292,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.4464,
"std": 0.0179
},
"pass@4": 0.6071,
"pass^4": 0.2143
},
"kimi-k2-0905": {
"total_tasks": 28,
"total_agent_execution_time": 52309.592324495316,
"total_input_tokens": 125127304,
"total_output_tokens": 582528,
"total_tokens": 125709832,
"total_turns": 3758,
"avg_agent_execution_time": 467.0499,
"avg_input_tokens": 1117208.0714,
"avg_output_tokens": 5201.1429,
"avg_total_tokens": 1122409.2143,
"avg_turns": 33.5536,
"per_run_input_tokens": 31281826,
"per_run_output_tokens": 145632,
"per_run_cost": 19.133176,
"actual_model_name": "kimi-k2-0905-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.0804,
"std": 0.0296
},
"pass@4": 0.1071,
"pass^4": 0.0357
},
"gpt-5-nano-medium": {
"total_tasks": 28,
"total_agent_execution_time": 19092.810956716537,
"total_input_tokens": 22883863,
"total_output_tokens": 3592488,
"total_tokens": 26476351,
"total_turns": 835,
"avg_agent_execution_time": 170.4715,
"avg_input_tokens": 204320.2054,
"avg_output_tokens": 32075.7857,
"avg_total_tokens": 236395.9911,
"avg_turns": 7.4554,
"per_run_input_tokens": 5720965.75,
"per_run_output_tokens": 898122,
"per_run_cost": 0.645297,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0357,
"std": 0
},
"pass@4": 0.0357,
"pass^4": 0.0357
},
"gpt-5-2-high": {
"total_tasks": 28,
"total_agent_execution_time": 141006.1795346737,
"total_input_tokens": 96884181,
"total_output_tokens": 5221253,
"total_tokens": 102105434,
"total_turns": 2314,
"avg_agent_execution_time": 1258.9837,
"avg_input_tokens": 865037.3304,
"avg_output_tokens": 46618.3304,
"avg_total_tokens": 911655.6607,
"avg_turns": 20.6607,
"per_run_input_tokens": 24221045.25,
"per_run_output_tokens": 1305313.25,
"per_run_cost": 60.661215,
"actual_model_name": "gpt-5.2-2025-12-11",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.6071,
"std": 0.0253
},
"pass@4": 0.6786,
"pass^4": 0.5
},
"grok-4": {
"total_tasks": 28,
"total_agent_execution_time": 62048.289714574814,
"total_input_tokens": 76007445,
"total_output_tokens": 1460247,
"total_tokens": 77467692,
"total_turns": 2256,
"avg_agent_execution_time": 554.0026,
"avg_input_tokens": 678637.9018,
"avg_output_tokens": 13037.9196,
"avg_total_tokens": 691675.8214,
"avg_turns": 20.1429,
"per_run_input_tokens": 19001861.25,
"per_run_output_tokens": 365061.75,
"per_run_cost": 62.48151,
"actual_model_name": "grok-4-0709",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0268,
"std": 0.0155
},
"pass@4": 0.0357,
"pass^4": 0
},
"claude-sonnet-4-high": {
"total_tasks": 28,
"total_agent_execution_time": 21350.178874015808,
"total_input_tokens": 154199936,
"total_output_tokens": 474188,
"total_tokens": 154674124,
"total_turns": 2162,
"avg_agent_execution_time": 190.6266,
"avg_input_tokens": 1376785.1429,
"avg_output_tokens": 4233.8214,
"avg_total_tokens": 1381018.9643,
"avg_turns": 19.3036,
"per_run_input_tokens": 38549984,
"per_run_output_tokens": 118547,
"per_run_cost": 117.428157,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1964,
"std": 0.0818
},
"pass@4": 0.3571,
"pass^4": 0.0357
},
"gemini-3-pro-low": {
"total_tasks": 28,
"total_agent_execution_time": 22059.459194898605,
"total_input_tokens": 42515780,
"total_output_tokens": 956565,
"total_tokens": 43472345,
"total_turns": 1252,
"avg_agent_execution_time": 196.9595,
"avg_input_tokens": 379605.1786,
"avg_output_tokens": 8540.7589,
"avg_total_tokens": 388145.9375,
"avg_turns": 11.1786,
"per_run_input_tokens": 10628945,
"per_run_output_tokens": 239141.25,
"per_run_cost": 24.127585,
"actual_model_name": "gemini-3-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.4375,
"std": 0.0389
},
"pass@4": 0.5714,
"pass^4": 0.2143
},
"gpt-oss-120b": {
"total_tasks": 28,
"total_agent_execution_time": 3811.6953434944153,
"total_input_tokens": 7651000,
"total_output_tokens": 192347,
"total_tokens": 7843347,
"total_turns": 615,
"avg_agent_execution_time": 34.033,
"avg_input_tokens": 68312.5,
"avg_output_tokens": 1717.3839,
"avg_total_tokens": 70029.8839,
"avg_turns": 5.4911,
"per_run_input_tokens": 1912750,
"per_run_output_tokens": 48086.75,
"per_run_cost": 0.151182,
"actual_model_name": "gpt-oss-120b",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0357,
"std": 0.0253
},
"pass@4": 0.1429,
"pass^4": 0
},
"grok-4-fast": {
"total_tasks": 28,
"total_agent_execution_time": 17106.341089487076,
"total_input_tokens": 59274108,
"total_output_tokens": 1683861,
"total_tokens": 60957969,
"total_turns": 1936,
"avg_agent_execution_time": 152.7352,
"avg_input_tokens": 529233.1071,
"avg_output_tokens": 15034.4732,
"avg_total_tokens": 544267.5804,
"avg_turns": 17.2857,
"per_run_input_tokens": 14818527,
"per_run_output_tokens": 420965.25,
"per_run_cost": 3.174188,
"actual_model_name": "grok-4-fast",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0357,
"std": 0
},
"pass@4": 0.0357,
"pass^4": 0.0357
},
"o4-mini": {
"total_tasks": 28,
"total_agent_execution_time": 49544.87328624725,
"total_input_tokens": 29974571,
"total_output_tokens": 2909132,
"total_tokens": 32883703,
"total_turns": 1712,
"avg_agent_execution_time": 442.3649,
"avg_input_tokens": 267630.0982,
"avg_output_tokens": 25974.3929,
"avg_total_tokens": 293604.4911,
"avg_turns": 15.2857,
"per_run_input_tokens": 7493642.75,
"per_run_output_tokens": 727283,
"per_run_cost": 11.443052,
"actual_model_name": "o4-mini-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2054,
"std": 0.0585
},
"pass@4": 0.4286,
"pass^4": 0.0714
},
"gpt-4-1-mini": {
"total_tasks": 28,
"total_agent_execution_time": 6618.801545858383,
"total_input_tokens": 29427654,
"total_output_tokens": 151813,
"total_tokens": 29579467,
"total_turns": 1408,
"avg_agent_execution_time": 59.0964,
"avg_input_tokens": 262746.9107,
"avg_output_tokens": 1355.4732,
"avg_total_tokens": 264102.3839,
"avg_turns": 12.5714,
"per_run_input_tokens": 7356913.5,
"per_run_output_tokens": 37953.25,
"per_run_cost": 3.003491,
"actual_model_name": "gpt-4.1-mini-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.0179,
"std": 0.0179
},
"pass@4": 0.0357,
"pass^4": 0
}
},
"playwright": {
"deepseek-v3-1-terminus-thinking": {
"total_tasks": 25,
"total_agent_execution_time": 77575.21338820457,
"total_input_tokens": 53019890,
"total_output_tokens": 1369290,
"total_tokens": 54389180,
"total_turns": 1393,
"avg_agent_execution_time": 775.7521,
"avg_input_tokens": 530198.9,
"avg_output_tokens": 13692.9,
"avg_total_tokens": 543891.8,
"avg_turns": 13.93,
"per_run_input_tokens": 13254972.5,
"per_run_output_tokens": 342322.5,
"per_run_cost": 3.053979,
"actual_model_name": "deepseek-v3.1-terminus-thinking",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.09,
"std": 0.0173
},
"pass@4": 0.2,
"pass^4": 0
},
"deepseek-chat": {
"total_tasks": 25,
"total_agent_execution_time": 28832.343264341354,
"total_input_tokens": 83601065,
"total_output_tokens": 176766,
"total_tokens": 83777831,
"total_turns": 1909,
"avg_agent_execution_time": 288.3234,
"avg_input_tokens": 836010.65,
"avg_output_tokens": 1767.66,
"avg_total_tokens": 837778.31,
"avg_turns": 19.09,
"per_run_input_tokens": 20900266.25,
"per_run_output_tokens": 44191.5,
"per_run_cost": 11.778391,
"actual_model_name": "deepseek-v3.1-non-think",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.07,
"std": 0.0332
},
"pass@4": 0.16,
"pass^4": 0
},
"gpt-4-1": {
"total_tasks": 25,
"total_agent_execution_time": 9215.533914804459,
"total_input_tokens": 85977431,
"total_output_tokens": 86136,
"total_tokens": 86063567,
"total_turns": 1380,
"avg_agent_execution_time": 92.1553,
"avg_input_tokens": 859774.31,
"avg_output_tokens": 861.36,
"avg_total_tokens": 860635.67,
"avg_turns": 13.8,
"per_run_input_tokens": 21494357.75,
"per_run_output_tokens": 21534,
"per_run_cost": 43.160987,
"actual_model_name": "gpt-4.1-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.08,
"std": 0.0283
},
"pass@4": 0.12,
"pass^4": 0.04
},
"claude-opus-4-1": {
"total_tasks": 25,
"total_agent_execution_time": 9880.799889326096,
"total_input_tokens": 28651348,
"total_output_tokens": 72107,
"total_tokens": 28723455,
"total_turns": 476,
"avg_agent_execution_time": 395.232,
"avg_input_tokens": 1146053.92,
"avg_output_tokens": 2884.28,
"avg_total_tokens": 1148938.2,
"avg_turns": 19.04,
"per_run_input_tokens": 28651348,
"per_run_output_tokens": 72107,
"per_run_cost": 435.178245,
"actual_model_name": "claude-opus-4-1-20250805",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.24,
"std": 0
}
},
"glm-4-5": {
"total_tasks": 25,
"total_agent_execution_time": 16563.084107398987,
"total_input_tokens": 58272900,
"total_output_tokens": 275965,
"total_tokens": 58548865,
"total_turns": 1536,
"avg_agent_execution_time": 165.6308,
"avg_input_tokens": 582729,
"avg_output_tokens": 2759.65,
"avg_total_tokens": 585488.65,
"avg_turns": 15.36,
"per_run_input_tokens": 14568225,
"per_run_output_tokens": 68991.25,
"per_run_cost": 4.898583,
"actual_model_name": "glm-4.5",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.13,
"std": 0.0332
},
"pass@4": 0.2,
"pass^4": 0.04
},
"o3": {
"total_tasks": 25,
"total_agent_execution_time": 15392.43521952629,
"total_input_tokens": 55630356,
"total_output_tokens": 446007,
"total_tokens": 56076363,
"total_turns": 1630,
"avg_agent_execution_time": 153.9244,
"avg_input_tokens": 556303.56,
"avg_output_tokens": 4460.07,
"avg_total_tokens": 560763.63,
"avg_turns": 16.3,
"per_run_input_tokens": 13907589,
"per_run_output_tokens": 111501.75,
"per_run_cost": 28.707192,
"actual_model_name": "o3-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.15,
"std": 0.052
},
"pass@4": 0.32,
"pass^4": 0.08
},
"claude-sonnet-4-5": {
"total_tasks": 25,
"total_agent_execution_time": 17537.452577114105,
"total_input_tokens": 124180757,
"total_output_tokens": 329104,
"total_tokens": 124509861,
"total_turns": 1943,
"avg_agent_execution_time": 175.3745,
"avg_input_tokens": 1241807.57,
"avg_output_tokens": 3291.04,
"avg_total_tokens": 1245098.61,
"avg_turns": 19.43,
"per_run_input_tokens": 31045189.25,
"per_run_output_tokens": 82276,
"per_run_cost": 94.369708,
"actual_model_name": "claude-sonnet-4-5-20250929",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.27,
"std": 0.0592
},
"pass@4": 0.36,
"pass^4": 0.16
},
"claude-sonnet-4-low": {
"total_tasks": 25,
"total_agent_execution_time": 23905.656344890594,
"total_input_tokens": 207574613,
"total_output_tokens": 382254,
"total_tokens": 207956867,
"total_turns": 1956,
"avg_agent_execution_time": 239.0566,
"avg_input_tokens": 2075746.13,
"avg_output_tokens": 3822.54,
"avg_total_tokens": 2079568.67,
"avg_turns": 19.56,
"per_run_input_tokens": 51893653.25,
"per_run_output_tokens": 95563.5,
"per_run_cost": 157.114412,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.22,
"std": 0.0346
},
"pass@4": 0.28,
"pass^4": 0.2
},
"gemini-2-5-pro": {
"total_tasks": 25,
"total_agent_execution_time": 17766.667247772217,
"total_input_tokens": 169644208,
"total_output_tokens": 558015,
"total_tokens": 170202223,
"total_turns": 1915,
"avg_agent_execution_time": 177.6667,
"avg_input_tokens": 1696442.08,
"avg_output_tokens": 5580.15,
"avg_total_tokens": 1702022.23,
"avg_turns": 19.15,
"per_run_input_tokens": 42411052,
"per_run_output_tokens": 139503.75,
"per_run_cost": 108.120186,
"actual_model_name": "gemini-2.5-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.15,
"std": 0.0173
},
"pass@4": 0.32,
"pass^4": 0.04
},
"qwen-3-coder-plus": {
"total_tasks": 25,
"total_agent_execution_time": 68003.63626265526,
"total_input_tokens": 285157442,
"total_output_tokens": 238572,
"total_tokens": 285396014,
"total_turns": 2121,
"avg_agent_execution_time": 680.0364,
"avg_input_tokens": 2851574.42,
"avg_output_tokens": 2385.72,
"avg_total_tokens": 2853960.14,
"avg_turns": 21.21,
"per_run_input_tokens": 71289360.5,
"per_run_output_tokens": 59643,
"per_run_cost": 14.305587,
"actual_model_name": "qwen3-coder-480b-a35b-instruct",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.3,
"std": 0.0447
},
"pass@4": 0.48,
"pass^4": 0.08
},
"gpt-5-mini-high": {
"total_tasks": 25,
"total_agent_execution_time": 36555.053931713104,
"total_input_tokens": 228501117,
"total_output_tokens": 2286332,
"total_tokens": 230787449,
"total_turns": 2572,
"avg_agent_execution_time": 365.5505,
"avg_input_tokens": 2285011.17,
"avg_output_tokens": 22863.32,
"avg_total_tokens": 2307874.49,
"avg_turns": 25.72,
"per_run_input_tokens": 57125279.25,
"per_run_output_tokens": 571583,
"per_run_cost": 15.424486,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.15,
"std": 0.052
},
"pass@4": 0.32,
"pass^4": 0.04
},
"gpt-5-nano-low": {
"total_tasks": 25,
"total_agent_execution_time": 13930.66529917717,
"total_input_tokens": 42973579,
"total_output_tokens": 1369659,
"total_tokens": 44343238,
"total_turns": 1419,
"avg_agent_execution_time": 139.3067,
"avg_input_tokens": 429735.79,
"avg_output_tokens": 13696.59,
"avg_total_tokens": 443432.38,
"avg_turns": 14.19,
"per_run_input_tokens": 10743394.75,
"per_run_output_tokens": 342414.75,
"per_run_cost": 0.674136,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0
},
"gpt-5-medium": {
"total_tasks": 25,
"total_agent_execution_time": 60819.73624634743,
"total_input_tokens": 180716712,
"total_output_tokens": 2178557,
"total_tokens": 182895269,
"total_turns": 2378,
"avg_agent_execution_time": 608.1974,
"avg_input_tokens": 1807167.12,
"avg_output_tokens": 21785.57,
"avg_total_tokens": 1828952.69,
"avg_turns": 23.78,
"per_run_input_tokens": 45179178,
"per_run_output_tokens": 544639.25,
"per_run_cost": 61.920365,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.43,
"std": 0.052
},
"pass@4": 0.56,
"pass^4": 0.36
},
"claude-sonnet-4": {
"total_tasks": 25,
"total_agent_execution_time": 27869.116582155228,
"total_input_tokens": 124192324,
"total_output_tokens": 352308,
"total_tokens": 124544632,
"total_turns": 1980,
"avg_agent_execution_time": 278.6912,
"avg_input_tokens": 1241923.24,
"avg_output_tokens": 3523.08,
"avg_total_tokens": 1245446.32,
"avg_turns": 19.8,
"per_run_input_tokens": 31048081,
"per_run_output_tokens": 88077,
"per_run_cost": 94.465398,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.26,
"std": 0.06
},
"pass@4": 0.36,
"pass^4": 0.08
},
"gpt-5-low": {
"total_tasks": 25,
"total_agent_execution_time": 52685.02177119255,
"total_input_tokens": 172852606,
"total_output_tokens": 1872354,
"total_tokens": 174724960,
"total_turns": 2421,
"avg_agent_execution_time": 526.8502,
"avg_input_tokens": 1728526.06,
"avg_output_tokens": 18723.54,
"avg_total_tokens": 1747249.6,
"avg_turns": 24.21,
"per_run_input_tokens": 43213151.5,
"per_run_output_tokens": 468088.5,
"per_run_cost": 58.697324,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.45,
"std": 0.0173
},
"pass@4": 0.56,
"pass^4": 0.32
},
"gpt-5-mini-medium": {
"total_tasks": 25,
"total_agent_execution_time": 21595.999030590057,
"total_input_tokens": 181493809,
"total_output_tokens": 854754,
"total_tokens": 182348563,
"total_turns": 2275,
"avg_agent_execution_time": 215.96,
"avg_input_tokens": 1814938.09,
"avg_output_tokens": 8547.54,
"avg_total_tokens": 1823485.63,
"avg_turns": 22.75,
"per_run_input_tokens": 45373452.25,
"per_run_output_tokens": 213688.5,
"per_run_cost": 11.77074,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.12,
"std": 0.0632
},
"pass@4": 0.24,
"pass^4": 0.04
},
"gpt-4-1-nano": {
"total_tasks": 25,
"total_agent_execution_time": 5384.292573690414,
"total_input_tokens": 38979660,
"total_output_tokens": 73604,
"total_tokens": 39053264,
"total_turns": 1351,
"avg_agent_execution_time": 53.8429,
"avg_input_tokens": 389796.6,
"avg_output_tokens": 736.04,
"avg_total_tokens": 390532.64,
"avg_turns": 13.51,
"per_run_input_tokens": 9744915,
"per_run_output_tokens": 18401,
"per_run_cost": 0.981852,
"actual_model_name": "gpt-4.1-nano-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0
},
"grok-code-fast-1": {
"total_tasks": 25,
"total_agent_execution_time": 11951.442332029343,
"total_input_tokens": 115771636,
"total_output_tokens": 717365,
"total_tokens": 116489001,
"total_turns": 1823,
"avg_agent_execution_time": 119.5144,
"avg_input_tokens": 1157716.36,
"avg_output_tokens": 7173.65,
"avg_total_tokens": 1164890.01,
"avg_turns": 18.23,
"per_run_input_tokens": 28942909,
"per_run_output_tokens": 179341.25,
"per_run_cost": 6.057594,
"actual_model_name": "grok-code-fast-1",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.25,
"std": 0.0173
},
"pass@4": 0.36,
"pass^4": 0.08
},
"deepseek-v3-2-thinking": {
"total_tasks": 25,
"total_agent_execution_time": 34954.84049654007,
"total_input_tokens": 150437178,
"total_output_tokens": 560150,
"total_tokens": 150997328,
"total_turns": 2739,
"avg_agent_execution_time": 349.5484,
"avg_input_tokens": 1504371.78,
"avg_output_tokens": 5601.5,
"avg_total_tokens": 1509973.28,
"avg_turns": 27.39,
"per_run_input_tokens": 37609294.5,
"per_run_output_tokens": 140037.5,
"per_run_cost": 10.210525,
"actual_model_name": "deepseek-v3.2-reasoner",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.17,
"std": 0.0173
},
"pass@4": 0.24,
"pass^4": 0.12
},
"gemini-2-5-flash": {
"total_tasks": 25,
"total_agent_execution_time": 20540.323031187057,
"total_input_tokens": 383892671,
"total_output_tokens": 821477,
"total_tokens": 384714181,
"total_turns": 2633,
"avg_agent_execution_time": 205.4032,
"avg_input_tokens": 3838926.71,
"avg_output_tokens": 8214.77,
"avg_total_tokens": 3847141.81,
"avg_turns": 26.33,
"per_run_input_tokens": 95973167.75,
"per_run_output_tokens": 205369.25,
"per_run_cost": 29.305373,
"actual_model_name": "gemini-2.5-flash",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.06,
"std": 0.02
},
"pass@4": 0.12,
"pass^4": 0
},
"gpt-5-nano-high": {
"total_tasks": 25,
"total_agent_execution_time": 32499.491863250732,
"total_input_tokens": 153821020,
"total_output_tokens": 4015265,
"total_tokens": 157836285,
"total_turns": 2692,
"avg_agent_execution_time": 324.9949,
"avg_input_tokens": 1538210.2,
"avg_output_tokens": 40152.65,
"avg_total_tokens": 1578362.85,
"avg_turns": 26.92,
"per_run_input_tokens": 38455255,
"per_run_output_tokens": 1003816.25,
"per_run_cost": 2.324289,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.02,
"std": 0.02
},
"pass@4": 0.04,
"pass^4": 0
},
"gpt-5-mini-low": {
"total_tasks": 25,
"total_agent_execution_time": 6713.273415088654,
"total_input_tokens": 41994636,
"total_output_tokens": 185994,
"total_tokens": 42180630,
"total_turns": 1055,
"avg_agent_execution_time": 67.1327,
"avg_input_tokens": 419946.36,
"avg_output_tokens": 1859.94,
"avg_total_tokens": 421806.3,
"avg_turns": 10.55,
"per_run_input_tokens": 10498659,
"per_run_output_tokens": 46498.5,
"per_run_cost": 2.717662,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.01,
"std": 0.0173
},
"pass@4": 0.04,
"pass^4": 0
},
"deepseek-v3-1-terminus": {
"total_tasks": 25,
"total_agent_execution_time": 32988.77002954483,
"total_input_tokens": 67216694,
"total_output_tokens": 187775,
"total_tokens": 67404469,
"total_turns": 1559,
"avg_agent_execution_time": 329.8877,
"avg_input_tokens": 672166.94,
"avg_output_tokens": 1877.75,
"avg_total_tokens": 674044.69,
"avg_turns": 15.59,
"per_run_input_tokens": 16804173.5,
"per_run_output_tokens": 46943.75,
"per_run_cost": 3.565962,
"actual_model_name": "deepseek-v3.1-terminus",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.13,
"std": 0.0173
},
"pass@4": 0.2,
"pass^4": 0.08
},
"qwen-3-max": {
"total_tasks": 25,
"total_agent_execution_time": 41767.95633363724,
"total_input_tokens": 229766924,
"total_output_tokens": 116233,
"total_tokens": 229883157,
"total_turns": 2783,
"avg_agent_execution_time": 417.6796,
"avg_input_tokens": 2297669.24,
"avg_output_tokens": 1162.33,
"avg_total_tokens": 2298831.57,
"avg_turns": 27.83,
"per_run_input_tokens": 57441731,
"per_run_output_tokens": 29058.25,
"per_run_cost": 69.104427,
"actual_model_name": "qwen3-max-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.08,
"std": 0
},
"pass@4": 0.12,
"pass^4": 0.04
},
"claude-opus-4-5-high": {
"total_tasks": 25,
"total_agent_execution_time": 16962.872637987137,
"total_input_tokens": 121322288,
"total_output_tokens": 324168,
"total_tokens": 121646456,
"total_turns": 1912,
"avg_agent_execution_time": 169.6287,
"avg_input_tokens": 1213222.88,
"avg_output_tokens": 3241.68,
"avg_total_tokens": 1216464.56,
"avg_turns": 19.12,
"per_run_input_tokens": 30330572,
"per_run_output_tokens": 81042,
"per_run_cost": 153.67891,
"actual_model_name": "claude-opus-4-5-20251101",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.45,
"std": 0.0173
},
"pass@4": 0.52,
"pass^4": 0.4
},
"deepseek-v3-2-chat": {
"total_tasks": 25,
"total_agent_execution_time": 31488.66070652008,
"total_input_tokens": 105921870,
"total_output_tokens": 370795,
"total_tokens": 106292665,
"total_turns": 2159,
"avg_agent_execution_time": 314.8866,
"avg_input_tokens": 1059218.7,
"avg_output_tokens": 3707.95,
"avg_total_tokens": 1062926.65,
"avg_turns": 21.59,
"per_run_input_tokens": 26480467.5,
"per_run_output_tokens": 92698.75,
"per_run_cost": 7.186806,
"actual_model_name": "deepseek-v3.2-chat",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.19,
"std": 0.0332
},
"pass@4": 0.28,
"pass^4": 0.12
},
"kimi-k2-0711": {
"total_tasks": 25,
"total_agent_execution_time": 22136.299335479736,
"total_input_tokens": 73770186,
"total_output_tokens": 169700,
"total_tokens": 73939886,
"total_turns": 1783,
"avg_agent_execution_time": 221.363,
"avg_input_tokens": 737701.86,
"avg_output_tokens": 1697,
"avg_total_tokens": 739398.86,
"avg_turns": 17.83,
"per_run_input_tokens": 18442546.5,
"per_run_output_tokens": 42425,
"per_run_cost": 11.17159,
"actual_model_name": "kimi-k2-0711-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.13,
"std": 0.0332
},
"pass@4": 0.16,
"pass^4": 0.08
},
"gemini-3-pro-high": {
"total_tasks": 25,
"total_agent_execution_time": 32552.172060728073,
"total_input_tokens": 319775608,
"total_output_tokens": 877470,
"total_tokens": 320653078,
"total_turns": 2464,
"avg_agent_execution_time": 325.5217,
"avg_input_tokens": 3197756.08,
"avg_output_tokens": 8774.7,
"avg_total_tokens": 3206530.78,
"avg_turns": 24.64,
"per_run_input_tokens": 79943902,
"per_run_output_tokens": 219367.5,
"per_run_cost": 162.520214,
"actual_model_name": "gemini-3-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.4,
"std": 0.0566
},
"pass@4": 0.48,
"pass^4": 0.28
},
"gpt-5-high": {
"total_tasks": 25,
"total_agent_execution_time": 111593.64490675926,
"total_input_tokens": 169300141,
"total_output_tokens": 3367299,
"total_tokens": 172667440,
"total_turns": 2312,
"avg_agent_execution_time": 1115.9364,
"avg_input_tokens": 1693001.41,
"avg_output_tokens": 33672.99,
"avg_total_tokens": 1726674.4,
"avg_turns": 23.12,
"per_run_input_tokens": 42325035.25,
"per_run_output_tokens": 841824.75,
"per_run_cost": 61.324542,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.42,
"std": 0.0447
},
"pass@4": 0.56,
"pass^4": 0.24
},
"kimi-k2-0905": {
"total_tasks": 25,
"total_agent_execution_time": 38055.46840238571,
"total_input_tokens": 135802332,
"total_output_tokens": 216646,
"total_tokens": 136018978,
"total_turns": 2064,
"avg_agent_execution_time": 380.5547,
"avg_input_tokens": 1358023.32,
"avg_output_tokens": 2166.46,
"avg_total_tokens": 1360189.78,
"avg_turns": 20.64,
"per_run_input_tokens": 33950583,
"per_run_output_tokens": 54161.5,
"per_run_cost": 20.505754,
"actual_model_name": "kimi-k2-0905-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.3,
"std": 0.06
},
"pass@4": 0.4,
"pass^4": 0.2
},
"gpt-5-nano-medium": {
"total_tasks": 25,
"total_agent_execution_time": 17114.76342177391,
"total_input_tokens": 71195262,
"total_output_tokens": 1771003,
"total_tokens": 72966265,
"total_turns": 1852,
"avg_agent_execution_time": 171.1476,
"avg_input_tokens": 711952.62,
"avg_output_tokens": 17710.03,
"avg_total_tokens": 729662.65,
"avg_turns": 18.52,
"per_run_input_tokens": 17798815.5,
"per_run_output_tokens": 442750.75,
"per_run_cost": 1.067041,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0
},
"gpt-5-2-high": {
"total_tasks": 25,
"total_agent_execution_time": 53476.62803673744,
"total_input_tokens": 188633415,
"total_output_tokens": 1564048,
"total_tokens": 190197463,
"total_turns": 2470,
"avg_agent_execution_time": 534.7663,
"avg_input_tokens": 1886334.15,
"avg_output_tokens": 15640.48,
"avg_total_tokens": 1901974.63,
"avg_turns": 24.7,
"per_run_input_tokens": 47158353.75,
"per_run_output_tokens": 391012,
"per_run_cost": 88.001287,
"actual_model_name": "gpt-5.2-2025-12-11",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.46,
"std": 0.0346
},
"pass@4": 0.6,
"pass^4": 0.32
},
"grok-4": {
"total_tasks": 25,
"total_agent_execution_time": 27723.676310300827,
"total_input_tokens": 126490610,
"total_output_tokens": 663708,
"total_tokens": 127154318,
"total_turns": 2005,
"avg_agent_execution_time": 277.2368,
"avg_input_tokens": 1264906.1,
"avg_output_tokens": 6637.08,
"avg_total_tokens": 1271543.18,
"avg_turns": 20.05,
"per_run_input_tokens": 31622652.5,
"per_run_output_tokens": 165927,
"per_run_cost": 97.356863,
"actual_model_name": "grok-4-0709",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.35,
"std": 0.0768
},
"pass@4": 0.48,
"pass^4": 0.2
},
"claude-sonnet-4-high": {
"total_tasks": 25,
"total_agent_execution_time": 26188.123467206955,
"total_input_tokens": 203767484,
"total_output_tokens": 387535,
"total_tokens": 204155019,
"total_turns": 2003,
"avg_agent_execution_time": 261.8812,
"avg_input_tokens": 2037674.84,
"avg_output_tokens": 3875.35,
"avg_total_tokens": 2041550.19,
"avg_turns": 20.03,
"per_run_input_tokens": 50941871,
"per_run_output_tokens": 96883.75,
"per_run_cost": 154.278869,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.26,
"std": 0.02
},
"pass@4": 0.28,
"pass^4": 0.24
},
"gemini-3-pro-low": {
"total_tasks": 25,
"total_agent_execution_time": 28675.66722226143,
"total_input_tokens": 301650195,
"total_output_tokens": 787544,
"total_tokens": 302437739,
"total_turns": 2361,
"avg_agent_execution_time": 286.7567,
"avg_input_tokens": 3016501.95,
"avg_output_tokens": 7875.44,
"avg_total_tokens": 3024377.39,
"avg_turns": 23.61,
"per_run_input_tokens": 75412548.75,
"per_run_output_tokens": 196886,
"per_run_cost": 153.187729,
"actual_model_name": "gemini-3-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.4,
"std": 0.04
},
"pass@4": 0.48,
"pass^4": 0.28
},
"gpt-oss-120b": {
"total_tasks": 25,
"total_agent_execution_time": 3726.188953638077,
"total_input_tokens": 13933005,
"total_output_tokens": 126750,
"total_tokens": 14059755,
"total_turns": 721,
"avg_agent_execution_time": 37.2619,
"avg_input_tokens": 139330.05,
"avg_output_tokens": 1267.5,
"avg_total_tokens": 140597.55,
"avg_turns": 7.21,
"per_run_input_tokens": 3483251.25,
"per_run_output_tokens": 31687.5,
"per_run_cost": 0.259667,
"actual_model_name": "gpt-oss-120b",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.03,
"std": 0.0173
},
"pass@4": 0.04,
"pass^4": 0
},
"grok-4-fast": {
"total_tasks": 25,
"total_agent_execution_time": 10575.462986469269,
"total_input_tokens": 152417879,
"total_output_tokens": 492539,
"total_tokens": 152910418,
"total_turns": 1792,
"avg_agent_execution_time": 105.7546,
"avg_input_tokens": 1524178.79,
"avg_output_tokens": 4925.39,
"avg_total_tokens": 1529104.18,
"avg_turns": 17.92,
"per_run_input_tokens": 38104469.75,
"per_run_output_tokens": 123134.75,
"per_run_cost": 7.682461,
"actual_model_name": "grok-4-fast",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.27,
"std": 0.0332
},
"pass@4": 0.4,
"pass^4": 0.16
},
"o4-mini": {
"total_tasks": 25,
"total_agent_execution_time": 53063.511632204056,
"total_input_tokens": 86250613,
"total_output_tokens": 1807045,
"total_tokens": 88057658,
"total_turns": 1770,
"avg_agent_execution_time": 530.6351,
"avg_input_tokens": 862506.13,
"avg_output_tokens": 18070.45,
"avg_total_tokens": 880576.58,
"avg_turns": 17.7,
"per_run_input_tokens": 21562653.25,
"per_run_output_tokens": 451761.25,
"per_run_cost": 25.706668,
"actual_model_name": "o4-mini-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.12,
"std": 0.0283
},
"pass@4": 0.28,
"pass^4": 0
},
"gpt-4-1-mini": {
"total_tasks": 25,
"total_agent_execution_time": 19571.90656900406,
"total_input_tokens": 495914086,
"total_output_tokens": 327829,
"total_tokens": 496241915,
"total_turns": 3133,
"avg_agent_execution_time": 195.7191,
"avg_input_tokens": 4959140.86,
"avg_output_tokens": 3278.29,
"avg_total_tokens": 4962419.15,
"avg_turns": 31.33,
"per_run_input_tokens": 123978521.5,
"per_run_output_tokens": 81957.25,
"per_run_cost": 49.72254,
"actual_model_name": "gpt-4.1-mini-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0
}
},
"postgres": {
"deepseek-v3-1-terminus-thinking": {
"total_tasks": 21,
"total_agent_execution_time": 35156.10177230835,
"total_input_tokens": 22322429,
"total_output_tokens": 679670,
"total_tokens": 23002099,
"total_turns": 1567,
"avg_agent_execution_time": 418.525,
"avg_input_tokens": 265743.2024,
"avg_output_tokens": 8091.3095,
"avg_total_tokens": 273834.5119,
"avg_turns": 18.6548,
"per_run_input_tokens": 5580607.25,
"per_run_output_tokens": 169917.5,
"per_run_cost": 1.306162,
"actual_model_name": "deepseek-v3.1-terminus-thinking",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.4167,
"std": 0.0781
},
"pass@4": 0.619,
"pass^4": 0.1905
},
"deepseek-chat": {
"total_tasks": 21,
"total_agent_execution_time": 29866.240534305573,
"total_input_tokens": 26594178,
"total_output_tokens": 390257,
"total_tokens": 26984435,
"total_turns": 2224,
"avg_agent_execution_time": 355.5505,
"avg_input_tokens": 316597.3571,
"avg_output_tokens": 4645.9167,
"avg_total_tokens": 321243.2738,
"avg_turns": 26.4762,
"per_run_input_tokens": 6648544.5,
"per_run_output_tokens": 97564.25,
"per_run_cost": 3.887093,
"actual_model_name": "deepseek-v3.1-non-think",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.4286,
"std": 0.0753
},
"pass@4": 0.619,
"pass^4": 0.2857
},
"gpt-4-1": {
"total_tasks": 21,
"total_agent_execution_time": 2425.0810141563416,
"total_input_tokens": 4628898,
"total_output_tokens": 100826,
"total_tokens": 4729724,
"total_turns": 682,
"avg_agent_execution_time": 28.87,
"avg_input_tokens": 55105.9286,
"avg_output_tokens": 1200.3095,
"avg_total_tokens": 56306.2381,
"avg_turns": 8.119,
"per_run_input_tokens": 1157224.5,
"per_run_output_tokens": 25206.5,
"per_run_cost": 2.516101,
"actual_model_name": "gpt-4.1-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.0476,
"std": 0
},
"pass@4": 0.0476,
"pass^4": 0.0476
},
"claude-opus-4-1": {
"total_tasks": 21,
"total_agent_execution_time": 10822.950607061386,
"total_input_tokens": 5474339,
"total_output_tokens": 205732,
"total_tokens": 5680071,
"total_turns": 522,
"avg_agent_execution_time": 515.3786,
"avg_input_tokens": 260682.8095,
"avg_output_tokens": 9796.7619,
"avg_total_tokens": 270479.5714,
"avg_turns": 24.8571,
"per_run_input_tokens": 5474339,
"per_run_output_tokens": 205732,
"per_run_cost": 97.544985,
"actual_model_name": "claude-opus-4-1-20250805",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.3333,
"std": 0
}
},
"glm-4-5": {
"total_tasks": 21,
"total_agent_execution_time": 13283.361280679703,
"total_input_tokens": 17186959,
"total_output_tokens": 431440,
"total_tokens": 17618399,
"total_turns": 2133,
"avg_agent_execution_time": 158.1353,
"avg_input_tokens": 204606.6548,
"avg_output_tokens": 5136.1905,
"avg_total_tokens": 209742.8452,
"avg_turns": 25.3929,
"per_run_input_tokens": 4296739.75,
"per_run_output_tokens": 107860,
"per_run_cost": 1.560299,
"actual_model_name": "glm-4.5",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1429,
"std": 0.0753
},
"pass@4": 0.2381,
"pass^4": 0
},
"o3": {
"total_tasks": 21,
"total_agent_execution_time": 6351.111663579941,
"total_input_tokens": 5338793,
"total_output_tokens": 396424,
"total_tokens": 5735217,
"total_turns": 900,
"avg_agent_execution_time": 75.6085,
"avg_input_tokens": 63557.0595,
"avg_output_tokens": 4719.3333,
"avg_total_tokens": 68276.3929,
"avg_turns": 10.7143,
"per_run_input_tokens": 1334698.25,
"per_run_output_tokens": 99106,
"per_run_cost": 3.462244,
"actual_model_name": "o3-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.369,
"std": 0.0395
},
"pass@4": 0.6667,
"pass^4": 0.1429
},
"claude-sonnet-4-5": {
"total_tasks": 21,
"total_agent_execution_time": 20314.365948438644,
"total_input_tokens": 40592706,
"total_output_tokens": 887731,
"total_tokens": 41480437,
"total_turns": 2135,
"avg_agent_execution_time": 241.8377,
"avg_input_tokens": 483246.5,
"avg_output_tokens": 10568.2262,
"avg_total_tokens": 493814.7262,
"avg_turns": 25.4167,
"per_run_input_tokens": 10148176.5,
"per_run_output_tokens": 221932.75,
"per_run_cost": 33.773521,
"actual_model_name": "claude-sonnet-4-5-20250929",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.5,
"std": 0.0412
},
"pass@4": 0.6667,
"pass^4": 0.381
},
"claude-sonnet-4-low": {
"total_tasks": 21,
"total_agent_execution_time": 15643.686558961868,
"total_input_tokens": 56619774,
"total_output_tokens": 560879,
"total_tokens": 57180653,
"total_turns": 2040,
"avg_agent_execution_time": 186.2344,
"avg_input_tokens": 674044.9286,
"avg_output_tokens": 6677.131,
"avg_total_tokens": 680722.0595,
"avg_turns": 24.2857,
"per_run_input_tokens": 14154943.5,
"per_run_output_tokens": 140219.75,
"per_run_cost": 44.568127,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.4881,
"std": 0.0704
},
"pass@4": 0.7143,
"pass^4": 0.3333
},
"gemini-2-5-pro": {
"total_tasks": 21,
"total_agent_execution_time": 7857.584183454514,
"total_input_tokens": 3338499,
"total_output_tokens": 748442,
"total_tokens": 4086941,
"total_turns": 626,
"avg_agent_execution_time": 93.5427,
"avg_input_tokens": 39744.0357,
"avg_output_tokens": 8910.0238,
"avg_total_tokens": 48654.0595,
"avg_turns": 7.4524,
"per_run_input_tokens": 834624.75,
"per_run_output_tokens": 187110.5,
"per_run_cost": 4.893219,
"actual_model_name": "gemini-2.5-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2619,
"std": 0.079
},
"pass@4": 0.4762,
"pass^4": 0.0952
},
"qwen-3-coder-plus": {
"total_tasks": 21,
"total_agent_execution_time": 11836.674258947372,
"total_input_tokens": 48207822,
"total_output_tokens": 430956,
"total_tokens": 48638778,
"total_turns": 2436,
"avg_agent_execution_time": 140.9128,
"avg_input_tokens": 573902.6429,
"avg_output_tokens": 5130.4286,
"avg_total_tokens": 579033.0714,
"avg_turns": 29,
"per_run_input_tokens": 12051955.5,
"per_run_output_tokens": 107739,
"per_run_cost": 2.496582,
"actual_model_name": "qwen3-coder-480b-a35b-instruct",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.4762,
"std": 0.0583
},
"pass@4": 0.619,
"pass^4": 0.381
},
"gpt-5-mini-high": {
"total_tasks": 21,
"total_agent_execution_time": 16902.33143734932,
"total_input_tokens": 13472021,
"total_output_tokens": 1982560,
"total_tokens": 15454581,
"total_turns": 1177,
"avg_agent_execution_time": 201.2182,
"avg_input_tokens": 160381.2024,
"avg_output_tokens": 23601.9048,
"avg_total_tokens": 183983.1071,
"avg_turns": 14.0119,
"per_run_input_tokens": 3368005.25,
"per_run_output_tokens": 495640,
"per_run_cost": 1.833281,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.6667,
"std": 0.0337
},
"pass@4": 0.8095,
"pass^4": 0.4286
},
"gpt-5-nano-low": {
"total_tasks": 21,
"total_agent_execution_time": 6595.001572132111,
"total_input_tokens": 2014372,
"total_output_tokens": 1323958,
"total_tokens": 3338330,
"total_turns": 569,
"avg_agent_execution_time": 78.5119,
"avg_input_tokens": 23980.619,
"avg_output_tokens": 15761.4048,
"avg_total_tokens": 39742.0238,
"avg_turns": 6.7738,
"per_run_input_tokens": 503593,
"per_run_output_tokens": 330989.5,
"per_run_cost": 0.157575,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0833,
"std": 0.0395
},
"pass@4": 0.1905,
"pass^4": 0
},
"gpt-5-medium": {
"total_tasks": 21,
"total_agent_execution_time": 28409.77977871895,
"total_input_tokens": 9521385,
"total_output_tokens": 1431337,
"total_tokens": 10952722,
"total_turns": 1123,
"avg_agent_execution_time": 338.2117,
"avg_input_tokens": 113349.8214,
"avg_output_tokens": 17039.7262,
"avg_total_tokens": 130389.5476,
"avg_turns": 13.369,
"per_run_input_tokens": 2380346.25,
"per_run_output_tokens": 357834.25,
"per_run_cost": 6.553775,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.7619,
"std": 0.0753
},
"pass@4": 1,
"pass^4": 0.4762
},
"claude-sonnet-4": {
"total_tasks": 21,
"total_agent_execution_time": 20113.901407003403,
"total_input_tokens": 27812468,
"total_output_tokens": 633803,
"total_tokens": 28446271,
"total_turns": 2251,
"avg_agent_execution_time": 239.4512,
"avg_input_tokens": 331100.8095,
"avg_output_tokens": 7545.2738,
"avg_total_tokens": 338646.0833,
"avg_turns": 26.7976,
"per_run_input_tokens": 6953117,
"per_run_output_tokens": 158450.75,
"per_run_cost": 23.236112,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.5357,
"std": 0.0619
},
"pass@4": 0.7143,
"pass^4": 0.381
},
"gpt-5-low": {
"total_tasks": 21,
"total_agent_execution_time": 22876.62664246559,
"total_input_tokens": 8603642,
"total_output_tokens": 1368540,
"total_tokens": 9972182,
"total_turns": 1051,
"avg_agent_execution_time": 272.3408,
"avg_input_tokens": 102424.3095,
"avg_output_tokens": 16292.1429,
"avg_total_tokens": 118716.4524,
"avg_turns": 12.5119,
"per_run_input_tokens": 2150910.5,
"per_run_output_tokens": 342135,
"per_run_cost": 6.109988,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.7381,
"std": 0.0412
},
"pass@4": 0.9524,
"pass^4": 0.381
},
"gpt-5-mini-medium": {
"total_tasks": 21,
"total_agent_execution_time": 8105.054537057877,
"total_input_tokens": 9693898,
"total_output_tokens": 778810,
"total_tokens": 10472708,
"total_turns": 989,
"avg_agent_execution_time": 96.4887,
"avg_input_tokens": 115403.5476,
"avg_output_tokens": 9271.5476,
"avg_total_tokens": 124675.0952,
"avg_turns": 11.7738,
"per_run_input_tokens": 2423474.5,
"per_run_output_tokens": 194702.5,
"per_run_cost": 0.995274,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.619,
"std": 0.0583
},
"pass@4": 0.9048,
"pass^4": 0.2857
},
"gpt-4-1-nano": {
"total_tasks": 21,
"total_agent_execution_time": 2732.388694524765,
"total_input_tokens": 5969485,
"total_output_tokens": 203920,
"total_tokens": 6173405,
"total_turns": 733,
"avg_agent_execution_time": 32.5284,
"avg_input_tokens": 71065.2976,
"avg_output_tokens": 2427.619,
"avg_total_tokens": 73492.9167,
"avg_turns": 8.7262,
"per_run_input_tokens": 1492371.25,
"per_run_output_tokens": 50980,
"per_run_cost": 0.169629,
"actual_model_name": "gpt-4.1-nano-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0
},
"grok-code-fast-1": {
"total_tasks": 21,
"total_agent_execution_time": 4306.698365211487,
"total_input_tokens": 19018054,
"total_output_tokens": 458508,
"total_tokens": 19476562,
"total_turns": 1655,
"avg_agent_execution_time": 51.2702,
"avg_input_tokens": 226405.4048,
"avg_output_tokens": 5458.4286,
"avg_total_tokens": 231863.8333,
"avg_turns": 19.7024,
"per_run_input_tokens": 4754513.5,
"per_run_output_tokens": 114627,
"per_run_cost": 1.122843,
"actual_model_name": "grok-code-fast-1",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.4762,
"std": 0.0476
},
"pass@4": 0.619,
"pass^4": 0.2857
},
"deepseek-v3-2-thinking": {
"total_tasks": 21,
"total_agent_execution_time": 34019.092260837555,
"total_input_tokens": 41523136,
"total_output_tokens": 812209,
"total_tokens": 42335345,
"total_turns": 2767,
"avg_agent_execution_time": 404.9892,
"avg_input_tokens": 494323.0476,
"avg_output_tokens": 9669.1548,
"avg_total_tokens": 503992.2024,
"avg_turns": 32.9405,
"per_run_input_tokens": 10380784,
"per_run_output_tokens": 203052.25,
"per_run_cost": 2.884033,
"actual_model_name": "deepseek-v3.2-reasoner",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.6667,
"std": 0.0583
},
"pass@4": 0.9048,
"pass^4": 0.381
},
"gemini-2-5-flash": {
"total_tasks": 21,
"total_agent_execution_time": 5116.033502817154,
"total_input_tokens": 3870615,
"total_output_tokens": 834214,
"total_tokens": 4704829,
"total_turns": 736,
"avg_agent_execution_time": 60.9052,
"avg_input_tokens": 46078.75,
"avg_output_tokens": 9931.119,
"avg_total_tokens": 56009.869,
"avg_turns": 8.7619,
"per_run_input_tokens": 967653.75,
"per_run_output_tokens": 208553.5,
"per_run_cost": 0.81168,
"actual_model_name": "gemini-2.5-flash",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1071,
"std": 0.0619
},
"pass@4": 0.2381,
"pass^4": 0.0476
},
"gpt-5-nano-high": {
"total_tasks": 21,
"total_agent_execution_time": 25844.52258706093,
"total_input_tokens": 15962048,
"total_output_tokens": 5159736,
"total_tokens": 21121784,
"total_turns": 1311,
"avg_agent_execution_time": 307.6729,
"avg_input_tokens": 190024.381,
"avg_output_tokens": 61425.4286,
"avg_total_tokens": 251449.8095,
"avg_turns": 15.6071,
"per_run_input_tokens": 3990512,
"per_run_output_tokens": 1289934,
"per_run_cost": 0.715499,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0952,
"std": 0.0337
},
"pass@4": 0.3333,
"pass^4": 0
},
"gpt-5-mini-low": {
"total_tasks": 21,
"total_agent_execution_time": 4425.5045709609985,
"total_input_tokens": 2294861,
"total_output_tokens": 393050,
"total_tokens": 2687911,
"total_turns": 491,
"avg_agent_execution_time": 52.6846,
"avg_input_tokens": 27319.7738,
"avg_output_tokens": 4679.1667,
"avg_total_tokens": 31998.9405,
"avg_turns": 5.8452,
"per_run_input_tokens": 573715.25,
"per_run_output_tokens": 98262.5,
"per_run_cost": 0.339954,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1429,
"std": 0.0337
},
"pass@4": 0.2857,
"pass^4": 0
},
"deepseek-v3-1-terminus": {
"total_tasks": 21,
"total_agent_execution_time": 20219.888365983963,
"total_input_tokens": 24325657,
"total_output_tokens": 295084,
"total_tokens": 24620741,
"total_turns": 1652,
"avg_agent_execution_time": 240.713,
"avg_input_tokens": 289591.1548,
"avg_output_tokens": 3512.9048,
"avg_total_tokens": 293104.0595,
"avg_turns": 19.6667,
"per_run_input_tokens": 6081414.25,
"per_run_output_tokens": 73771,
"per_run_cost": 1.335376,
"actual_model_name": "deepseek-v3.1-terminus",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.3333,
"std": 0.1992
},
"pass@4": 0.5714,
"pass^4": 0
},
"qwen-3-max": {
"total_tasks": 21,
"total_agent_execution_time": 13406.892842531204,
"total_input_tokens": 16138565,
"total_output_tokens": 412055,
"total_tokens": 16550620,
"total_turns": 1586,
"avg_agent_execution_time": 159.6059,
"avg_input_tokens": 192125.7738,
"avg_output_tokens": 4905.4167,
"avg_total_tokens": 197031.1905,
"avg_turns": 18.881,
"per_run_input_tokens": 4034641.25,
"per_run_output_tokens": 103013.75,
"per_run_cost": 5.459652,
"actual_model_name": "qwen3-max-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.4405,
"std": 0.0206
},
"pass@4": 0.5238,
"pass^4": 0.381
},
"claude-opus-4-5-high": {
"total_tasks": 21,
"total_agent_execution_time": 14917.058457374573,
"total_input_tokens": 39823230,
"total_output_tokens": 647299,
"total_tokens": 40470529,
"total_turns": 1809,
"avg_agent_execution_time": 177.584,
"avg_input_tokens": 474086.0714,
"avg_output_tokens": 7705.9405,
"avg_total_tokens": 481792.0119,
"avg_turns": 21.5357,
"per_run_input_tokens": 9955807.5,
"per_run_output_tokens": 161824.75,
"per_run_cost": 53.824656,
"actual_model_name": "claude-opus-4-5-20251101",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.5357,
"std": 0.0519
},
"pass@4": 0.7143,
"pass^4": 0.4286
},
"deepseek-v3-2-chat": {
"total_tasks": 21,
"total_agent_execution_time": 26201.995259284973,
"total_input_tokens": 41279108,
"total_output_tokens": 559786,
"total_tokens": 41838894,
"total_turns": 2623,
"avg_agent_execution_time": 311.9285,
"avg_input_tokens": 491417.9524,
"avg_output_tokens": 6664.119,
"avg_total_tokens": 498082.0714,
"avg_turns": 31.2262,
"per_run_input_tokens": 10319777,
"per_run_output_tokens": 139946.5,
"per_run_cost": 2.842318,
"actual_model_name": "deepseek-v3.2-chat",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.5952,
"std": 0.0532
},
"pass@4": 0.8095,
"pass^4": 0.381
},
"kimi-k2-0711": {
"total_tasks": 21,
"total_agent_execution_time": 20866.895277023315,
"total_input_tokens": 22082538,
"total_output_tokens": 385375,
"total_tokens": 22467913,
"total_turns": 2247,
"avg_agent_execution_time": 248.4154,
"avg_input_tokens": 262887.3571,
"avg_output_tokens": 4587.7976,
"avg_total_tokens": 267475.1548,
"avg_turns": 26.75,
"per_run_input_tokens": 5520634.5,
"per_run_output_tokens": 96343.75,
"per_run_cost": 3.55324,
"actual_model_name": "kimi-k2-0711-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.4048,
"std": 0.079
},
"pass@4": 0.7143,
"pass^4": 0.2857
},
"gemini-3-pro-high": {
"total_tasks": 21,
"total_agent_execution_time": 15857.61001610756,
"total_input_tokens": 18119673,
"total_output_tokens": 753138,
"total_tokens": 18872811,
"total_turns": 1106,
"avg_agent_execution_time": 188.7811,
"avg_input_tokens": 215710.3929,
"avg_output_tokens": 8965.9286,
"avg_total_tokens": 224676.3214,
"avg_turns": 13.1667,
"per_run_input_tokens": 4529918.25,
"per_run_output_tokens": 188284.5,
"per_run_cost": 11.31925,
"actual_model_name": "gemini-3-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.7976,
"std": 0.0519
},
"pass@4": 0.8571,
"pass^4": 0.6667
},
"gpt-5-high": {
"total_tasks": 21,
"total_agent_execution_time": 82147.10680437088,
"total_input_tokens": 13455198,
"total_output_tokens": 2864869,
"total_tokens": 16320067,
"total_turns": 1379,
"avg_agent_execution_time": 977.9417,
"avg_input_tokens": 160180.9286,
"avg_output_tokens": 34105.5833,
"avg_total_tokens": 194286.5119,
"avg_turns": 16.4167,
"per_run_input_tokens": 3363799.5,
"per_run_output_tokens": 716217.25,
"per_run_cost": 11.366922,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.7262,
"std": 0.0395
},
"pass@4": 0.8571,
"pass^4": 0.5238
},
"kimi-k2-0905": {
"total_tasks": 21,
"total_agent_execution_time": 43449.057983875275,
"total_input_tokens": 37057475,
"total_output_tokens": 451643,
"total_tokens": 37509118,
"total_turns": 2538,
"avg_agent_execution_time": 517.2507,
"avg_input_tokens": 441160.4167,
"avg_output_tokens": 5376.7024,
"avg_total_tokens": 446537.119,
"avg_turns": 30.2143,
"per_run_input_tokens": 9264368.75,
"per_run_output_tokens": 112910.75,
"per_run_cost": 5.840898,
"actual_model_name": "kimi-k2-0905-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.4762,
"std": 0.0476
},
"pass@4": 0.6667,
"pass^4": 0.2857
},
"gpt-5-nano-medium": {
"total_tasks": 21,
"total_agent_execution_time": 10844.792605400085,
"total_input_tokens": 8821599,
"total_output_tokens": 1935384,
"total_tokens": 10756983,
"total_turns": 795,
"avg_agent_execution_time": 129.1047,
"avg_input_tokens": 105019.0357,
"avg_output_tokens": 23040.2857,
"avg_total_tokens": 128059.3214,
"avg_turns": 9.4643,
"per_run_input_tokens": 2205399.75,
"per_run_output_tokens": 483846,
"per_run_cost": 0.303808,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1548,
"std": 0.0519
},
"pass@4": 0.2857,
"pass^4": 0.0476
},
"gpt-5-2-high": {
"total_tasks": 21,
"total_agent_execution_time": 51900.328535079956,
"total_input_tokens": 23317441,
"total_output_tokens": 2150311,
"total_tokens": 25467752,
"total_turns": 1547,
"avg_agent_execution_time": 617.8611,
"avg_input_tokens": 277588.5833,
"avg_output_tokens": 25598.9405,
"avg_total_tokens": 303187.5238,
"avg_turns": 18.4167,
"per_run_input_tokens": 5829360.25,
"per_run_output_tokens": 537577.75,
"per_run_cost": 17.727469,
"actual_model_name": "gpt-5.2-2025-12-11",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.7262,
"std": 0.0206
},
"pass@4": 0.7619,
"pass^4": 0.619
},
"grok-4": {
"total_tasks": 21,
"total_agent_execution_time": 17157.205701589584,
"total_input_tokens": 15630090,
"total_output_tokens": 691637,
"total_tokens": 16321727,
"total_turns": 1503,
"avg_agent_execution_time": 204.2524,
"avg_input_tokens": 186072.5,
"avg_output_tokens": 8233.7738,
"avg_total_tokens": 194306.2738,
"avg_turns": 17.8929,
"per_run_input_tokens": 3907522.5,
"per_run_output_tokens": 172909.25,
"per_run_cost": 14.316206,
"actual_model_name": "grok-4-0709",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.5833,
"std": 0.0781
},
"pass@4": 0.8095,
"pass^4": 0.381
},
"claude-sonnet-4-high": {
"total_tasks": 21,
"total_agent_execution_time": 13869.583010196686,
"total_input_tokens": 36794175,
"total_output_tokens": 555014,
"total_tokens": 37349189,
"total_turns": 2056,
"avg_agent_execution_time": 165.1141,
"avg_input_tokens": 438025.8929,
"avg_output_tokens": 6607.3095,
"avg_total_tokens": 444633.2024,
"avg_turns": 24.4762,
"per_run_input_tokens": 9198543.75,
"per_run_output_tokens": 138753.5,
"per_run_cost": 29.676934,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.5,
"std": 0.0714
},
"pass@4": 0.6667,
"pass^4": 0.381
},
"gemini-3-pro-low": {
"total_tasks": 21,
"total_agent_execution_time": 11640.001910448074,
"total_input_tokens": 12046379,
"total_output_tokens": 650448,
"total_tokens": 12696827,
"total_turns": 1038,
"avg_agent_execution_time": 138.5715,
"avg_input_tokens": 143409.2738,
"avg_output_tokens": 7743.4286,
"avg_total_tokens": 151152.7024,
"avg_turns": 12.3571,
"per_run_input_tokens": 3011594.75,
"per_run_output_tokens": 162612,
"per_run_cost": 7.974533,
"actual_model_name": "gemini-3-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.7024,
"std": 0.0395
},
"pass@4": 0.9048,
"pass^4": 0.4762
},
"gpt-oss-120b": {
"total_tasks": 21,
"total_agent_execution_time": 1959.240404367447,
"total_input_tokens": 1794062,
"total_output_tokens": 119313,
"total_tokens": 1913375,
"total_turns": 426,
"avg_agent_execution_time": 23.3243,
"avg_input_tokens": 21357.881,
"avg_output_tokens": 1420.3929,
"avg_total_tokens": 22778.2738,
"avg_turns": 5.0714,
"per_run_input_tokens": 448515.5,
"per_run_output_tokens": 29828.25,
"per_run_cost": 0.040645,
"actual_model_name": "gpt-oss-120b",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0714,
"std": 0.0238
},
"pass@4": 0.2381,
"pass^4": 0
},
"grok-4-fast": {
"total_tasks": 21,
"total_agent_execution_time": 6027.847983598709,
"total_input_tokens": 18555081,
"total_output_tokens": 559540,
"total_tokens": 19114621,
"total_turns": 1559,
"avg_agent_execution_time": 71.7601,
"avg_input_tokens": 220893.8214,
"avg_output_tokens": 6661.1905,
"avg_total_tokens": 227555.0119,
"avg_turns": 18.5595,
"per_run_input_tokens": 4638770.25,
"per_run_output_tokens": 139885,
"per_run_cost": 0.997697,
"actual_model_name": "grok-4-fast",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.5238,
"std": 0.0891
},
"pass@4": 0.8095,
"pass^4": 0.2857
},
"o4-mini": {
"total_tasks": 21,
"total_agent_execution_time": 7117.457372426987,
"total_input_tokens": 1337143,
"total_output_tokens": 483676,
"total_tokens": 1820819,
"total_turns": 425,
"avg_agent_execution_time": 84.7316,
"avg_input_tokens": 15918.369,
"avg_output_tokens": 5758.0476,
"avg_total_tokens": 21676.4167,
"avg_turns": 5.0595,
"per_run_input_tokens": 334285.75,
"per_run_output_tokens": 120919,
"per_run_cost": 0.899758,
"actual_model_name": "o4-mini-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.119,
"std": 0.0412
},
"pass@4": 0.1905,
"pass^4": 0.0476
},
"gpt-4-1-mini": {
"total_tasks": 21,
"total_agent_execution_time": 3538.005044221878,
"total_input_tokens": 3917278,
"total_output_tokens": 149972,
"total_tokens": 4067250,
"total_turns": 821,
"avg_agent_execution_time": 42.1191,
"avg_input_tokens": 46634.2619,
"avg_output_tokens": 1785.381,
"avg_total_tokens": 48419.6429,
"avg_turns": 9.7738,
"per_run_input_tokens": 979319.5,
"per_run_output_tokens": 37493,
"per_run_cost": 0.451717,
"actual_model_name": "gpt-4.1-mini-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.0952,
"std": 0.0337
},
"pass@4": 0.1429,
"pass^4": 0.0476
}
},
"experiment_name": "mcpmark-release",
"task_set": "standard"
}