Model Leaderboard
Updated at 2025-09-18 12:47:26
• claude-opus-4-1 was evaluated with a single run only, so it reports pass@1 but no pass@4 or pass^4
MCP Server Analysis
Detailed performance analysis by MCP server with Pass@1/Pass@4/Pass^4 metrics
Note: view the JSON below for detailed grade information.
JSON
{
"generated_at": "2025-09-18T12:47:26.868861",
"k": 4,
"overall": {
"deepseek-chat": {
"total_tasks": 127,
"total_agent_execution_time": 137112.82717490196,
"total_input_tokens": 250467772,
"total_output_tokens": 1425635,
"total_tokens": 251893407,
"total_turns": 9872,
"avg_agent_execution_time": 269.9071,
"avg_input_tokens": 493046.7953,
"avg_output_tokens": 2806.3681,
"avg_total_tokens": 495853.1634,
"avg_turns": 19.4331,
"per_run_input_tokens": 62616943,
"per_run_output_tokens": 356408.75,
"per_run_cost": 35.664255,
"actual_model_name": "deepseek-v3.1-non-think",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1673,
"std": 0.0141
},
"pass@4": 0.2835,
"pass^4": 0.0787
},
"gpt-4-1": {
"total_tasks": 127,
"total_agent_execution_time": 30320.14462161064,
"total_input_tokens": 164083441,
"total_output_tokens": 787047,
"total_tokens": 164870488,
"total_turns": 5052,
"avg_agent_execution_time": 59.6853,
"avg_input_tokens": 322998.8996,
"avg_output_tokens": 1549.3051,
"avg_total_tokens": 324548.2047,
"avg_turns": 9.9449,
"per_run_input_tokens": 41020860.25,
"per_run_output_tokens": 196761.75,
"per_run_cost": 83.615814,
"actual_model_name": "gpt-4.1-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.0807,
"std": 0.0065
},
"pass@4": 0.126,
"pass^4": 0.0315
},
"claude-opus-4-1": {
"total_tasks": 127,
"total_agent_execution_time": 45950.07896590233,
"total_input_tokens": 74430756,
"total_output_tokens": 653210,
"total_tokens": 75083966,
"total_turns": 2214,
"avg_agent_execution_time": 361.8116,
"avg_input_tokens": 586068.9449,
"avg_output_tokens": 5143.3858,
"avg_total_tokens": 591212.3307,
"avg_turns": 17.4331,
"per_run_input_tokens": 74430756,
"per_run_output_tokens": 653210,
"per_run_cost": 1165.45209,
"actual_model_name": "claude-opus-4-1-20250805",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2992,
"std": 0
}
},
"glm-4-5": {
"total_tasks": 127,
"total_agent_execution_time": 84480.56672096252,
"total_input_tokens": 213186013,
"total_output_tokens": 2078679,
"total_tokens": 215264692,
"total_turns": 9214,
"avg_agent_execution_time": 166.3003,
"avg_input_tokens": 419657.5059,
"avg_output_tokens": 4091.8878,
"avg_total_tokens": 423749.3937,
"avg_turns": 18.1378,
"per_run_input_tokens": 53296503.25,
"per_run_output_tokens": 519669.75,
"per_run_cost": 18.27381,
"actual_model_name": "glm-4.5",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1555,
"std": 0.0116
},
"pass@4": 0.2441,
"pass^4": 0.063
},
"o3": {
"total_tasks": 127,
"total_agent_execution_time": 86067.44522094727,
"total_input_tokens": 210426740,
"total_output_tokens": 4365349,
"total_tokens": 214792089,
"total_turns": 8368,
"avg_agent_execution_time": 169.4241,
"avg_input_tokens": 414225.8661,
"avg_output_tokens": 8593.2067,
"avg_total_tokens": 422819.0728,
"avg_turns": 16.4724,
"per_run_input_tokens": 52606685,
"per_run_output_tokens": 1091337.25,
"per_run_cost": 113.944068,
"actual_model_name": "o3-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2539,
"std": 0.0204
},
"pass@4": 0.4331,
"pass^4": 0.126
},
"claude-sonnet-4-low": {
"total_tasks": 127,
"total_agent_execution_time": 101288.96435308456,
"total_input_tokens": 602044071,
"total_output_tokens": 2510844,
"total_tokens": 604554915,
"total_turns": 9867,
"avg_agent_execution_time": 199.3877,
"avg_input_tokens": 1185126.124,
"avg_output_tokens": 4942.6063,
"avg_total_tokens": 1190068.7303,
"avg_turns": 19.4232,
"per_run_input_tokens": 150511017.75,
"per_run_output_tokens": 627711,
"per_run_cost": 460.948718,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2736,
"std": 0.017
},
"pass@4": 0.3937,
"pass^4": 0.1811
},
"gemini-2-5-pro": {
"total_tasks": 127,
"total_agent_execution_time": 60629.834964990616,
"total_input_tokens": 238581228,
"total_output_tokens": 3564083,
"total_tokens": 242145311,
"total_turns": 5561,
"avg_agent_execution_time": 119.3501,
"avg_input_tokens": 469648.0866,
"avg_output_tokens": 7015.9114,
"avg_total_tokens": 476663.998,
"avg_turns": 10.9469,
"per_run_input_tokens": 59645307,
"per_run_output_tokens": 891020.75,
"per_run_cost": 162.478579,
"actual_model_name": "gemini-2.5-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1575,
"std": 0.0056
},
"pass@4": 0.2992,
"pass^4": 0.0472
},
"qwen-3-coder-plus": {
"total_tasks": 127,
"total_agent_execution_time": 139339.28879857063,
"total_input_tokens": 722104724,
"total_output_tokens": 1784161,
"total_tokens": 723888885,
"total_turns": 12063,
"avg_agent_execution_time": 274.2899,
"avg_input_tokens": 1421465.9921,
"avg_output_tokens": 3512.128,
"avg_total_tokens": 1424978.1201,
"avg_turns": 23.7461,
"per_run_input_tokens": 180526181,
"per_run_output_tokens": 446040.25,
"per_run_cost": 36.462068,
"actual_model_name": "qwen3-coder-480b-a35b-instruct",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.248,
"std": 0.0205
},
"pass@4": 0.4094,
"pass^4": 0.126
},
"gpt-5-mini-high": {
"total_tasks": 127,
"total_agent_execution_time": 177766.2423055172,
"total_input_tokens": 528738725,
"total_output_tokens": 14617006,
"total_tokens": 543355731,
"total_turns": 9743,
"avg_agent_execution_time": 349.9335,
"avg_input_tokens": 1040824.2618,
"avg_output_tokens": 28773.6339,
"avg_total_tokens": 1069597.8957,
"avg_turns": 19.1791,
"per_run_input_tokens": 132184681.25,
"per_run_output_tokens": 3654251.5,
"per_run_cost": 40.354673,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.3032,
"std": 0.0172
},
"pass@4": 0.4646,
"pass^4": 0.1654
},
"gpt-5-nano-low": {
"total_tasks": 127,
"total_agent_execution_time": 48947.8431391716,
"total_input_tokens": 148338105,
"total_output_tokens": 6497039,
"total_tokens": 154835144,
"total_turns": 6102,
"avg_agent_execution_time": 96.354,
"avg_input_tokens": 292004.1437,
"avg_output_tokens": 12789.4469,
"avg_total_tokens": 304793.5906,
"avg_turns": 12.0118,
"per_run_input_tokens": 37084526.25,
"per_run_output_tokens": 1624259.75,
"per_run_cost": 2.50393,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0433,
"std": 0.0118
},
"pass@4": 0.1024,
"pass^4": 0.0079
},
"gpt-5-medium": {
"total_tasks": 127,
"total_agent_execution_time": 242931.40538144112,
"total_input_tokens": 318853695,
"total_output_tokens": 11128874,
"total_tokens": 329982569,
"total_turns": 7475,
"avg_agent_execution_time": 478.2114,
"avg_input_tokens": 627664.7539,
"avg_output_tokens": 21907.2323,
"avg_total_tokens": 649571.9862,
"avg_turns": 14.7146,
"per_run_input_tokens": 79713423.75,
"per_run_output_tokens": 2782218.5,
"per_run_cost": 127.463965,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.5256,
"std": 0.0129
},
"pass@4": 0.685,
"pass^4": 0.3386
},
"claude-sonnet-4": {
"total_tasks": 127,
"total_agent_execution_time": 110881.38604903221,
"total_input_tokens": 324800674,
"total_output_tokens": 2349295,
"total_tokens": 327149969,
"total_turns": 9388,
"avg_agent_execution_time": 218.2704,
"avg_input_tokens": 639371.4055,
"avg_output_tokens": 4624.5965,
"avg_total_tokens": 643996.002,
"avg_turns": 18.4803,
"per_run_input_tokens": 81200168.5,
"per_run_output_tokens": 587323.75,
"per_run_cost": 252.410362,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2815,
"std": 0.0257
},
"pass@4": 0.4488,
"pass^4": 0.126
},
"gpt-5-low": {
"total_tasks": 127,
"total_agent_execution_time": 195995.03595471382,
"total_input_tokens": 325914298,
"total_output_tokens": 9607215,
"total_tokens": 335521513,
"total_turns": 7211,
"avg_agent_execution_time": 385.817,
"avg_input_tokens": 641563.5787,
"avg_output_tokens": 18911.8406,
"avg_total_tokens": 660475.4193,
"avg_turns": 14.1949,
"per_run_input_tokens": 81478574.5,
"per_run_output_tokens": 2401803.75,
"per_run_cost": 125.866256,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.4685,
"std": 0.0287
},
"pass@4": 0.6299,
"pass^4": 0.2677
},
"gpt-5-mini-medium": {
"total_tasks": 127,
"total_agent_execution_time": 81238.54882597923,
"total_input_tokens": 374509740,
"total_output_tokens": 5235729,
"total_tokens": 379745469,
"total_turns": 7961,
"avg_agent_execution_time": 159.9184,
"avg_input_tokens": 737223.8976,
"avg_output_tokens": 10306.5531,
"avg_total_tokens": 747530.4508,
"avg_turns": 15.6713,
"per_run_input_tokens": 93627435,
"per_run_output_tokens": 1308932.25,
"per_run_cost": 26.024723,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2736,
"std": 0.0312
},
"pass@4": 0.4567,
"pass^4": 0.0945
},
"gpt-4-1-nano": {
"total_tasks": 127,
"total_agent_execution_time": 19860.039894342422,
"total_input_tokens": 98229466,
"total_output_tokens": 831039,
"total_tokens": 99060505,
"total_turns": 5477,
"avg_agent_execution_time": 39.0946,
"avg_input_tokens": 193365.0906,
"avg_output_tokens": 1635.9035,
"avg_total_tokens": 195000.9941,
"avg_turns": 10.7815,
"per_run_input_tokens": 24557366.5,
"per_run_output_tokens": 207759.75,
"per_run_cost": 2.538841,
"actual_model_name": "gpt-4.1-nano-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0
},
"grok-code-fast-1": {
"total_tasks": 127,
"total_agent_execution_time": 79566.39907479286,
"total_input_tokens": 299975054,
"total_output_tokens": 2870082,
"total_tokens": 303880002,
"total_turns": 9356,
"avg_agent_execution_time": 156.6268,
"avg_input_tokens": 590502.0748,
"avg_output_tokens": 5649.7677,
"avg_total_tokens": 598188.9803,
"avg_turns": 18.4173,
"per_run_input_tokens": 74993763.5,
"per_run_output_tokens": 717520.5,
"per_run_cost": 16.075033,
"actual_model_name": "grok-code-fast-1",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2047,
"std": 0.0339
},
"pass@4": 0.3071,
"pass^4": 0.0945
},
"gemini-2-5-flash": {
"total_tasks": 127,
"total_agent_execution_time": 58353.47530055046,
"total_input_tokens": 520239496,
"total_output_tokens": 4469819,
"total_tokens": 524709348,
"total_turns": 5795,
"avg_agent_execution_time": 114.869,
"avg_input_tokens": 1024093.4961,
"avg_output_tokens": 8798.8563,
"avg_total_tokens": 1032892.4173,
"avg_turns": 11.4075,
"per_run_input_tokens": 130059874,
"per_run_output_tokens": 1117454.75,
"per_run_cost": 41.811599,
"actual_model_name": "gemini-2.5-flash",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0906,
"std": 0.0068
},
"pass@4": 0.1811,
"pass^4": 0.0394
},
"gpt-5-nano-high": {
"total_tasks": 127,
"total_agent_execution_time": 157217.0769355297,
"total_input_tokens": 411756625,
"total_output_tokens": 26452351,
"total_tokens": 438208976,
"total_turns": 10031,
"avg_agent_execution_time": 309.4824,
"avg_input_tokens": 810544.5374,
"avg_output_tokens": 52071.5571,
"avg_total_tokens": 862616.0945,
"avg_turns": 19.7461,
"per_run_input_tokens": 102939156.25,
"per_run_output_tokens": 6613087.75,
"per_run_cost": 7.792193,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0512,
"std": 0.0205
},
"pass@4": 0.1417,
"pass^4": 0
},
"gpt-5-mini-low": {
"total_tasks": 127,
"total_agent_execution_time": 32084.921304941177,
"total_input_tokens": 112819524,
"total_output_tokens": 1609354,
"total_tokens": 114428878,
"total_turns": 4226,
"avg_agent_execution_time": 63.1593,
"avg_input_tokens": 222085.6772,
"avg_output_tokens": 3168.0197,
"avg_total_tokens": 225253.6969,
"avg_turns": 8.3189,
"per_run_input_tokens": 28204881,
"per_run_output_tokens": 402338.5,
"per_run_cost": 7.855897,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0827,
"std": 0.0131
},
"pass@4": 0.189,
"pass^4": 0.0079
},
"qwen-3-max": {
"total_tasks": 127,
"total_agent_execution_time": 108499.87574696541,
"total_input_tokens": 525760160,
"total_output_tokens": 1516865,
"total_tokens": 527277025,
"total_turns": 12114,
"avg_agent_execution_time": 213.5824,
"avg_input_tokens": 1034960.9449,
"avg_output_tokens": 2985.9547,
"avg_total_tokens": 1037946.8996,
"avg_turns": 23.8465,
"per_run_input_tokens": 131440040,
"per_run_output_tokens": 379216.25,
"per_run_cost": 160.003345,
"actual_model_name": "qwen3-max-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1772,
"std": 0.0131
},
"pass@4": 0.2283,
"pass^4": 0.1102
},
"kimi-k2-0711": {
"total_tasks": 127,
"total_agent_execution_time": 109108.96008372307,
"total_input_tokens": 236755502,
"total_output_tokens": 1502196,
"total_tokens": 238257698,
"total_turns": 9504,
"avg_agent_execution_time": 214.7814,
"avg_input_tokens": 466054.1378,
"avg_output_tokens": 2957.0787,
"avg_total_tokens": 469011.2165,
"avg_turns": 18.7087,
"per_run_input_tokens": 59188875.5,
"per_run_output_tokens": 375549,
"per_run_cost": 36.452198,
"actual_model_name": "kimi-k2-0711-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1909,
"std": 0.0161
},
"pass@4": 0.315,
"pass^4": 0.1181
},
"gpt-5-high": {
"total_tasks": 127,
"total_agent_execution_time": 522861.792570591,
"total_input_tokens": 343123887,
"total_output_tokens": 18664477,
"total_tokens": 361788364,
"total_turns": 7952,
"avg_agent_execution_time": 1029.2555,
"avg_input_tokens": 675440.7224,
"avg_output_tokens": 36741.0965,
"avg_total_tokens": 712181.8189,
"avg_turns": 15.6535,
"per_run_input_tokens": 85780971.75,
"per_run_output_tokens": 4666119.25,
"per_run_cost": 153.887407,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.5157,
"std": 0.0252
},
"pass@4": 0.6614,
"pass^4": 0.3307
},
"kimi-k2-0905": {
"total_tasks": 127,
"total_agent_execution_time": 250829.567029953,
"total_input_tokens": 473201616,
"total_output_tokens": 2545700,
"total_tokens": 475747316,
"total_turns": 13691,
"avg_agent_execution_time": 493.759,
"avg_input_tokens": 931499.2441,
"avg_output_tokens": 5011.2205,
"avg_total_tokens": 936510.4646,
"avg_turns": 26.9508,
"per_run_input_tokens": 118300404,
"per_run_output_tokens": 636425,
"per_run_cost": 72.571305,
"actual_model_name": "kimi-k2-0905-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2185,
"std": 0.0116
},
"pass@4": 0.315,
"pass^4": 0.126
},
"gpt-5-nano-medium": {
"total_tasks": 127,
"total_agent_execution_time": 79832.2317507267,
"total_input_tokens": 227578931,
"total_output_tokens": 12104517,
"total_tokens": 239683448,
"total_turns": 7366,
"avg_agent_execution_time": 157.1501,
"avg_input_tokens": 447990.0217,
"avg_output_tokens": 23827.7894,
"avg_total_tokens": 471817.811,
"avg_turns": 14.5,
"per_run_input_tokens": 56894732.75,
"per_run_output_tokens": 3026129.25,
"per_run_cost": 4.055188,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.063,
"std": 0.0201
},
"pass@4": 0.1181,
"pass^4": 0.0157
},
"grok-4": {
"total_tasks": 127,
"total_agent_execution_time": 162467.43107247353,
"total_input_tokens": 321821722,
"total_output_tokens": 4277197,
"total_tokens": 326544746,
"total_turns": 8254,
"avg_agent_execution_time": 319.8178,
"avg_input_tokens": 633507.3268,
"avg_output_tokens": 8419.6791,
"avg_total_tokens": 642804.6181,
"avg_turns": 16.248,
"per_run_input_tokens": 80455430.5,
"per_run_output_tokens": 1069299.25,
"per_run_cost": 257.40578,
"actual_model_name": "grok-4-0709",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.3169,
"std": 0.0291
},
"pass@4": 0.4488,
"pass^4": 0.1811
},
"claude-sonnet-4-high": {
"total_tasks": 127,
"total_agent_execution_time": 94260.38231873512,
"total_input_tokens": 576975325,
"total_output_tokens": 2558316,
"total_tokens": 579533641,
"total_turns": 9884,
"avg_agent_execution_time": 185.5519,
"avg_input_tokens": 1135778.1988,
"avg_output_tokens": 5036.0551,
"avg_total_tokens": 1140814.2539,
"avg_turns": 19.4567,
"per_run_input_tokens": 144243831.25,
"per_run_output_tokens": 639579,
"per_run_cost": 442.325179,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2835,
"std": 0.0236
},
"pass@4": 0.4094,
"pass^4": 0.1811
},
"gpt-oss-120b": {
"total_tasks": 127,
"total_agent_execution_time": 13901.512543916702,
"total_input_tokens": 32768093,
"total_output_tokens": 697653,
"total_tokens": 33465746,
"total_turns": 2741,
"avg_agent_execution_time": 27.3652,
"avg_input_tokens": 64504.1201,
"avg_output_tokens": 1373.3327,
"avg_total_tokens": 65877.4528,
"avg_turns": 5.3957,
"per_run_input_tokens": 8192023.25,
"per_run_output_tokens": 174413.25,
"per_run_cost": 0.638661,
"actual_model_name": "gpt-oss-120b",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0472,
"std": 0.0096
},
"pass@4": 0.1339,
"pass^4": 0
},
"o4-mini": {
"total_tasks": 127,
"total_agent_execution_time": 164245.99844169617,
"total_input_tokens": 199695107,
"total_output_tokens": 7910153,
"total_tokens": 207605260,
"total_turns": 7418,
"avg_agent_execution_time": 323.3189,
"avg_input_tokens": 393100.6043,
"avg_output_tokens": 15571.1673,
"avg_total_tokens": 408671.7717,
"avg_turns": 14.6024,
"per_run_input_tokens": 49923776.75,
"per_run_output_tokens": 1977538.25,
"per_run_cost": 63.617323,
"actual_model_name": "o4-mini-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1732,
"std": 0.023
},
"pass@4": 0.315,
"pass^4": 0.063
},
"gpt-4-1-mini": {
"total_tasks": 127,
"total_agent_execution_time": 43548.9137635231,
"total_input_tokens": 595733455,
"total_output_tokens": 964338,
"total_tokens": 596697793,
"total_turns": 8326,
"avg_agent_execution_time": 85.7262,
"avg_input_tokens": 1172703.6516,
"avg_output_tokens": 1898.3031,
"avg_total_tokens": 1174601.9547,
"avg_turns": 16.3898,
"per_run_input_tokens": 148933363.75,
"per_run_output_tokens": 241084.5,
"per_run_cost": 59.959081,
"actual_model_name": "gpt-4.1-mini-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.0394,
"std": 0.0096
},
"pass@4": 0.0709,
"pass^4": 0.0157
}
},
"filesystem": {
"deepseek-chat": {
"total_tasks": 30,
"total_agent_execution_time": 33809.433807849884,
"total_input_tokens": 50559981,
"total_output_tokens": 406171,
"total_tokens": 50966152,
"total_turns": 2860,
"avg_agent_execution_time": 281.7453,
"avg_input_tokens": 421333.175,
"avg_output_tokens": 3384.7583,
"avg_total_tokens": 424717.9333,
"avg_turns": 23.8333,
"per_run_input_tokens": 12639995.25,
"per_run_output_tokens": 101542.75,
"per_run_cost": 7.248989,
"actual_model_name": "deepseek-v3.1-non-think",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1583,
"std": 0.0144
},
"pass@4": 0.2667,
"pass^4": 0.0667
},
"gpt-4-1": {
"total_tasks": 30,
"total_agent_execution_time": 4918.759680747986,
"total_input_tokens": 17274493,
"total_output_tokens": 217587,
"total_tokens": 17492080,
"total_turns": 1114,
"avg_agent_execution_time": 40.9897,
"avg_input_tokens": 143954.1083,
"avg_output_tokens": 1813.225,
"avg_total_tokens": 145767.3333,
"avg_turns": 9.2833,
"per_run_input_tokens": 4318623.25,
"per_run_output_tokens": 54396.75,
"per_run_cost": 9.07242,
"actual_model_name": "gpt-4.1-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.125,
"std": 0.0144
},
"pass@4": 0.2,
"pass^4": 0.0333
},
"claude-opus-4-1": {
"total_tasks": 30,
"total_agent_execution_time": 8033.295060634613,
"total_input_tokens": 8164979,
"total_output_tokens": 130994,
"total_tokens": 8295973,
"total_turns": 491,
"avg_agent_execution_time": 267.7765,
"avg_input_tokens": 272165.9667,
"avg_output_tokens": 4366.4667,
"avg_total_tokens": 276532.4333,
"avg_turns": 16.3667,
"per_run_input_tokens": 8164979,
"per_run_output_tokens": 130994,
"per_run_cost": 132.299235,
"actual_model_name": "claude-opus-4-1-20250805",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.3333,
"std": 0
}
},
"glm-4-5": {
"total_tasks": 30,
"total_agent_execution_time": 15624.920702934265,
"total_input_tokens": 23273606,
"total_output_tokens": 470863,
"total_tokens": 23744469,
"total_turns": 1967,
"avg_agent_execution_time": 130.2077,
"avg_input_tokens": 193946.7167,
"avg_output_tokens": 3923.8583,
"avg_total_tokens": 197870.575,
"avg_turns": 16.3917,
"per_run_input_tokens": 5818401.5,
"per_run_output_tokens": 117715.75,
"per_run_cost": 2.075457,
"actual_model_name": "glm-4.5",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.075,
"std": 0.0144
},
"pass@4": 0.1333,
"pass^4": 0.0333
},
"o3": {
"total_tasks": 30,
"total_agent_execution_time": 33353.133106946945,
"total_input_tokens": 82757119,
"total_output_tokens": 2135261,
"total_tokens": 84892380,
"total_turns": 3455,
"avg_agent_execution_time": 277.9428,
"avg_input_tokens": 689642.6583,
"avg_output_tokens": 17793.8417,
"avg_total_tokens": 707436.5,
"avg_turns": 28.7917,
"per_run_input_tokens": 20689279.75,
"per_run_output_tokens": 533815.25,
"per_run_cost": 45.649082,
"actual_model_name": "o3-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.3583,
"std": 0.0276
},
"pass@4": 0.5,
"pass^4": 0.2667
},
"claude-sonnet-4-low": {
"total_tasks": 30,
"total_agent_execution_time": 21124.522393226624,
"total_input_tokens": 124738934,
"total_output_tokens": 646826,
"total_tokens": 125385760,
"total_turns": 2247,
"avg_agent_execution_time": 176.0377,
"avg_input_tokens": 1039491.1167,
"avg_output_tokens": 5390.2167,
"avg_total_tokens": 1044881.3333,
"avg_turns": 18.725,
"per_run_input_tokens": 31184733.5,
"per_run_output_tokens": 161706.5,
"per_run_cost": 95.979798,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2333,
"std": 0.0471
},
"pass@4": 0.3667,
"pass^4": 0.1333
},
"gemini-2-5-pro": {
"total_tasks": 30,
"total_agent_execution_time": 15134.337651252747,
"total_input_tokens": 25795900,
"total_output_tokens": 929436,
"total_tokens": 26725336,
"total_turns": 1722,
"avg_agent_execution_time": 126.1195,
"avg_input_tokens": 214965.8333,
"avg_output_tokens": 7745.3,
"avg_total_tokens": 222711.1333,
"avg_turns": 14.35,
"per_run_input_tokens": 6448975,
"per_run_output_tokens": 232359,
"per_run_cost": 19.607823,
"actual_model_name": "gemini-2.5-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2417,
"std": 0.0363
},
"pass@4": 0.4333,
"pass^4": 0.1
},
"qwen-3-coder-plus": {
"total_tasks": 30,
"total_agent_execution_time": 18861.470023155212,
"total_input_tokens": 116688838,
"total_output_tokens": 497514,
"total_tokens": 117186352,
"total_turns": 3387,
"avg_agent_execution_time": 157.1789,
"avg_input_tokens": 972406.9833,
"avg_output_tokens": 4145.95,
"avg_total_tokens": 976552.9333,
"avg_turns": 28.225,
"per_run_input_tokens": 29172209.5,
"per_run_output_tokens": 124378.5,
"per_run_cost": 5.933945,
"actual_model_name": "qwen3-coder-480b-a35b-instruct",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1333,
"std": 0.0667
},
"pass@4": 0.2667,
"pass^4": 0.0333
},
"gpt-5-mini-high": {
"total_tasks": 30,
"total_agent_execution_time": 34673.4765253067,
"total_input_tokens": 64240780,
"total_output_tokens": 3091722,
"total_tokens": 67332502,
"total_turns": 2210,
"avg_agent_execution_time": 288.9456,
"avg_input_tokens": 535339.8333,
"avg_output_tokens": 25764.35,
"avg_total_tokens": 561104.1833,
"avg_turns": 18.4167,
"per_run_input_tokens": 16060195,
"per_run_output_tokens": 772930.5,
"per_run_cost": 5.56091,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.35,
"std": 0.0764
},
"pass@4": 0.4667,
"pass^4": 0.2333
},
"gpt-5-nano-low": {
"total_tasks": 30,
"total_agent_execution_time": 15497.93159866333,
"total_input_tokens": 77157814,
"total_output_tokens": 2280787,
"total_tokens": 79438601,
"total_turns": 2755,
"avg_agent_execution_time": 129.1494,
"avg_input_tokens": 642981.7833,
"avg_output_tokens": 19006.5583,
"avg_total_tokens": 661988.3417,
"avg_turns": 22.9583,
"per_run_input_tokens": 19289453.5,
"per_run_output_tokens": 570196.75,
"per_run_cost": 1.192551,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.125,
"std": 0.0363
},
"pass@4": 0.3,
"pass^4": 0.0333
},
"gpt-5-medium": {
"total_tasks": 30,
"total_agent_execution_time": 37567.695655584335,
"total_input_tokens": 25915698,
"total_output_tokens": 2084979,
"total_tokens": 28000677,
"total_turns": 1207,
"avg_agent_execution_time": 313.0641,
"avg_input_tokens": 215964.15,
"avg_output_tokens": 17374.825,
"avg_total_tokens": 233338.975,
"avg_turns": 10.0583,
"per_run_input_tokens": 6478924.5,
"per_run_output_tokens": 521244.75,
"per_run_cost": 13.311103,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.575,
"std": 0.0363
},
"pass@4": 0.7667,
"pass^4": 0.3667
},
"claude-sonnet-4": {
"total_tasks": 30,
"total_agent_execution_time": 23177.59760403633,
"total_input_tokens": 36265545,
"total_output_tokens": 479856,
"total_tokens": 36745401,
"total_turns": 1922,
"avg_agent_execution_time": 193.1466,
"avg_input_tokens": 302212.875,
"avg_output_tokens": 3998.8,
"avg_total_tokens": 306211.675,
"avg_turns": 16.0167,
"per_run_input_tokens": 9066386.25,
"per_run_output_tokens": 119964,
"per_run_cost": 28.998619,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.275,
"std": 0.0276
},
"pass@4": 0.5,
"pass^4": 0.0667
},
"gpt-5-low": {
"total_tasks": 30,
"total_agent_execution_time": 33070.53626990318,
"total_input_tokens": 34189054,
"total_output_tokens": 1920255,
"total_tokens": 36109309,
"total_turns": 1295,
"avg_agent_execution_time": 275.5878,
"avg_input_tokens": 284908.7833,
"avg_output_tokens": 16002.125,
"avg_total_tokens": 300910.9083,
"avg_turns": 10.7917,
"per_run_input_tokens": 8547263.5,
"per_run_output_tokens": 480063.75,
"per_run_cost": 15.484717,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.5417,
"std": 0.0682
},
"pass@4": 0.7333,
"pass^4": 0.3333
},
"gpt-5-mini-medium": {
"total_tasks": 30,
"total_agent_execution_time": 20912.210697889328,
"total_input_tokens": 47800838,
"total_output_tokens": 1509938,
"total_tokens": 49310776,
"total_turns": 1781,
"avg_agent_execution_time": 174.2684,
"avg_input_tokens": 398340.3167,
"avg_output_tokens": 12582.8167,
"avg_total_tokens": 410923.1333,
"avg_turns": 14.8417,
"per_run_input_tokens": 11950209.5,
"per_run_output_tokens": 377484.5,
"per_run_cost": 3.742521,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.3333,
"std": 0.0624
},
"pass@4": 0.5333,
"pass^4": 0.1
},
"gpt-4-1-nano": {
"total_tasks": 30,
"total_agent_execution_time": 3375.341311454773,
"total_input_tokens": 14038124,
"total_output_tokens": 158107,
"total_tokens": 14196231,
"total_turns": 1460,
"avg_agent_execution_time": 28.1278,
"avg_input_tokens": 116984.3667,
"avg_output_tokens": 1317.5583,
"avg_total_tokens": 118301.925,
"avg_turns": 12.1667,
"per_run_input_tokens": 3509531,
"per_run_output_tokens": 39526.75,
"per_run_cost": 0.366764,
"actual_model_name": "gpt-4.1-nano-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0
},
"grok-code-fast-1": {
"total_tasks": 30,
"total_agent_execution_time": 9060.72327876091,
"total_input_tokens": 33168391,
"total_output_tokens": 283156,
"total_tokens": 34075918,
"total_turns": 1966,
"avg_agent_execution_time": 75.506,
"avg_input_tokens": 276403.2583,
"avg_output_tokens": 2359.6333,
"avg_total_tokens": 283965.9833,
"avg_turns": 16.3833,
"per_run_input_tokens": 8292097.75,
"per_run_output_tokens": 70789,
"per_run_cost": 1.764603,
"actual_model_name": "grok-code-fast-1",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2333,
"std": 0.0745
},
"pass@4": 0.4,
"pass^4": 0.1
},
"gemini-2-5-flash": {
"total_tasks": 30,
"total_agent_execution_time": 7456.854316711426,
"total_input_tokens": 8117152,
"total_output_tokens": 908300,
"total_tokens": 9025452,
"total_turns": 780,
"avg_agent_execution_time": 62.1405,
"avg_input_tokens": 67642.9333,
"avg_output_tokens": 7569.1667,
"avg_total_tokens": 75212.1,
"avg_turns": 6.5,
"per_run_input_tokens": 2029288,
"per_run_output_tokens": 227075,
"per_run_cost": 1.176474,
"actual_model_name": "gemini-2.5-flash",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0833,
"std": 0.0167
},
"pass@4": 0.1333,
"pass^4": 0.0667
},
"gpt-5-nano-high": {
"total_tasks": 30,
"total_agent_execution_time": 24791.744941711426,
"total_input_tokens": 84154565,
"total_output_tokens": 4132124,
"total_tokens": 88286689,
"total_turns": 3235,
"avg_agent_execution_time": 206.5979,
"avg_input_tokens": 701288.0417,
"avg_output_tokens": 34434.3667,
"avg_total_tokens": 735722.4083,
"avg_turns": 26.9583,
"per_run_input_tokens": 21038641.25,
"per_run_output_tokens": 1033031,
"per_run_cost": 1.465144,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0583,
"std": 0.0493
},
"pass@4": 0.1667,
"pass^4": 0
},
"gpt-5-mini-low": {
"total_tasks": 30,
"total_agent_execution_time": 8122.387328386307,
"total_input_tokens": 14746988,
"total_output_tokens": 420637,
"total_tokens": 15167625,
"total_turns": 965,
"avg_agent_execution_time": 67.6866,
"avg_input_tokens": 122891.5667,
"avg_output_tokens": 3505.3083,
"avg_total_tokens": 126396.875,
"avg_turns": 8.0417,
"per_run_input_tokens": 3686747,
"per_run_output_tokens": 105159.25,
"per_run_cost": 1.132005,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.125,
"std": 0.0493
},
"pass@4": 0.3333,
"pass^4": 0.0333
},
"qwen-3-max": {
"total_tasks": 30,
"total_agent_execution_time": 16070.112683296204,
"total_input_tokens": 46747185,
"total_output_tokens": 344106,
"total_tokens": 47091291,
"total_turns": 2313,
"avg_agent_execution_time": 133.9176,
"avg_input_tokens": 389559.875,
"avg_output_tokens": 2867.55,
"avg_total_tokens": 392427.425,
"avg_turns": 19.275,
"per_run_input_tokens": 11686796.25,
"per_run_output_tokens": 86026.5,
"per_run_cost": 14.540315,
"actual_model_name": "qwen3-max-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1083,
"std": 0.0144
},
"pass@4": 0.1333,
"pass^4": 0.1
},
"kimi-k2-0711": {
"total_tasks": 30,
"total_agent_execution_time": 26702.112154960632,
"total_input_tokens": 44744753,
"total_output_tokens": 439086,
"total_tokens": 45183839,
"total_turns": 2452,
"avg_agent_execution_time": 222.5176,
"avg_input_tokens": 372872.9417,
"avg_output_tokens": 3659.05,
"avg_total_tokens": 376531.9917,
"avg_turns": 20.4333,
"per_run_input_tokens": 11186188.25,
"per_run_output_tokens": 109771.5,
"per_run_cost": 6.986142,
"actual_model_name": "kimi-k2-0711-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2,
"std": 0.0236
},
"pass@4": 0.3,
"pass^4": 0.1333
},
"gpt-5-high": {
"total_tasks": 30,
"total_agent_execution_time": 99365.79502940178,
"total_input_tokens": 31734063,
"total_output_tokens": 3026203,
"total_tokens": 34760266,
"total_turns": 1206,
"avg_agent_execution_time": 828.0483,
"avg_input_tokens": 264450.525,
"avg_output_tokens": 25218.3583,
"avg_total_tokens": 289668.8833,
"avg_turns": 10.05,
"per_run_input_tokens": 7933515.75,
"per_run_output_tokens": 756550.75,
"per_run_cost": 17.482402,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.525,
"std": 0.0363
},
"pass@4": 0.7,
"pass^4": 0.3667
},
"kimi-k2-0905": {
"total_tasks": 30,
"total_agent_execution_time": 45185.467045784,
"total_input_tokens": 83615017,
"total_output_tokens": 536182,
"total_tokens": 84151199,
"total_turns": 3152,
"avg_agent_execution_time": 376.5456,
"avg_input_tokens": 696791.8083,
"avg_output_tokens": 4468.1833,
"avg_total_tokens": 701259.9917,
"avg_turns": 26.2667,
"per_run_input_tokens": 20903754.25,
"per_run_output_tokens": 134045.5,
"per_run_cost": 12.877366,
"actual_model_name": "kimi-k2-0905-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1417,
"std": 0.0144
},
"pass@4": 0.2333,
"pass^4": 0.0667
},
"gpt-5-nano-medium": {
"total_tasks": 30,
"total_agent_execution_time": 15545.906517505646,
"total_input_tokens": 55529317,
"total_output_tokens": 2343137,
"total_tokens": 57872454,
"total_turns": 2490,
"avg_agent_execution_time": 129.5492,
"avg_input_tokens": 462744.3083,
"avg_output_tokens": 19526.1417,
"avg_total_tokens": 482270.45,
"avg_turns": 20.75,
"per_run_input_tokens": 13882329.25,
"per_run_output_tokens": 585784.25,
"per_run_cost": 0.92843,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0667,
"std": 0.0527
},
"pass@4": 0.1667,
"pass^4": 0
},
"grok-4": {
"total_tasks": 30,
"total_agent_execution_time": 30793.946169614792,
"total_input_tokens": 29679677,
"total_output_tokens": 1284375,
"total_tokens": 30964052,
"total_turns": 1296,
"avg_agent_execution_time": 256.6162,
"avg_input_tokens": 247330.6417,
"avg_output_tokens": 10703.125,
"avg_total_tokens": 258033.7667,
"avg_turns": 10.8,
"per_run_input_tokens": 7419919.25,
"per_run_output_tokens": 321093.75,
"per_run_cost": 27.076164,
"actual_model_name": "grok-4-0709",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.5083,
"std": 0.064
},
"pass@4": 0.7333,
"pass^4": 0.2667
},
"claude-sonnet-4-high": {
"total_tasks": 30,
"total_agent_execution_time": 17209.377204418182,
"total_input_tokens": 106723679,
"total_output_tokens": 670070,
"total_tokens": 107393749,
"total_turns": 2248,
"avg_agent_execution_time": 143.4115,
"avg_input_tokens": 889363.9917,
"avg_output_tokens": 5583.9167,
"avg_total_tokens": 894947.9083,
"avg_turns": 18.7333,
"per_run_input_tokens": 26680919.75,
"per_run_output_tokens": 167517.5,
"per_run_cost": 82.555522,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2333,
"std": 0.0408
},
"pass@4": 0.3667,
"pass^4": 0.1
},
"gpt-oss-120b": {
"total_tasks": 30,
"total_agent_execution_time": 2195.19043135643,
"total_input_tokens": 2370211,
"total_output_tokens": 129682,
"total_tokens": 2499893,
"total_turns": 554,
"avg_agent_execution_time": 18.2933,
"avg_input_tokens": 19751.7583,
"avg_output_tokens": 1080.6833,
"avg_total_tokens": 20832.4417,
"avg_turns": 4.6167,
"per_run_input_tokens": 592552.75,
"per_run_output_tokens": 32420.5,
"per_run_cost": 0.051742,
"actual_model_name": "gpt-oss-120b",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0583,
"std": 0.0433
},
"pass@4": 0.1667,
"pass^4": 0
},
"o4-mini": {
"total_tasks": 30,
"total_agent_execution_time": 31632.499047517776,
"total_input_tokens": 35200644,
"total_output_tokens": 1906496,
"total_tokens": 37107140,
"total_turns": 2506,
"avg_agent_execution_time": 263.6042,
"avg_input_tokens": 293338.7,
"avg_output_tokens": 15887.4667,
"avg_total_tokens": 309226.1667,
"avg_turns": 20.8833,
"per_run_input_tokens": 8800161,
"per_run_output_tokens": 476624,
"per_run_cost": 11.777323,
"actual_model_name": "o4-mini-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.25,
"std": 0.0289
},
"pass@4": 0.3667,
"pass^4": 0.1333
},
"gpt-4-1-mini": {
"total_tasks": 30,
"total_agent_execution_time": 6175.697661399841,
"total_input_tokens": 23538288,
"total_output_tokens": 195484,
"total_tokens": 23733772,
"total_turns": 1860,
"avg_agent_execution_time": 51.4641,
"avg_input_tokens": 196152.4,
"avg_output_tokens": 1629.0333,
"avg_total_tokens": 197781.4333,
"avg_turns": 15.5,
"per_run_input_tokens": 5884572,
"per_run_output_tokens": 48871,
"per_run_cost": 2.432022,
"actual_model_name": "gpt-4.1-mini-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.0333,
"std": 0
},
"pass@4": 0.0333,
"pass^4": 0.0333
}
},
"github": {
"deepseek-chat": {
"total_tasks": 23,
"total_agent_execution_time": 17843.541527748108,
"total_input_tokens": 33336977,
"total_output_tokens": 206404,
"total_tokens": 33543381,
"total_turns": 870,
"avg_agent_execution_time": 193.9515,
"avg_input_tokens": 362358.4457,
"avg_output_tokens": 2243.5217,
"avg_total_tokens": 364601.9674,
"avg_turns": 9.4565,
"per_run_input_tokens": 8334244.25,
"per_run_output_tokens": 51601,
"per_run_cost": 4.753866,
"actual_model_name": "deepseek-v3.1-non-think",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.0978,
"std": 0.0188
},
"pass@4": 0.1304,
"pass^4": 0.087
},
"gpt-4-1": {
"total_tasks": 23,
"total_agent_execution_time": 8299.499776601791,
"total_input_tokens": 41020732,
"total_output_tokens": 228765,
"total_tokens": 41249497,
"total_turns": 915,
"avg_agent_execution_time": 90.212,
"avg_input_tokens": 445877.5217,
"avg_output_tokens": 2486.5761,
"avg_total_tokens": 448364.0978,
"avg_turns": 9.9457,
"per_run_input_tokens": 10255183,
"per_run_output_tokens": 57191.25,
"per_run_cost": 20.967896,
"actual_model_name": "gpt-4.1-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.0761,
"std": 0.0188
},
"pass@4": 0.087,
"pass^4": 0.0435
},
"claude-opus-4-1": {
"total_tasks": 23,
"total_agent_execution_time": 8974.225908994675,
"total_input_tokens": 14274491,
"total_output_tokens": 134233,
"total_tokens": 14408724,
"total_turns": 248,
"avg_agent_execution_time": 390.1837,
"avg_input_tokens": 620630.0435,
"avg_output_tokens": 5836.2174,
"avg_total_tokens": 626466.2609,
"avg_turns": 10.7826,
"per_run_input_tokens": 14274491,
"per_run_output_tokens": 134233,
"per_run_cost": 224.18484,
"actual_model_name": "claude-opus-4-1-20250805",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2174,
"std": 0
}
},
"glm-4-5": {
"total_tasks": 23,
"total_agent_execution_time": 14126.248623132706,
"total_input_tokens": 44343752,
"total_output_tokens": 336055,
"total_tokens": 44679807,
"total_turns": 1097,
"avg_agent_execution_time": 153.5462,
"avg_input_tokens": 481997.3043,
"avg_output_tokens": 3652.7717,
"avg_total_tokens": 485650.0761,
"avg_turns": 11.9239,
"per_run_input_tokens": 11085938,
"per_run_output_tokens": 84013.75,
"per_run_cost": 3.769258,
"actual_model_name": "glm-4.5",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2283,
"std": 0.0643
},
"pass@4": 0.3478,
"pass^4": 0.1304
},
"o3": {
"total_tasks": 23,
"total_agent_execution_time": 11774.837658882141,
"total_input_tokens": 41508489,
"total_output_tokens": 327279,
"total_tokens": 41835768,
"total_turns": 846,
"avg_agent_execution_time": 127.9874,
"avg_input_tokens": 451179.2283,
"avg_output_tokens": 3557.3804,
"avg_total_tokens": 454736.6087,
"avg_turns": 9.1957,
"per_run_input_tokens": 10377122.25,
"per_run_output_tokens": 81819.75,
"per_run_cost": 21.408803,
"actual_model_name": "o3-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1413,
"std": 0.0361
},
"pass@4": 0.2174,
"pass^4": 0.0435
},
"claude-sonnet-4-low": {
"total_tasks": 23,
"total_agent_execution_time": 15970.095800638199,
"total_input_tokens": 75027894,
"total_output_tokens": 441257,
"total_tokens": 75469151,
"total_turns": 1417,
"avg_agent_execution_time": 173.588,
"avg_input_tokens": 815520.587,
"avg_output_tokens": 4796.2717,
"avg_total_tokens": 820316.8587,
"avg_turns": 15.4022,
"per_run_input_tokens": 18756973.5,
"per_run_output_tokens": 110314.25,
"per_run_cost": 57.925634,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.25,
"std": 0.0361
},
"pass@4": 0.3478,
"pass^4": 0.2174
},
"gemini-2-5-pro": {
"total_tasks": 23,
"total_agent_execution_time": 8400.268857002258,
"total_input_tokens": 15955471,
"total_output_tokens": 529482,
"total_tokens": 16484953,
"total_turns": 501,
"avg_agent_execution_time": 91.3073,
"avg_input_tokens": 173429.0326,
"avg_output_tokens": 5755.2391,
"avg_total_tokens": 179184.2717,
"avg_turns": 5.4457,
"per_run_input_tokens": 3988867.75,
"per_run_output_tokens": 132370.5,
"per_run_cost": 11.957727,
"actual_model_name": "gemini-2.5-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0978,
"std": 0.0188
},
"pass@4": 0.2174,
"pass^4": 0
},
"qwen-3-coder-plus": {
"total_tasks": 23,
"total_agent_execution_time": 29474.414255142212,
"total_input_tokens": 182817170,
"total_output_tokens": 308729,
"total_tokens": 183125899,
"total_turns": 1759,
"avg_agent_execution_time": 320.3741,
"avg_input_tokens": 1987143.1522,
"avg_output_tokens": 3355.75,
"avg_total_tokens": 1990498.9022,
"avg_turns": 19.1196,
"per_run_input_tokens": 45704292.5,
"per_run_output_tokens": 77182.25,
"per_run_cost": 9.202604,
"actual_model_name": "qwen3-coder-480b-a35b-instruct",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1957,
"std": 0.0652
},
"pass@4": 0.3478,
"pass^4": 0.1304
},
"gpt-5-mini-high": {
"total_tasks": 23,
"total_agent_execution_time": 31101.44998884201,
"total_input_tokens": 91999132,
"total_output_tokens": 2339485,
"total_tokens": 94338617,
"total_turns": 1666,
"avg_agent_execution_time": 338.0592,
"avg_input_tokens": 999990.5652,
"avg_output_tokens": 25429.1848,
"avg_total_tokens": 1025419.75,
"avg_turns": 18.1087,
"per_run_input_tokens": 22999783,
"per_run_output_tokens": 584871.25,
"per_run_cost": 6.919688,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1957,
"std": 0.0217
},
"pass@4": 0.3478,
"pass^4": 0.087
},
"gpt-5-nano-low": {
"total_tasks": 23,
"total_agent_execution_time": 5304.219719648361,
"total_input_tokens": 20887893,
"total_output_tokens": 305082,
"total_tokens": 21192975,
"total_turns": 849,
"avg_agent_execution_time": 57.6546,
"avg_input_tokens": 227042.3152,
"avg_output_tokens": 3316.1087,
"avg_total_tokens": 230358.4239,
"avg_turns": 9.2283,
"per_run_input_tokens": 5221973.25,
"per_run_output_tokens": 76270.5,
"per_run_cost": 0.291607,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0
},
"gpt-5-medium": {
"total_tasks": 23,
"total_agent_execution_time": 42011.101893901825,
"total_input_tokens": 60694917,
"total_output_tokens": 1892024,
"total_tokens": 62586941,
"total_turns": 1318,
"avg_agent_execution_time": 456.6424,
"avg_input_tokens": 659727.3587,
"avg_output_tokens": 20565.4783,
"avg_total_tokens": 680292.837,
"avg_turns": 14.3261,
"per_run_input_tokens": 15173729.25,
"per_run_output_tokens": 473006,
"per_run_cost": 23.697222,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.4783,
"std": 0.0813
},
"pass@4": 0.6522,
"pass^4": 0.1739
},
"claude-sonnet-4": {
"total_tasks": 23,
"total_agent_execution_time": 18077.839505195618,
"total_input_tokens": 64106636,
"total_output_tokens": 408552,
"total_tokens": 64515188,
"total_turns": 1027,
"avg_agent_execution_time": 196.4983,
"avg_input_tokens": 696811.2609,
"avg_output_tokens": 4440.7826,
"avg_total_tokens": 701252.0435,
"avg_turns": 11.163,
"per_run_input_tokens": 16026659,
"per_run_output_tokens": 102138,
"per_run_cost": 49.612047,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.163,
"std": 0.0565
},
"pass@4": 0.3043,
"pass^4": 0.087
},
"gpt-5-low": {
"total_tasks": 23,
"total_agent_execution_time": 24684.910781621933,
"total_input_tokens": 62630950,
"total_output_tokens": 1095043,
"total_tokens": 63725993,
"total_turns": 930,
"avg_agent_execution_time": 268.3142,
"avg_input_tokens": 680771.1957,
"avg_output_tokens": 11902.6413,
"avg_total_tokens": 692673.837,
"avg_turns": 10.1087,
"per_run_input_tokens": 15657737.5,
"per_run_output_tokens": 273760.75,
"per_run_cost": 22.309779,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2717,
"std": 0.0188
},
"pass@4": 0.3913,
"pass^4": 0.1739
},
"gpt-5-mini-medium": {
"total_tasks": 23,
"total_agent_execution_time": 11734.965313196182,
"total_input_tokens": 56551055,
"total_output_tokens": 709791,
"total_tokens": 57260846,
"total_turns": 1281,
"avg_agent_execution_time": 127.554,
"avg_input_tokens": 614685.3804,
"avg_output_tokens": 7715.1196,
"avg_total_tokens": 622400.5,
"avg_turns": 13.9239,
"per_run_input_tokens": 14137763.75,
"per_run_output_tokens": 177447.75,
"per_run_cost": 3.889336,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1848,
"std": 0.0776
},
"pass@4": 0.3478,
"pass^4": 0.0435
},
"gpt-4-1-nano": {
"total_tasks": 23,
"total_agent_execution_time": 4764.2498252391815,
"total_input_tokens": 28783356,
"total_output_tokens": 238484,
"total_tokens": 29021840,
"total_turns": 853,
"avg_agent_execution_time": 51.7853,
"avg_input_tokens": 312862.5652,
"avg_output_tokens": 2592.2174,
"avg_total_tokens": 315454.7826,
"avg_turns": 9.2717,
"per_run_input_tokens": 7195839,
"per_run_output_tokens": 59621,
"per_run_cost": 0.743432,
"actual_model_name": "gpt-4.1-nano-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0
},
"grok-code-fast-1": {
"total_tasks": 23,
"total_agent_execution_time": 16823.461669445038,
"total_input_tokens": 69130173,
"total_output_tokens": 598154,
"total_tokens": 69728327,
"total_turns": 1642,
"avg_agent_execution_time": 182.8637,
"avg_input_tokens": 751414.9239,
"avg_output_tokens": 6501.6739,
"avg_total_tokens": 757916.5978,
"avg_turns": 17.8478,
"per_run_input_tokens": 17282543.25,
"per_run_output_tokens": 149538.5,
"per_run_cost": 3.680816,
"actual_model_name": "grok-code-fast-1",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.087,
"std": 0.0532
},
"pass@4": 0.1739,
"pass^4": 0.0435
},
"gemini-2-5-flash": {
"total_tasks": 23,
"total_agent_execution_time": 18992.60165667534,
"total_input_tokens": 101847435,
"total_output_tokens": 1168664,
"total_tokens": 103016099,
"total_turns": 962,
"avg_agent_execution_time": 206.4413,
"avg_input_tokens": 1107037.337,
"avg_output_tokens": 12702.8696,
"avg_total_tokens": 1119740.2065,
"avg_turns": 10.4565,
"per_run_input_tokens": 25461858.75,
"per_run_output_tokens": 292166,
"per_run_cost": 8.368973,
"actual_model_name": "gemini-2.5-flash",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1522,
"std": 0.0217
},
"pass@4": 0.2174,
"pass^4": 0.087
},
"gpt-5-nano-high": {
"total_tasks": 23,
"total_agent_execution_time": 29166.180356264114,
"total_input_tokens": 112335319,
"total_output_tokens": 4512798,
"total_tokens": 116848117,
"total_turns": 1680,
"avg_agent_execution_time": 317.0237,
"avg_input_tokens": 1221036.0761,
"avg_output_tokens": 49052.1522,
"avg_total_tokens": 1270088.2283,
"avg_turns": 18.2609,
"per_run_input_tokens": 28083829.75,
"per_run_output_tokens": 1128199.5,
"per_run_cost": 1.855471,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.087,
"std": 0.0307
},
"pass@4": 0.1739,
"pass^4": 0
},
"gpt-5-mini-low": {
"total_tasks": 23,
"total_agent_execution_time": 5807.832483053207,
"total_input_tokens": 40078426,
"total_output_tokens": 179124,
"total_tokens": 40257550,
"total_turns": 868,
"avg_agent_execution_time": 63.1286,
"avg_input_tokens": 435635.0652,
"avg_output_tokens": 1947,
"avg_total_tokens": 437582.0652,
"avg_turns": 9.4348,
"per_run_input_tokens": 10019606.5,
"per_run_output_tokens": 44781,
"per_run_cost": 2.594464,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.087,
"std": 0.0307
},
"pass@4": 0.1304,
"pass^4": 0
},
"qwen-3-max": {
"total_tasks": 23,
"total_agent_execution_time": 16698.327759742737,
"total_input_tokens": 124028099,
"total_output_tokens": 234790,
"total_tokens": 124262889,
"total_turns": 2456,
"avg_agent_execution_time": 181.5036,
"avg_input_tokens": 1348131.5109,
"avg_output_tokens": 2552.0652,
"avg_total_tokens": 1350683.5761,
"avg_turns": 26.6957,
"per_run_input_tokens": 31007024.75,
"per_run_output_tokens": 58697.5,
"per_run_cost": 37.560615,
"actual_model_name": "qwen3-max-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1413,
"std": 0.0361
},
"pass@4": 0.1739,
"pass^4": 0.0435
},
"kimi-k2-0711": {
"total_tasks": 23,
"total_agent_execution_time": 18861.51504421234,
"total_input_tokens": 34909027,
"total_output_tokens": 209917,
"total_tokens": 35118944,
"total_turns": 817,
"avg_agent_execution_time": 205.0165,
"avg_input_tokens": 379445.9457,
"avg_output_tokens": 2281.7065,
"avg_total_tokens": 381727.6522,
"avg_turns": 8.8804,
"per_run_input_tokens": 8727256.75,
"per_run_output_tokens": 52479.25,
"per_run_cost": 5.367552,
"actual_model_name": "kimi-k2-0711-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1087,
"std": 0.0217
},
"pass@4": 0.1304,
"pass^4": 0.0435
},
"gpt-5-high": {
"total_tasks": 23,
"total_agent_execution_time": 99681.28649711609,
"total_input_tokens": 83755768,
"total_output_tokens": 3423829,
"total_tokens": 87179597,
"total_turns": 1533,
"avg_agent_execution_time": 1083.4922,
"avg_input_tokens": 910388.7826,
"avg_output_tokens": 37215.5326,
"avg_total_tokens": 947604.3152,
"avg_turns": 16.663,
"per_run_input_tokens": 20938942,
"per_run_output_tokens": 855957.25,
"per_run_cost": 34.73325,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.5,
"std": 0.0217
},
"pass@4": 0.6087,
"pass^4": 0.3478
},
"kimi-k2-0905": {
"total_tasks": 23,
"total_agent_execution_time": 71829.9812734127,
"total_input_tokens": 91599488,
"total_output_tokens": 758701,
"total_tokens": 92358189,
"total_turns": 2179,
"avg_agent_execution_time": 780.7607,
"avg_input_tokens": 995646.6087,
"avg_output_tokens": 8246.75,
"avg_total_tokens": 1003893.3587,
"avg_turns": 23.6848,
"per_run_input_tokens": 22899872,
"per_run_output_tokens": 189675.25,
"per_run_cost": 14.214111,
"actual_model_name": "kimi-k2-0905-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.163,
"std": 0.0188
},
"pass@4": 0.2609,
"pass^4": 0.087
},
"gpt-5-nano-medium": {
"total_tasks": 23,
"total_agent_execution_time": 17233.95824933052,
"total_input_tokens": 69148890,
"total_output_tokens": 2462505,
"total_tokens": 71611395,
"total_turns": 1394,
"avg_agent_execution_time": 187.3256,
"avg_input_tokens": 751618.3696,
"avg_output_tokens": 26766.3587,
"avg_total_tokens": 778384.7283,
"avg_turns": 15.1522,
"per_run_input_tokens": 17287222.5,
"per_run_output_tokens": 615626.25,
"per_run_cost": 1.110612,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0761,
"std": 0.0188
},
"pass@4": 0.1304,
"pass^4": 0
},
"grok-4": {
"total_tasks": 23,
"total_agent_execution_time": 24744.31317639351,
"total_input_tokens": 74013900,
"total_output_tokens": 177230,
"total_tokens": 74636957,
"total_turns": 1194,
"avg_agent_execution_time": 268.9599,
"avg_input_tokens": 804498.913,
"avg_output_tokens": 1926.413,
"avg_total_tokens": 811271.2717,
"avg_turns": 12.9783,
"per_run_input_tokens": 18503475,
"per_run_output_tokens": 44307.5,
"per_run_cost": 56.175038,
"actual_model_name": "grok-4-0709",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1413,
"std": 0.0361
},
"pass@4": 0.2174,
"pass^4": 0.087
},
"claude-sonnet-4-high": {
"total_tasks": 23,
"total_agent_execution_time": 15643.119762897491,
"total_input_tokens": 75490051,
"total_output_tokens": 471509,
"total_tokens": 75961560,
"total_turns": 1415,
"avg_agent_execution_time": 170.0339,
"avg_input_tokens": 820544.0326,
"avg_output_tokens": 5125.0978,
"avg_total_tokens": 825669.1304,
"avg_turns": 15.3804,
"per_run_input_tokens": 18872512.75,
"per_run_output_tokens": 117877.25,
"per_run_cost": 58.385697,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2826,
"std": 0.0217
},
"pass@4": 0.4348,
"pass^4": 0.2174
},
"gpt-oss-120b": {
"total_tasks": 23,
"total_agent_execution_time": 2209.1974110603333,
"total_input_tokens": 7019815,
"total_output_tokens": 129561,
"total_tokens": 7149376,
"total_turns": 425,
"avg_agent_execution_time": 24.013,
"avg_input_tokens": 76302.337,
"avg_output_tokens": 1408.2717,
"avg_total_tokens": 77710.6087,
"avg_turns": 4.6196,
"per_run_input_tokens": 1754953.75,
"per_run_output_tokens": 32390.25,
"per_run_cost": 0.135426,
"actual_model_name": "gpt-oss-120b",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0435,
"std": 0.0307
},
"pass@4": 0.087,
"pass^4": 0
},
"o4-mini": {
"total_tasks": 23,
"total_agent_execution_time": 22887.657103300095,
"total_input_tokens": 46932136,
"total_output_tokens": 803804,
"total_tokens": 47735940,
"total_turns": 1005,
"avg_agent_execution_time": 248.7789,
"avg_input_tokens": 510131.913,
"avg_output_tokens": 8737,
"avg_total_tokens": 518868.913,
"avg_turns": 10.9239,
"per_run_input_tokens": 11733034,
"per_run_output_tokens": 200951,
"per_run_cost": 13.790522,
"actual_model_name": "o4-mini-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1413,
"std": 0.0643
},
"pass@4": 0.2609,
"pass^4": 0.0435
},
"gpt-4-1-mini": {
"total_tasks": 23,
"total_agent_execution_time": 7644.50294303894,
"total_input_tokens": 42936149,
"total_output_tokens": 139240,
"total_tokens": 43075389,
"total_turns": 1104,
"avg_agent_execution_time": 83.0924,
"avg_input_tokens": 466697.2717,
"avg_output_tokens": 1513.4783,
"avg_total_tokens": 468210.75,
"avg_turns": 12,
"per_run_input_tokens": 10734037.25,
"per_run_output_tokens": 34810,
"per_run_cost": 4.349311,
"actual_model_name": "gpt-4.1-mini-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.0652,
"std": 0.0652
},
"pass@4": 0.1739,
"pass^4": 0
}
},
"notion": {
"deepseek-chat": {
"total_tasks": 28,
"total_agent_execution_time": 26761.268040657043,
"total_input_tokens": 56375571,
"total_output_tokens": 246037,
"total_tokens": 56621608,
"total_turns": 2009,
"avg_agent_execution_time": 238.9399,
"avg_input_tokens": 503353.3125,
"avg_output_tokens": 2196.7589,
"avg_total_tokens": 505550.0714,
"avg_turns": 17.9375,
"per_run_input_tokens": 14093892.75,
"per_run_output_tokens": 61509.25,
"per_run_cost": 7.995915,
"actual_model_name": "deepseek-v3.1-non-think",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.125,
"std": 0.0309
},
"pass@4": 0.2857,
"pass^4": 0
},
"gpt-4-1": {
"total_tasks": 28,
"total_agent_execution_time": 5461.270235300064,
"total_input_tokens": 15181887,
"total_output_tokens": 153733,
"total_tokens": 15335620,
"total_turns": 961,
"avg_agent_execution_time": 48.7613,
"avg_input_tokens": 135552.5625,
"avg_output_tokens": 1372.6161,
"avg_total_tokens": 136925.1786,
"avg_turns": 8.5804,
"per_run_input_tokens": 3795471.75,
"per_run_output_tokens": 38433.25,
"per_run_cost": 7.898409,
"actual_model_name": "gpt-4.1-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.0625,
"std": 0.0155
},
"pass@4": 0.1429,
"pass^4": 0
},
"claude-opus-4-1": {
"total_tasks": 28,
"total_agent_execution_time": 8238.807499885559,
"total_input_tokens": 17865599,
"total_output_tokens": 110144,
"total_tokens": 17975743,
"total_turns": 477,
"avg_agent_execution_time": 294.2431,
"avg_input_tokens": 638057.1071,
"avg_output_tokens": 3933.7143,
"avg_total_tokens": 641990.8214,
"avg_turns": 17.0357,
"per_run_input_tokens": 17865599,
"per_run_output_tokens": 110144,
"per_run_cost": 276.244785,
"actual_model_name": "claude-opus-4-1-20250805",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.3571,
"std": 0
}
},
"glm-4-5": {
"total_tasks": 28,
"total_agent_execution_time": 24882.952006816864,
"total_input_tokens": 70108796,
"total_output_tokens": 564356,
"total_tokens": 70673152,
"total_turns": 2481,
"avg_agent_execution_time": 222.1692,
"avg_input_tokens": 625971.3929,
"avg_output_tokens": 5038.8929,
"avg_total_tokens": 631010.2857,
"avg_turns": 22.1518,
"per_run_input_tokens": 17527199,
"per_run_output_tokens": 141089,
"per_run_cost": 5.970213,
"actual_model_name": "glm-4.5",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2143,
"std": 0.0253
},
"pass@4": 0.3214,
"pass^4": 0.1071
},
"o3": {
"total_tasks": 28,
"total_agent_execution_time": 19195.927572011948,
"total_input_tokens": 25191983,
"total_output_tokens": 1060378,
"total_tokens": 26252361,
"total_turns": 1537,
"avg_agent_execution_time": 171.3922,
"avg_input_tokens": 224928.4196,
"avg_output_tokens": 9467.6607,
"avg_total_tokens": 234396.0804,
"avg_turns": 13.7232,
"per_run_input_tokens": 6297995.75,
"per_run_output_tokens": 265094.5,
"per_run_cost": 14.716748,
"actual_model_name": "o3-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2411,
"std": 0.0389
},
"pass@4": 0.4643,
"pass^4": 0.0714
},
"claude-sonnet-4-low": {
"total_tasks": 28,
"total_agent_execution_time": 24645.00325536728,
"total_input_tokens": 138082856,
"total_output_tokens": 479628,
"total_tokens": 138562484,
"total_turns": 2207,
"avg_agent_execution_time": 220.0447,
"avg_input_tokens": 1232882.6429,
"avg_output_tokens": 4282.3929,
"avg_total_tokens": 1237165.0357,
"avg_turns": 19.7054,
"per_run_input_tokens": 34520714,
"per_run_output_tokens": 119907,
"per_run_cost": 105.360747,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2232,
"std": 0.0296
},
"pass@4": 0.3214,
"pass^4": 0.0714
},
"gemini-2-5-pro": {
"total_tasks": 28,
"total_agent_execution_time": 11470.97702550888,
"total_input_tokens": 23847150,
"total_output_tokens": 798708,
"total_tokens": 24645858,
"total_turns": 797,
"avg_agent_execution_time": 102.4194,
"avg_input_tokens": 212920.9821,
"avg_output_tokens": 7131.3214,
"avg_total_tokens": 220052.3036,
"avg_turns": 7.1161,
"per_run_input_tokens": 5961787.5,
"per_run_output_tokens": 199677,
"per_run_cost": 17.899624,
"actual_model_name": "gemini-2.5-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0446,
"std": 0.0296
},
"pass@4": 0.0714,
"pass^4": 0
},
"qwen-3-coder-plus": {
"total_tasks": 28,
"total_agent_execution_time": 11163.093998670578,
"total_input_tokens": 89233452,
"total_output_tokens": 308390,
"total_tokens": 89541842,
"total_turns": 2360,
"avg_agent_execution_time": 99.6705,
"avg_input_tokens": 796727.25,
"avg_output_tokens": 2753.4821,
"avg_total_tokens": 799480.7321,
"avg_turns": 21.0714,
"per_run_input_tokens": 22308363,
"per_run_output_tokens": 77097.5,
"per_run_cost": 4.523351,
"actual_model_name": "qwen3-coder-480b-a35b-instruct",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1964,
"std": 0.0644
},
"pass@4": 0.3929,
"pass^4": 0.0714
},
"gpt-5-mini-high": {
"total_tasks": 28,
"total_agent_execution_time": 58533.93042230606,
"total_input_tokens": 130525675,
"total_output_tokens": 4916907,
"total_tokens": 135442582,
"total_turns": 2118,
"avg_agent_execution_time": 522.6244,
"avg_input_tokens": 1165407.8125,
"avg_output_tokens": 43900.9554,
"avg_total_tokens": 1209308.7679,
"avg_turns": 18.9107,
"per_run_input_tokens": 32631418.75,
"per_run_output_tokens": 1229226.75,
"per_run_cost": 10.616308,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2054,
"std": 0.1297
},
"pass@4": 0.4286,
"pass^4": 0.0714
},
"gpt-5-nano-low": {
"total_tasks": 28,
"total_agent_execution_time": 7620.024949550629,
"total_input_tokens": 5304447,
"total_output_tokens": 1217553,
"total_tokens": 6522000,
"total_turns": 510,
"avg_agent_execution_time": 68.0359,
"avg_input_tokens": 47361.1339,
"avg_output_tokens": 10871.0089,
"avg_total_tokens": 58232.1429,
"avg_turns": 4.5536,
"per_run_input_tokens": 1326111.75,
"per_run_output_tokens": 304388.25,
"per_run_cost": 0.188061,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0
},
"gpt-5-medium": {
"total_tasks": 28,
"total_agent_execution_time": 74123.09180688858,
"total_input_tokens": 42004983,
"total_output_tokens": 3541977,
"total_tokens": 45546960,
"total_turns": 1449,
"avg_agent_execution_time": 661.8133,
"avg_input_tokens": 375044.4911,
"avg_output_tokens": 31624.7946,
"avg_total_tokens": 406669.2857,
"avg_turns": 12.9375,
"per_run_input_tokens": 10501245.75,
"per_run_output_tokens": 885494.25,
"per_run_cost": 21.9815,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.4196,
"std": 0.0296
},
"pass@4": 0.5,
"pass^4": 0.3214
},
"claude-sonnet-4": {
"total_tasks": 28,
"total_agent_execution_time": 21642.930950641632,
"total_input_tokens": 72423701,
"total_output_tokens": 474776,
"total_tokens": 72898477,
"total_turns": 2208,
"avg_agent_execution_time": 193.2405,
"avg_input_tokens": 646640.1875,
"avg_output_tokens": 4239.0714,
"avg_total_tokens": 650879.2589,
"avg_turns": 19.7143,
"per_run_input_tokens": 18105925.25,
"per_run_output_tokens": 118694,
"per_run_cost": 56.098186,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.2143,
"std": 0.0505
},
"pass@4": 0.3929,
"pass^4": 0.0714
},
"gpt-5-low": {
"total_tasks": 28,
"total_agent_execution_time": 62677.94048953056,
"total_input_tokens": 47638046,
"total_output_tokens": 3351023,
"total_tokens": 50989069,
"total_turns": 1514,
"avg_agent_execution_time": 559.6245,
"avg_input_tokens": 425339.6964,
"avg_output_tokens": 29919.8482,
"avg_total_tokens": 455259.5446,
"avg_turns": 13.5179,
"per_run_input_tokens": 11909511.5,
"per_run_output_tokens": 837755.75,
"per_run_cost": 23.264447,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.3661,
"std": 0.0773
},
"pass@4": 0.5357,
"pass^4": 0.1429
},
"gpt-5-mini-medium": {
"total_tasks": 28,
"total_agent_execution_time": 18890.31924724579,
"total_input_tokens": 78970140,
"total_output_tokens": 1382436,
"total_tokens": 80352576,
"total_turns": 1635,
"avg_agent_execution_time": 168.6636,
"avg_input_tokens": 705090.5357,
"avg_output_tokens": 12343.1786,
"avg_total_tokens": 717433.7143,
"avg_turns": 14.5982,
"per_run_input_tokens": 19742535,
"per_run_output_tokens": 345609,
"per_run_cost": 5.626852,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1607,
"std": 0.0592
},
"pass@4": 0.3214,
"pass^4": 0.0357
},
"gpt-4-1-nano": {
"total_tasks": 28,
"total_agent_execution_time": 3603.7674894332886,
"total_input_tokens": 10458841,
"total_output_tokens": 156924,
"total_tokens": 10615765,
"total_turns": 1080,
"avg_agent_execution_time": 32.1765,
"avg_input_tokens": 93382.5089,
"avg_output_tokens": 1401.1071,
"avg_total_tokens": 94783.6161,
"avg_turns": 9.6429,
"per_run_input_tokens": 2614710.25,
"per_run_output_tokens": 39231,
"per_run_cost": 0.277163,
"actual_model_name": "gpt-4.1-nano-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0
},
"grok-code-fast-1": {
"total_tasks": 28,
"total_agent_execution_time": 37424.073429346085,
"total_input_tokens": 62886800,
"total_output_tokens": 812899,
"total_tokens": 64110194,
"total_turns": 2270,
"avg_agent_execution_time": 334.1435,
"avg_input_tokens": 561489.2857,
"avg_output_tokens": 7258.0268,
"avg_total_tokens": 572412.4464,
"avg_turns": 20.2679,
"per_run_input_tokens": 15721700,
"per_run_output_tokens": 203224.75,
"per_run_cost": 3.449177,
"actual_model_name": "grok-code-fast-1",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0268,
"std": 0.0155
},
"pass@4": 0.0357,
"pass^4": 0
},
"gemini-2-5-flash": {
"total_tasks": 28,
"total_agent_execution_time": 6247.662793159485,
"total_input_tokens": 22511623,
"total_output_tokens": 737164,
"total_tokens": 23248787,
"total_turns": 684,
"avg_agent_execution_time": 55.7827,
"avg_input_tokens": 200996.6339,
"avg_output_tokens": 6581.8214,
"avg_total_tokens": 207578.4554,
"avg_turns": 6.1071,
"per_run_input_tokens": 5627905.75,
"per_run_output_tokens": 184291,
"per_run_cost": 2.149099,
"actual_model_name": "gemini-2.5-flash",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0625,
"std": 0.0464
},
"pass@4": 0.2143,
"pass^4": 0
},
"gpt-5-nano-high": {
"total_tasks": 28,
"total_agent_execution_time": 44915.13718724251,
"total_input_tokens": 45483673,
"total_output_tokens": 8632428,
"total_tokens": 54116101,
"total_turns": 1113,
"avg_agent_execution_time": 401.028,
"avg_input_tokens": 406104.2232,
"avg_output_tokens": 77075.25,
"avg_total_tokens": 483179.4732,
"avg_turns": 9.9375,
"per_run_input_tokens": 11370918.25,
"per_run_output_tokens": 2158107,
"per_run_cost": 1.431789,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0089,
"std": 0.0155
},
"pass@4": 0.0357,
"pass^4": 0
},
"gpt-5-mini-low": {
"total_tasks": 28,
"total_agent_execution_time": 7015.923507452011,
"total_input_tokens": 13704613,
"total_output_tokens": 430549,
"total_tokens": 14135162,
"total_turns": 847,
"avg_agent_execution_time": 62.6422,
"avg_input_tokens": 122362.6161,
"avg_output_tokens": 3844.1875,
"avg_total_tokens": 126206.8036,
"avg_turns": 7.5625,
"per_run_input_tokens": 3426153.25,
"per_run_output_tokens": 107637.25,
"per_run_cost": 1.071813,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0536,
"std": 0.0536
},
"pass@4": 0.1429,
"pass^4": 0
},
"qwen-3-max": {
"total_tasks": 28,
"total_agent_execution_time": 20556.586127758026,
"total_input_tokens": 109079387,
"total_output_tokens": 409681,
"total_tokens": 109489068,
"total_turns": 2976,
"avg_agent_execution_time": 183.5409,
"avg_input_tokens": 973923.0982,
"avg_output_tokens": 3657.8661,
"avg_total_tokens": 977580.9643,
"avg_turns": 26.5714,
"per_run_input_tokens": 27269846.75,
"per_run_output_tokens": 102420.25,
"per_run_cost": 33.338338,
"actual_model_name": "qwen3-max-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1696,
"std": 0.0464
},
"pass@4": 0.25,
"pass^4": 0.0357
},
"kimi-k2-0711": {
"total_tasks": 28,
"total_agent_execution_time": 20542.138272047043,
"total_input_tokens": 61248998,
"total_output_tokens": 298118,
"total_tokens": 61547116,
"total_turns": 2205,
"avg_agent_execution_time": 183.4119,
"avg_input_tokens": 546866.0536,
"avg_output_tokens": 2661.7679,
"avg_total_tokens": 549527.8214,
"avg_turns": 19.6875,
"per_run_input_tokens": 15312249.5,
"per_run_output_tokens": 74529.5,
"per_run_cost": 9.373673,
"actual_model_name": "kimi-k2-0711-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1429,
"std": 0.0437
},
"pass@4": 0.3214,
"pass^4": 0.0714
},
"gpt-5-high": {
"total_tasks": 28,
"total_agent_execution_time": 130073.95933294296,
"total_input_tokens": 44878717,
"total_output_tokens": 5982277,
"total_tokens": 50860994,
"total_turns": 1522,
"avg_agent_execution_time": 1161.3746,
"avg_input_tokens": 400702.8304,
"avg_output_tokens": 53413.1875,
"avg_total_tokens": 454116.0179,
"avg_turns": 13.5893,
"per_run_input_tokens": 11219679.25,
"per_run_output_tokens": 1495569.25,
"per_run_cost": 28.980292,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.4464,
"std": 0.0179
},
"pass@4": 0.6071,
"pass^4": 0.2143
},
"kimi-k2-0905": {
"total_tasks": 28,
"total_agent_execution_time": 52309.592324495316,
"total_input_tokens": 125127304,
"total_output_tokens": 582528,
"total_tokens": 125709832,
"total_turns": 3758,
"avg_agent_execution_time": 467.0499,
"avg_input_tokens": 1117208.0714,
"avg_output_tokens": 5201.1429,
"avg_total_tokens": 1122409.2143,
"avg_turns": 33.5536,
"per_run_input_tokens": 31281826,
"per_run_output_tokens": 145632,
"per_run_cost": 19.133176,
"actual_model_name": "kimi-k2-0905-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.0804,
"std": 0.0296
},
"pass@4": 0.1071,
"pass^4": 0.0357
},
"gpt-5-nano-medium": {
"total_tasks": 28,
"total_agent_execution_time": 19092.810956716537,
"total_input_tokens": 22883863,
"total_output_tokens": 3592488,
"total_tokens": 26476351,
"total_turns": 835,
"avg_agent_execution_time": 170.4715,
"avg_input_tokens": 204320.2054,
"avg_output_tokens": 32075.7857,
"avg_total_tokens": 236395.9911,
"avg_turns": 7.4554,
"per_run_input_tokens": 5720965.75,
"per_run_output_tokens": 898122,
"per_run_cost": 0.645297,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0357,
"std": 0
},
"pass@4": 0.0357,
"pass^4": 0.0357
},
"grok-4": {
"total_tasks": 28,
"total_agent_execution_time": 62048.289714574814,
"total_input_tokens": 76007445,
"total_output_tokens": 1460247,
"total_tokens": 77467692,
"total_turns": 2256,
"avg_agent_execution_time": 554.0026,
"avg_input_tokens": 678637.9018,
"avg_output_tokens": 13037.9196,
"avg_total_tokens": 691675.8214,
"avg_turns": 20.1429,
"per_run_input_tokens": 19001861.25,
"per_run_output_tokens": 365061.75,
"per_run_cost": 62.48151,
"actual_model_name": "grok-4-0709",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0268,
"std": 0.0155
},
"pass@4": 0.0357,
"pass^4": 0
},
"claude-sonnet-4-high": {
"total_tasks": 28,
"total_agent_execution_time": 21350.178874015808,
"total_input_tokens": 154199936,
"total_output_tokens": 474188,
"total_tokens": 154674124,
"total_turns": 2162,
"avg_agent_execution_time": 190.6266,
"avg_input_tokens": 1376785.1429,
"avg_output_tokens": 4233.8214,
"avg_total_tokens": 1381018.9643,
"avg_turns": 19.3036,
"per_run_input_tokens": 38549984,
"per_run_output_tokens": 118547,
"per_run_cost": 117.428157,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.1964,
"std": 0.0818
},
"pass@4": 0.3571,
"pass^4": 0.0357
},
"gpt-oss-120b": {
"total_tasks": 28,
"total_agent_execution_time": 3811.6953434944153,
"total_input_tokens": 7651000,
"total_output_tokens": 192347,
"total_tokens": 7843347,
"total_turns": 615,
"avg_agent_execution_time": 34.033,
"avg_input_tokens": 68312.5,
"avg_output_tokens": 1717.3839,
"avg_total_tokens": 70029.8839,
"avg_turns": 5.4911,
"per_run_input_tokens": 1912750,
"per_run_output_tokens": 48086.75,
"per_run_cost": 0.151182,
"actual_model_name": "gpt-oss-120b",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0357,
"std": 0.0253
},
"pass@4": 0.1429,
"pass^4": 0
},
"o4-mini": {
"total_tasks": 28,
"total_agent_execution_time": 49544.87328624725,
"total_input_tokens": 29974571,
"total_output_tokens": 2909132,
"total_tokens": 32883703,
"total_turns": 1712,
"avg_agent_execution_time": 442.3649,
"avg_input_tokens": 267630.0982,
"avg_output_tokens": 25974.3929,
"avg_total_tokens": 293604.4911,
"avg_turns": 15.2857,
"per_run_input_tokens": 7493642.75,
"per_run_output_tokens": 727283,
"per_run_cost": 11.443052,
"actual_model_name": "o4-mini-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2054,
"std": 0.0585
},
"pass@4": 0.4286,
"pass^4": 0.0714
},
"gpt-4-1-mini": {
"total_tasks": 28,
"total_agent_execution_time": 6618.801545858383,
"total_input_tokens": 29427654,
"total_output_tokens": 151813,
"total_tokens": 29579467,
"total_turns": 1408,
"avg_agent_execution_time": 59.0964,
"avg_input_tokens": 262746.9107,
"avg_output_tokens": 1355.4732,
"avg_total_tokens": 264102.3839,
"avg_turns": 12.5714,
"per_run_input_tokens": 7356913.5,
"per_run_output_tokens": 37953.25,
"per_run_cost": 3.003491,
"actual_model_name": "gpt-4.1-mini-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.0179,
"std": 0.0179
},
"pass@4": 0.0357,
"pass^4": 0
}
},
"playwright": {
"deepseek-chat": {
"total_tasks": 25,
"total_agent_execution_time": 28832.343264341354,
"total_input_tokens": 83601065,
"total_output_tokens": 176766,
"total_tokens": 83777831,
"total_turns": 1909,
"avg_agent_execution_time": 288.3234,
"avg_input_tokens": 836010.65,
"avg_output_tokens": 1767.66,
"avg_total_tokens": 837778.31,
"avg_turns": 19.09,
"per_run_input_tokens": 20900266.25,
"per_run_output_tokens": 44191.5,
"per_run_cost": 11.778391,
"actual_model_name": "deepseek-v3.1-non-think",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.07,
"std": 0.0332
},
"pass@4": 0.16,
"pass^4": 0
},
"gpt-4-1": {
"total_tasks": 25,
"total_agent_execution_time": 9215.533914804459,
"total_input_tokens": 85977431,
"total_output_tokens": 86136,
"total_tokens": 86063567,
"total_turns": 1380,
"avg_agent_execution_time": 92.1553,
"avg_input_tokens": 859774.31,
"avg_output_tokens": 861.36,
"avg_total_tokens": 860635.67,
"avg_turns": 13.8,
"per_run_input_tokens": 21494357.75,
"per_run_output_tokens": 21534,
"per_run_cost": 43.160987,
"actual_model_name": "gpt-4.1-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.08,
"std": 0.0283
},
"pass@4": 0.12,
"pass^4": 0.04
},
"claude-opus-4-1": {
"total_tasks": 25,
"total_agent_execution_time": 9880.799889326096,
"total_input_tokens": 28651348,
"total_output_tokens": 72107,
"total_tokens": 28723455,
"total_turns": 476,
"avg_agent_execution_time": 395.232,
"avg_input_tokens": 1146053.92,
"avg_output_tokens": 2884.28,
"avg_total_tokens": 1148938.2,
"avg_turns": 19.04,
"per_run_input_tokens": 28651348,
"per_run_output_tokens": 72107,
"per_run_cost": 435.178245,
"actual_model_name": "claude-opus-4-1-20250805",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.24,
"std": 0
}
},
"glm-4-5": {
"total_tasks": 25,
"total_agent_execution_time": 16563.084107398987,
"total_input_tokens": 58272900,
"total_output_tokens": 275965,
"total_tokens": 58548865,
"total_turns": 1536,
"avg_agent_execution_time": 165.6308,
"avg_input_tokens": 582729,
"avg_output_tokens": 2759.65,
"avg_total_tokens": 585488.65,
"avg_turns": 15.36,
"per_run_input_tokens": 14568225,
"per_run_output_tokens": 68991.25,
"per_run_cost": 4.898583,
"actual_model_name": "glm-4.5",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.13,
"std": 0.0332
},
"pass@4": 0.2,
"pass^4": 0.04
},
"o3": {
"total_tasks": 25,
"total_agent_execution_time": 15392.43521952629,
"total_input_tokens": 55630356,
"total_output_tokens": 446007,
"total_tokens": 56076363,
"total_turns": 1630,
"avg_agent_execution_time": 153.9244,
"avg_input_tokens": 556303.56,
"avg_output_tokens": 4460.07,
"avg_total_tokens": 560763.63,
"avg_turns": 16.3,
"per_run_input_tokens": 13907589,
"per_run_output_tokens": 111501.75,
"per_run_cost": 28.707192,
"actual_model_name": "o3-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.15,
"std": 0.052
},
"pass@4": 0.32,
"pass^4": 0.08
},
"claude-sonnet-4-low": {
"total_tasks": 25,
"total_agent_execution_time": 23905.656344890594,
"total_input_tokens": 207574613,
"total_output_tokens": 382254,
"total_tokens": 207956867,
"total_turns": 1956,
"avg_agent_execution_time": 239.0566,
"avg_input_tokens": 2075746.13,
"avg_output_tokens": 3822.54,
"avg_total_tokens": 2079568.67,
"avg_turns": 19.56,
"per_run_input_tokens": 51893653.25,
"per_run_output_tokens": 95563.5,
"per_run_cost": 157.114412,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.22,
"std": 0.0346
},
"pass@4": 0.28,
"pass^4": 0.2
},
"gemini-2-5-pro": {
"total_tasks": 25,
"total_agent_execution_time": 17766.667247772217,
"total_input_tokens": 169644208,
"total_output_tokens": 558015,
"total_tokens": 170202223,
"total_turns": 1915,
"avg_agent_execution_time": 177.6667,
"avg_input_tokens": 1696442.08,
"avg_output_tokens": 5580.15,
"avg_total_tokens": 1702022.23,
"avg_turns": 19.15,
"per_run_input_tokens": 42411052,
"per_run_output_tokens": 139503.75,
"per_run_cost": 108.120186,
"actual_model_name": "gemini-2.5-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.15,
"std": 0.0173
},
"pass@4": 0.32,
"pass^4": 0.04
},
"qwen-3-coder-plus": {
"total_tasks": 25,
"total_agent_execution_time": 68003.63626265526,
"total_input_tokens": 285157442,
"total_output_tokens": 238572,
"total_tokens": 285396014,
"total_turns": 2121,
"avg_agent_execution_time": 680.0364,
"avg_input_tokens": 2851574.42,
"avg_output_tokens": 2385.72,
"avg_total_tokens": 2853960.14,
"avg_turns": 21.21,
"per_run_input_tokens": 71289360.5,
"per_run_output_tokens": 59643,
"per_run_cost": 14.305587,
"actual_model_name": "qwen3-coder-480b-a35b-instruct",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.3,
"std": 0.0447
},
"pass@4": 0.48,
"pass^4": 0.08
},
"gpt-5-mini-high": {
"total_tasks": 25,
"total_agent_execution_time": 36555.053931713104,
"total_input_tokens": 228501117,
"total_output_tokens": 2286332,
"total_tokens": 230787449,
"total_turns": 2572,
"avg_agent_execution_time": 365.5505,
"avg_input_tokens": 2285011.17,
"avg_output_tokens": 22863.32,
"avg_total_tokens": 2307874.49,
"avg_turns": 25.72,
"per_run_input_tokens": 57125279.25,
"per_run_output_tokens": 571583,
"per_run_cost": 15.424486,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.15,
"std": 0.052
},
"pass@4": 0.32,
"pass^4": 0.04
},
"gpt-5-nano-low": {
"total_tasks": 25,
"total_agent_execution_time": 13930.66529917717,
"total_input_tokens": 42973579,
"total_output_tokens": 1369659,
"total_tokens": 44343238,
"total_turns": 1419,
"avg_agent_execution_time": 139.3067,
"avg_input_tokens": 429735.79,
"avg_output_tokens": 13696.59,
"avg_total_tokens": 443432.38,
"avg_turns": 14.19,
"per_run_input_tokens": 10743394.75,
"per_run_output_tokens": 342414.75,
"per_run_cost": 0.674136,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0
},
"gpt-5-medium": {
"total_tasks": 25,
"total_agent_execution_time": 60819.73624634743,
"total_input_tokens": 180716712,
"total_output_tokens": 2178557,
"total_tokens": 182895269,
"total_turns": 2378,
"avg_agent_execution_time": 608.1974,
"avg_input_tokens": 1807167.12,
"avg_output_tokens": 21785.57,
"avg_total_tokens": 1828952.69,
"avg_turns": 23.78,
"per_run_input_tokens": 45179178,
"per_run_output_tokens": 544639.25,
"per_run_cost": 61.920365,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.43,
"std": 0.052
},
"pass@4": 0.56,
"pass^4": 0.36
},
"claude-sonnet-4": {
"total_tasks": 25,
"total_agent_execution_time": 27869.116582155228,
"total_input_tokens": 124192324,
"total_output_tokens": 352308,
"total_tokens": 124544632,
"total_turns": 1980,
"avg_agent_execution_time": 278.6912,
"avg_input_tokens": 1241923.24,
"avg_output_tokens": 3523.08,
"avg_total_tokens": 1245446.32,
"avg_turns": 19.8,
"per_run_input_tokens": 31048081,
"per_run_output_tokens": 88077,
"per_run_cost": 94.465398,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.26,
"std": 0.06
},
"pass@4": 0.36,
"pass^4": 0.08
},
"gpt-5-low": {
"total_tasks": 25,
"total_agent_execution_time": 52685.02177119255,
"total_input_tokens": 172852606,
"total_output_tokens": 1872354,
"total_tokens": 174724960,
"total_turns": 2421,
"avg_agent_execution_time": 526.8502,
"avg_input_tokens": 1728526.06,
"avg_output_tokens": 18723.54,
"avg_total_tokens": 1747249.6,
"avg_turns": 24.21,
"per_run_input_tokens": 43213151.5,
"per_run_output_tokens": 468088.5,
"per_run_cost": 58.697324,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.45,
"std": 0.0173
},
"pass@4": 0.56,
"pass^4": 0.32
},
"gpt-5-mini-medium": {
"total_tasks": 25,
"total_agent_execution_time": 21595.999030590057,
"total_input_tokens": 181493809,
"total_output_tokens": 854754,
"total_tokens": 182348563,
"total_turns": 2275,
"avg_agent_execution_time": 215.96,
"avg_input_tokens": 1814938.09,
"avg_output_tokens": 8547.54,
"avg_total_tokens": 1823485.63,
"avg_turns": 22.75,
"per_run_input_tokens": 45373452.25,
"per_run_output_tokens": 213688.5,
"per_run_cost": 11.77074,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.12,
"std": 0.0632
},
"pass@4": 0.24,
"pass^4": 0.04
},
"gpt-4-1-nano": {
"total_tasks": 25,
"total_agent_execution_time": 5384.292573690414,
"total_input_tokens": 38979660,
"total_output_tokens": 73604,
"total_tokens": 39053264,
"total_turns": 1351,
"avg_agent_execution_time": 53.8429,
"avg_input_tokens": 389796.6,
"avg_output_tokens": 736.04,
"avg_total_tokens": 390532.64,
"avg_turns": 13.51,
"per_run_input_tokens": 9744915,
"per_run_output_tokens": 18401,
"per_run_cost": 0.981852,
"actual_model_name": "gpt-4.1-nano-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0
},
"grok-code-fast-1": {
"total_tasks": 25,
"total_agent_execution_time": 11951.442332029343,
"total_input_tokens": 115771636,
"total_output_tokens": 717365,
"total_tokens": 116489001,
"total_turns": 1823,
"avg_agent_execution_time": 119.5144,
"avg_input_tokens": 1157716.36,
"avg_output_tokens": 7173.65,
"avg_total_tokens": 1164890.01,
"avg_turns": 18.23,
"per_run_input_tokens": 28942909,
"per_run_output_tokens": 179341.25,
"per_run_cost": 6.057594,
"actual_model_name": "grok-code-fast-1",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.25,
"std": 0.0173
},
"pass@4": 0.36,
"pass^4": 0.08
},
"gemini-2-5-flash": {
"total_tasks": 25,
"total_agent_execution_time": 20540.323031187057,
"total_input_tokens": 383892671,
"total_output_tokens": 821477,
"total_tokens": 384714181,
"total_turns": 2633,
"avg_agent_execution_time": 205.4032,
"avg_input_tokens": 3838926.71,
"avg_output_tokens": 8214.77,
"avg_total_tokens": 3847141.81,
"avg_turns": 26.33,
"per_run_input_tokens": 95973167.75,
"per_run_output_tokens": 205369.25,
"per_run_cost": 29.305373,
"actual_model_name": "gemini-2.5-flash",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.06,
"std": 0.02
},
"pass@4": 0.12,
"pass^4": 0
},
"gpt-5-nano-high": {
"total_tasks": 25,
"total_agent_execution_time": 32499.491863250732,
"total_input_tokens": 153821020,
"total_output_tokens": 4015265,
"total_tokens": 157836285,
"total_turns": 2692,
"avg_agent_execution_time": 324.9949,
"avg_input_tokens": 1538210.2,
"avg_output_tokens": 40152.65,
"avg_total_tokens": 1578362.85,
"avg_turns": 26.92,
"per_run_input_tokens": 38455255,
"per_run_output_tokens": 1003816.25,
"per_run_cost": 2.324289,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.02,
"std": 0.02
},
"pass@4": 0.04,
"pass^4": 0
},
"gpt-5-mini-low": {
"total_tasks": 25,
"total_agent_execution_time": 6713.273415088654,
"total_input_tokens": 41994636,
"total_output_tokens": 185994,
"total_tokens": 42180630,
"total_turns": 1055,
"avg_agent_execution_time": 67.1327,
"avg_input_tokens": 419946.36,
"avg_output_tokens": 1859.94,
"avg_total_tokens": 421806.3,
"avg_turns": 10.55,
"per_run_input_tokens": 10498659,
"per_run_output_tokens": 46498.5,
"per_run_cost": 2.717662,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.01,
"std": 0.0173
},
"pass@4": 0.04,
"pass^4": 0
},
"qwen-3-max": {
"total_tasks": 25,
"total_agent_execution_time": 41767.95633363724,
"total_input_tokens": 229766924,
"total_output_tokens": 116233,
"total_tokens": 229883157,
"total_turns": 2783,
"avg_agent_execution_time": 417.6796,
"avg_input_tokens": 2297669.24,
"avg_output_tokens": 1162.33,
"avg_total_tokens": 2298831.57,
"avg_turns": 27.83,
"per_run_input_tokens": 57441731,
"per_run_output_tokens": 29058.25,
"per_run_cost": 69.104427,
"actual_model_name": "qwen3-max-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.08,
"std": 0
},
"pass@4": 0.12,
"pass^4": 0.04
},
"kimi-k2-0711": {
"total_tasks": 25,
"total_agent_execution_time": 22136.299335479736,
"total_input_tokens": 73770186,
"total_output_tokens": 169700,
"total_tokens": 73939886,
"total_turns": 1783,
"avg_agent_execution_time": 221.363,
"avg_input_tokens": 737701.86,
"avg_output_tokens": 1697,
"avg_total_tokens": 739398.86,
"avg_turns": 17.83,
"per_run_input_tokens": 18442546.5,
"per_run_output_tokens": 42425,
"per_run_cost": 11.17159,
"actual_model_name": "kimi-k2-0711-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.13,
"std": 0.0332
},
"pass@4": 0.16,
"pass^4": 0.08
},
"gpt-5-high": {
"total_tasks": 25,
"total_agent_execution_time": 111593.64490675926,
"total_input_tokens": 169300141,
"total_output_tokens": 3367299,
"total_tokens": 172667440,
"total_turns": 2312,
"avg_agent_execution_time": 1115.9364,
"avg_input_tokens": 1693001.41,
"avg_output_tokens": 33672.99,
"avg_total_tokens": 1726674.4,
"avg_turns": 23.12,
"per_run_input_tokens": 42325035.25,
"per_run_output_tokens": 841824.75,
"per_run_cost": 61.324542,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.42,
"std": 0.0447
},
"pass@4": 0.56,
"pass^4": 0.24
},
"kimi-k2-0905": {
"total_tasks": 25,
"total_agent_execution_time": 38055.46840238571,
"total_input_tokens": 135802332,
"total_output_tokens": 216646,
"total_tokens": 136018978,
"total_turns": 2064,
"avg_agent_execution_time": 380.5547,
"avg_input_tokens": 1358023.32,
"avg_output_tokens": 2166.46,
"avg_total_tokens": 1360189.78,
"avg_turns": 20.64,
"per_run_input_tokens": 33950583,
"per_run_output_tokens": 54161.5,
"per_run_cost": 20.505754,
"actual_model_name": "kimi-k2-0905-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.3,
"std": 0.06
},
"pass@4": 0.4,
"pass^4": 0.2
},
"gpt-5-nano-medium": {
"total_tasks": 25,
"total_agent_execution_time": 17114.76342177391,
"total_input_tokens": 71195262,
"total_output_tokens": 1771003,
"total_tokens": 72966265,
"total_turns": 1852,
"avg_agent_execution_time": 171.1476,
"avg_input_tokens": 711952.62,
"avg_output_tokens": 17710.03,
"avg_total_tokens": 729662.65,
"avg_turns": 18.52,
"per_run_input_tokens": 17798815.5,
"per_run_output_tokens": 442750.75,
"per_run_cost": 1.067041,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0
},
"grok-4": {
"total_tasks": 25,
"total_agent_execution_time": 27723.676310300827,
"total_input_tokens": 126490610,
"total_output_tokens": 663708,
"total_tokens": 127154318,
"total_turns": 2005,
"avg_agent_execution_time": 277.2368,
"avg_input_tokens": 1264906.1,
"avg_output_tokens": 6637.08,
"avg_total_tokens": 1271543.18,
"avg_turns": 20.05,
"per_run_input_tokens": 31622652.5,
"per_run_output_tokens": 165927,
"per_run_cost": 97.356863,
"actual_model_name": "grok-4-0709",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.35,
"std": 0.0768
},
"pass@4": 0.48,
"pass^4": 0.2
},
"claude-sonnet-4-high": {
"total_tasks": 25,
"total_agent_execution_time": 26188.123467206955,
"total_input_tokens": 203767484,
"total_output_tokens": 387535,
"total_tokens": 204155019,
"total_turns": 2003,
"avg_agent_execution_time": 261.8812,
"avg_input_tokens": 2037674.84,
"avg_output_tokens": 3875.35,
"avg_total_tokens": 2041550.19,
"avg_turns": 20.03,
"per_run_input_tokens": 50941871,
"per_run_output_tokens": 96883.75,
"per_run_cost": 154.278869,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.26,
"std": 0.02
},
"pass@4": 0.28,
"pass^4": 0.24
},
"gpt-oss-120b": {
"total_tasks": 25,
"total_agent_execution_time": 3726.188953638077,
"total_input_tokens": 13933005,
"total_output_tokens": 126750,
"total_tokens": 14059755,
"total_turns": 721,
"avg_agent_execution_time": 37.2619,
"avg_input_tokens": 139330.05,
"avg_output_tokens": 1267.5,
"avg_total_tokens": 140597.55,
"avg_turns": 7.21,
"per_run_input_tokens": 3483251.25,
"per_run_output_tokens": 31687.5,
"per_run_cost": 0.259667,
"actual_model_name": "gpt-oss-120b",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.03,
"std": 0.0173
},
"pass@4": 0.04,
"pass^4": 0
},
"o4-mini": {
"total_tasks": 25,
"total_agent_execution_time": 53063.511632204056,
"total_input_tokens": 86250613,
"total_output_tokens": 1807045,
"total_tokens": 88057658,
"total_turns": 1770,
"avg_agent_execution_time": 530.6351,
"avg_input_tokens": 862506.13,
"avg_output_tokens": 18070.45,
"avg_total_tokens": 880576.58,
"avg_turns": 17.7,
"per_run_input_tokens": 21562653.25,
"per_run_output_tokens": 451761.25,
"per_run_cost": 25.706668,
"actual_model_name": "o4-mini-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.12,
"std": 0.0283
},
"pass@4": 0.28,
"pass^4": 0
},
"gpt-4-1-mini": {
"total_tasks": 25,
"total_agent_execution_time": 19571.90656900406,
"total_input_tokens": 495914086,
"total_output_tokens": 327829,
"total_tokens": 496241915,
"total_turns": 3133,
"avg_agent_execution_time": 195.7191,
"avg_input_tokens": 4959140.86,
"avg_output_tokens": 3278.29,
"avg_total_tokens": 4962419.15,
"avg_turns": 31.33,
"per_run_input_tokens": 123978521.5,
"per_run_output_tokens": 81957.25,
"per_run_cost": 49.72254,
"actual_model_name": "gpt-4.1-mini-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0
}
},
"postgres": {
"deepseek-chat": {
"total_tasks": 21,
"total_agent_execution_time": 29866.240534305573,
"total_input_tokens": 26594178,
"total_output_tokens": 390257,
"total_tokens": 26984435,
"total_turns": 2224,
"avg_agent_execution_time": 355.5505,
"avg_input_tokens": 316597.3571,
"avg_output_tokens": 4645.9167,
"avg_total_tokens": 321243.2738,
"avg_turns": 26.4762,
"per_run_input_tokens": 6648544.5,
"per_run_output_tokens": 97564.25,
"per_run_cost": 3.887093,
"actual_model_name": "deepseek-v3.1-non-think",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.4286,
"std": 0.0753
},
"pass@4": 0.619,
"pass^4": 0.2857
},
"gpt-4-1": {
"total_tasks": 21,
"total_agent_execution_time": 2425.0810141563416,
"total_input_tokens": 4628898,
"total_output_tokens": 100826,
"total_tokens": 4729724,
"total_turns": 682,
"avg_agent_execution_time": 28.87,
"avg_input_tokens": 55105.9286,
"avg_output_tokens": 1200.3095,
"avg_total_tokens": 56306.2381,
"avg_turns": 8.119,
"per_run_input_tokens": 1157224.5,
"per_run_output_tokens": 25206.5,
"per_run_cost": 2.516101,
"actual_model_name": "gpt-4.1-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.0476,
"std": 0
},
"pass@4": 0.0476,
"pass^4": 0.0476
},
"claude-opus-4-1": {
"total_tasks": 21,
"total_agent_execution_time": 10822.950607061386,
"total_input_tokens": 5474339,
"total_output_tokens": 205732,
"total_tokens": 5680071,
"total_turns": 522,
"avg_agent_execution_time": 515.3786,
"avg_input_tokens": 260682.8095,
"avg_output_tokens": 9796.7619,
"avg_total_tokens": 270479.5714,
"avg_turns": 24.8571,
"per_run_input_tokens": 5474339,
"per_run_output_tokens": 205732,
"per_run_cost": 97.544985,
"actual_model_name": "claude-opus-4-1-20250805",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.3333,
"std": 0
}
},
"glm-4-5": {
"total_tasks": 21,
"total_agent_execution_time": 13283.361280679703,
"total_input_tokens": 17186959,
"total_output_tokens": 431440,
"total_tokens": 17618399,
"total_turns": 2133,
"avg_agent_execution_time": 158.1353,
"avg_input_tokens": 204606.6548,
"avg_output_tokens": 5136.1905,
"avg_total_tokens": 209742.8452,
"avg_turns": 25.3929,
"per_run_input_tokens": 4296739.75,
"per_run_output_tokens": 107860,
"per_run_cost": 1.560299,
"actual_model_name": "glm-4.5",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1429,
"std": 0.0753
},
"pass@4": 0.2381,
"pass^4": 0
},
"o3": {
"total_tasks": 21,
"total_agent_execution_time": 6351.111663579941,
"total_input_tokens": 5338793,
"total_output_tokens": 396424,
"total_tokens": 5735217,
"total_turns": 900,
"avg_agent_execution_time": 75.6085,
"avg_input_tokens": 63557.0595,
"avg_output_tokens": 4719.3333,
"avg_total_tokens": 68276.3929,
"avg_turns": 10.7143,
"per_run_input_tokens": 1334698.25,
"per_run_output_tokens": 99106,
"per_run_cost": 3.462244,
"actual_model_name": "o3-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.369,
"std": 0.0395
},
"pass@4": 0.6667,
"pass^4": 0.1429
},
"claude-sonnet-4-low": {
"total_tasks": 21,
"total_agent_execution_time": 15643.686558961868,
"total_input_tokens": 56619774,
"total_output_tokens": 560879,
"total_tokens": 57180653,
"total_turns": 2040,
"avg_agent_execution_time": 186.2344,
"avg_input_tokens": 674044.9286,
"avg_output_tokens": 6677.131,
"avg_total_tokens": 680722.0595,
"avg_turns": 24.2857,
"per_run_input_tokens": 14154943.5,
"per_run_output_tokens": 140219.75,
"per_run_cost": 44.568127,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.4881,
"std": 0.0704
},
"pass@4": 0.7143,
"pass^4": 0.3333
},
"gemini-2-5-pro": {
"total_tasks": 21,
"total_agent_execution_time": 7857.584183454514,
"total_input_tokens": 3338499,
"total_output_tokens": 748442,
"total_tokens": 4086941,
"total_turns": 626,
"avg_agent_execution_time": 93.5427,
"avg_input_tokens": 39744.0357,
"avg_output_tokens": 8910.0238,
"avg_total_tokens": 48654.0595,
"avg_turns": 7.4524,
"per_run_input_tokens": 834624.75,
"per_run_output_tokens": 187110.5,
"per_run_cost": 4.893219,
"actual_model_name": "gemini-2.5-pro",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.2619,
"std": 0.079
},
"pass@4": 0.4762,
"pass^4": 0.0952
},
"qwen-3-coder-plus": {
"total_tasks": 21,
"total_agent_execution_time": 11836.674258947372,
"total_input_tokens": 48207822,
"total_output_tokens": 430956,
"total_tokens": 48638778,
"total_turns": 2436,
"avg_agent_execution_time": 140.9128,
"avg_input_tokens": 573902.6429,
"avg_output_tokens": 5130.4286,
"avg_total_tokens": 579033.0714,
"avg_turns": 29,
"per_run_input_tokens": 12051955.5,
"per_run_output_tokens": 107739,
"per_run_cost": 2.496582,
"actual_model_name": "qwen3-coder-480b-a35b-instruct",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.4762,
"std": 0.0583
},
"pass@4": 0.619,
"pass^4": 0.381
},
"gpt-5-mini-high": {
"total_tasks": 21,
"total_agent_execution_time": 16902.33143734932,
"total_input_tokens": 13472021,
"total_output_tokens": 1982560,
"total_tokens": 15454581,
"total_turns": 1177,
"avg_agent_execution_time": 201.2182,
"avg_input_tokens": 160381.2024,
"avg_output_tokens": 23601.9048,
"avg_total_tokens": 183983.1071,
"avg_turns": 14.0119,
"per_run_input_tokens": 3368005.25,
"per_run_output_tokens": 495640,
"per_run_cost": 1.833281,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.6667,
"std": 0.0337
},
"pass@4": 0.8095,
"pass^4": 0.4286
},
"gpt-5-nano-low": {
"total_tasks": 21,
"total_agent_execution_time": 6595.001572132111,
"total_input_tokens": 2014372,
"total_output_tokens": 1323958,
"total_tokens": 3338330,
"total_turns": 569,
"avg_agent_execution_time": 78.5119,
"avg_input_tokens": 23980.619,
"avg_output_tokens": 15761.4048,
"avg_total_tokens": 39742.0238,
"avg_turns": 6.7738,
"per_run_input_tokens": 503593,
"per_run_output_tokens": 330989.5,
"per_run_cost": 0.157575,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0833,
"std": 0.0395
},
"pass@4": 0.1905,
"pass^4": 0
},
"gpt-5-medium": {
"total_tasks": 21,
"total_agent_execution_time": 28409.77977871895,
"total_input_tokens": 9521385,
"total_output_tokens": 1431337,
"total_tokens": 10952722,
"total_turns": 1123,
"avg_agent_execution_time": 338.2117,
"avg_input_tokens": 113349.8214,
"avg_output_tokens": 17039.7262,
"avg_total_tokens": 130389.5476,
"avg_turns": 13.369,
"per_run_input_tokens": 2380346.25,
"per_run_output_tokens": 357834.25,
"per_run_cost": 6.553775,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.7619,
"std": 0.0753
},
"pass@4": 1,
"pass^4": 0.4762
},
"claude-sonnet-4": {
"total_tasks": 21,
"total_agent_execution_time": 20113.901407003403,
"total_input_tokens": 27812468,
"total_output_tokens": 633803,
"total_tokens": 28446271,
"total_turns": 2251,
"avg_agent_execution_time": 239.4512,
"avg_input_tokens": 331100.8095,
"avg_output_tokens": 7545.2738,
"avg_total_tokens": 338646.0833,
"avg_turns": 26.7976,
"per_run_input_tokens": 6953117,
"per_run_output_tokens": 158450.75,
"per_run_cost": 23.236112,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.5357,
"std": 0.0619
},
"pass@4": 0.7143,
"pass^4": 0.381
},
"gpt-5-low": {
"total_tasks": 21,
"total_agent_execution_time": 22876.62664246559,
"total_input_tokens": 8603642,
"total_output_tokens": 1368540,
"total_tokens": 9972182,
"total_turns": 1051,
"avg_agent_execution_time": 272.3408,
"avg_input_tokens": 102424.3095,
"avg_output_tokens": 16292.1429,
"avg_total_tokens": 118716.4524,
"avg_turns": 12.5119,
"per_run_input_tokens": 2150910.5,
"per_run_output_tokens": 342135,
"per_run_cost": 6.109988,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.7381,
"std": 0.0412
},
"pass@4": 0.9524,
"pass^4": 0.381
},
"gpt-5-mini-medium": {
"total_tasks": 21,
"total_agent_execution_time": 8105.054537057877,
"total_input_tokens": 9693898,
"total_output_tokens": 778810,
"total_tokens": 10472708,
"total_turns": 989,
"avg_agent_execution_time": 96.4887,
"avg_input_tokens": 115403.5476,
"avg_output_tokens": 9271.5476,
"avg_total_tokens": 124675.0952,
"avg_turns": 11.7738,
"per_run_input_tokens": 2423474.5,
"per_run_output_tokens": 194702.5,
"per_run_cost": 0.995274,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.619,
"std": 0.0583
},
"pass@4": 0.9048,
"pass^4": 0.2857
},
"gpt-4-1-nano": {
"total_tasks": 21,
"total_agent_execution_time": 2732.388694524765,
"total_input_tokens": 5969485,
"total_output_tokens": 203920,
"total_tokens": 6173405,
"total_turns": 733,
"avg_agent_execution_time": 32.5284,
"avg_input_tokens": 71065.2976,
"avg_output_tokens": 2427.619,
"avg_total_tokens": 73492.9167,
"avg_turns": 8.7262,
"per_run_input_tokens": 1492371.25,
"per_run_output_tokens": 50980,
"per_run_cost": 0.169629,
"actual_model_name": "gpt-4.1-nano-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0
},
"grok-code-fast-1": {
"total_tasks": 21,
"total_agent_execution_time": 4306.698365211487,
"total_input_tokens": 19018054,
"total_output_tokens": 458508,
"total_tokens": 19476562,
"total_turns": 1655,
"avg_agent_execution_time": 51.2702,
"avg_input_tokens": 226405.4048,
"avg_output_tokens": 5458.4286,
"avg_total_tokens": 231863.8333,
"avg_turns": 19.7024,
"per_run_input_tokens": 4754513.5,
"per_run_output_tokens": 114627,
"per_run_cost": 1.122843,
"actual_model_name": "grok-code-fast-1",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.4762,
"std": 0.0476
},
"pass@4": 0.619,
"pass^4": 0.2857
},
"gemini-2-5-flash": {
"total_tasks": 21,
"total_agent_execution_time": 5116.033502817154,
"total_input_tokens": 3870615,
"total_output_tokens": 834214,
"total_tokens": 4704829,
"total_turns": 736,
"avg_agent_execution_time": 60.9052,
"avg_input_tokens": 46078.75,
"avg_output_tokens": 9931.119,
"avg_total_tokens": 56009.869,
"avg_turns": 8.7619,
"per_run_input_tokens": 967653.75,
"per_run_output_tokens": 208553.5,
"per_run_cost": 0.81168,
"actual_model_name": "gemini-2.5-flash",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1071,
"std": 0.0619
},
"pass@4": 0.2381,
"pass^4": 0.0476
},
"gpt-5-nano-high": {
"total_tasks": 21,
"total_agent_execution_time": 25844.52258706093,
"total_input_tokens": 15962048,
"total_output_tokens": 5159736,
"total_tokens": 21121784,
"total_turns": 1311,
"avg_agent_execution_time": 307.6729,
"avg_input_tokens": 190024.381,
"avg_output_tokens": 61425.4286,
"avg_total_tokens": 251449.8095,
"avg_turns": 15.6071,
"per_run_input_tokens": 3990512,
"per_run_output_tokens": 1289934,
"per_run_cost": 0.715499,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0952,
"std": 0.0337
},
"pass@4": 0.3333,
"pass^4": 0
},
"gpt-5-mini-low": {
"total_tasks": 21,
"total_agent_execution_time": 4425.5045709609985,
"total_input_tokens": 2294861,
"total_output_tokens": 393050,
"total_tokens": 2687911,
"total_turns": 491,
"avg_agent_execution_time": 52.6846,
"avg_input_tokens": 27319.7738,
"avg_output_tokens": 4679.1667,
"avg_total_tokens": 31998.9405,
"avg_turns": 5.8452,
"per_run_input_tokens": 573715.25,
"per_run_output_tokens": 98262.5,
"per_run_cost": 0.339954,
"actual_model_name": "gpt-5-mini-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1429,
"std": 0.0337
},
"pass@4": 0.2857,
"pass^4": 0
},
"qwen-3-max": {
"total_tasks": 21,
"total_agent_execution_time": 13406.892842531204,
"total_input_tokens": 16138565,
"total_output_tokens": 412055,
"total_tokens": 16550620,
"total_turns": 1586,
"avg_agent_execution_time": 159.6059,
"avg_input_tokens": 192125.7738,
"avg_output_tokens": 4905.4167,
"avg_total_tokens": 197031.1905,
"avg_turns": 18.881,
"per_run_input_tokens": 4034641.25,
"per_run_output_tokens": 103013.75,
"per_run_cost": 5.459652,
"actual_model_name": "qwen3-max-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.4405,
"std": 0.0206
},
"pass@4": 0.5238,
"pass^4": 0.381
},
"kimi-k2-0711": {
"total_tasks": 21,
"total_agent_execution_time": 20866.895277023315,
"total_input_tokens": 22082538,
"total_output_tokens": 385375,
"total_tokens": 22467913,
"total_turns": 2247,
"avg_agent_execution_time": 248.4154,
"avg_input_tokens": 262887.3571,
"avg_output_tokens": 4587.7976,
"avg_total_tokens": 267475.1548,
"avg_turns": 26.75,
"per_run_input_tokens": 5520634.5,
"per_run_output_tokens": 96343.75,
"per_run_cost": 3.55324,
"actual_model_name": "kimi-k2-0711-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.4048,
"std": 0.079
},
"pass@4": 0.7143,
"pass^4": 0.2857
},
"gpt-5-high": {
"total_tasks": 21,
"total_agent_execution_time": 82147.10680437088,
"total_input_tokens": 13455198,
"total_output_tokens": 2864869,
"total_tokens": 16320067,
"total_turns": 1379,
"avg_agent_execution_time": 977.9417,
"avg_input_tokens": 160180.9286,
"avg_output_tokens": 34105.5833,
"avg_total_tokens": 194286.5119,
"avg_turns": 16.4167,
"per_run_input_tokens": 3363799.5,
"per_run_output_tokens": 716217.25,
"per_run_cost": 11.366922,
"actual_model_name": "gpt-5-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.7262,
"std": 0.0395
},
"pass@4": 0.8571,
"pass^4": 0.5238
},
"kimi-k2-0905": {
"total_tasks": 21,
"total_agent_execution_time": 43449.057983875275,
"total_input_tokens": 37057475,
"total_output_tokens": 451643,
"total_tokens": 37509118,
"total_turns": 2538,
"avg_agent_execution_time": 517.2507,
"avg_input_tokens": 441160.4167,
"avg_output_tokens": 5376.7024,
"avg_total_tokens": 446537.119,
"avg_turns": 30.2143,
"per_run_input_tokens": 9264368.75,
"per_run_output_tokens": 112910.75,
"per_run_cost": 5.840898,
"actual_model_name": "kimi-k2-0905-preview",
"is_open_source_model": true,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.4762,
"std": 0.0476
},
"pass@4": 0.6667,
"pass^4": 0.2857
},
"gpt-5-nano-medium": {
"total_tasks": 21,
"total_agent_execution_time": 10844.792605400085,
"total_input_tokens": 8821599,
"total_output_tokens": 1935384,
"total_tokens": 10756983,
"total_turns": 795,
"avg_agent_execution_time": 129.1047,
"avg_input_tokens": 105019.0357,
"avg_output_tokens": 23040.2857,
"avg_total_tokens": 128059.3214,
"avg_turns": 9.4643,
"per_run_input_tokens": 2205399.75,
"per_run_output_tokens": 483846,
"per_run_cost": 0.303808,
"actual_model_name": "gpt-5-nano-2025-08-07",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.1548,
"std": 0.0519
},
"pass@4": 0.2857,
"pass^4": 0.0476
},
"grok-4": {
"total_tasks": 21,
"total_agent_execution_time": 17157.205701589584,
"total_input_tokens": 15630090,
"total_output_tokens": 691637,
"total_tokens": 16321727,
"total_turns": 1503,
"avg_agent_execution_time": 204.2524,
"avg_input_tokens": 186072.5,
"avg_output_tokens": 8233.7738,
"avg_total_tokens": 194306.2738,
"avg_turns": 17.8929,
"per_run_input_tokens": 3907522.5,
"per_run_output_tokens": 172909.25,
"per_run_cost": 14.316206,
"actual_model_name": "grok-4-0709",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.5833,
"std": 0.0781
},
"pass@4": 0.8095,
"pass^4": 0.381
},
"claude-sonnet-4-high": {
"total_tasks": 21,
"total_agent_execution_time": 13869.583010196686,
"total_input_tokens": 36794175,
"total_output_tokens": 555014,
"total_tokens": 37349189,
"total_turns": 2056,
"avg_agent_execution_time": 165.1141,
"avg_input_tokens": 438025.8929,
"avg_output_tokens": 6607.3095,
"avg_total_tokens": 444633.2024,
"avg_turns": 24.4762,
"per_run_input_tokens": 9198543.75,
"per_run_output_tokens": 138753.5,
"per_run_cost": 29.676934,
"actual_model_name": "claude-sonnet-4-20250514",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.5,
"std": 0.0714
},
"pass@4": 0.6667,
"pass^4": 0.381
},
"gpt-oss-120b": {
"total_tasks": 21,
"total_agent_execution_time": 1959.240404367447,
"total_input_tokens": 1794062,
"total_output_tokens": 119313,
"total_tokens": 1913375,
"total_turns": 426,
"avg_agent_execution_time": 23.3243,
"avg_input_tokens": 21357.881,
"avg_output_tokens": 1420.3929,
"avg_total_tokens": 22778.2738,
"avg_turns": 5.0714,
"per_run_input_tokens": 448515.5,
"per_run_output_tokens": 29828.25,
"per_run_cost": 0.040645,
"actual_model_name": "gpt-oss-120b",
"is_open_source_model": true,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.0714,
"std": 0.0238
},
"pass@4": 0.2381,
"pass^4": 0
},
"o4-mini": {
"total_tasks": 21,
"total_agent_execution_time": 7117.457372426987,
"total_input_tokens": 1337143,
"total_output_tokens": 483676,
"total_tokens": 1820819,
"total_turns": 425,
"avg_agent_execution_time": 84.7316,
"avg_input_tokens": 15918.369,
"avg_output_tokens": 5758.0476,
"avg_total_tokens": 21676.4167,
"avg_turns": 5.0595,
"per_run_input_tokens": 334285.75,
"per_run_output_tokens": 120919,
"per_run_cost": 0.899758,
"actual_model_name": "o4-mini-2025-04-16",
"is_open_source_model": false,
"is_reasoning_model": true,
"pass@1": {
"avg": 0.119,
"std": 0.0412
},
"pass@4": 0.1905,
"pass^4": 0.0476
},
"gpt-4-1-mini": {
"total_tasks": 21,
"total_agent_execution_time": 3538.005044221878,
"total_input_tokens": 3917278,
"total_output_tokens": 149972,
"total_tokens": 4067250,
"total_turns": 821,
"avg_agent_execution_time": 42.1191,
"avg_input_tokens": 46634.2619,
"avg_output_tokens": 1785.381,
"avg_total_tokens": 48419.6429,
"avg_turns": 9.7738,
"per_run_input_tokens": 979319.5,
"per_run_output_tokens": 37493,
"per_run_cost": 0.451717,
"actual_model_name": "gpt-4.1-mini-2025-04-14",
"is_open_source_model": false,
"is_reasoning_model": false,
"pass@1": {
"avg": 0.0952,
"std": 0.0337
},
"pass@4": 0.1429,
"pass^4": 0.0476
}
},
"experiment_name": "mcpmark-release"
}