MCPMark Leaderboard
Generated at 2025-08-25 16:02:00
• claude-4-1-opus was evaluated with a single run only, so only pass@1 is reported for it (no pass@4 or pass^4)
• Token usage for grok-4 could not be retrieved due to compatibility issues, so its token counts and cost are reported as 0
MCP Server Analysis
Detailed performance analysis by MCP server with Pass@1/Pass@4/Pass^4 metrics
Note: view the JSON below for detailed grade information.
JSON
{
"generated_at": "2025-08-25T16:02:00.888958",
"experiment_name": "consolidated-all-services-0823",
"k": 4,
"overall": {
"claude-4-1-opus": {
"total_tasks": 127,
"total_agent_execution_time": 43079.61178779602,
"total_input_tokens": 72916181,
"total_output_tokens": 577066,
"total_tokens": 73493247,
"total_turns": 2167,
"avg_agent_execution_time": 339.2095,
"avg_input_tokens": 574143.1575,
"avg_output_tokens": 4543.8268,
"avg_total_tokens": 578686.9843,
"avg_turns": 17.063,
"per_run_input_tokens": 72916181,
"per_run_output_tokens": 577066,
"per_run_cost": 1137.0227,
"actual_model_name": "claude-opus-4-1-20250805",
"pass@1": {
"avg": 0.2962,
"std": 0
}
},
"claude-4-sonnet": {
"total_tasks": 127,
"total_agent_execution_time": 107301.59874916077,
"total_input_tokens": 310048126,
"total_output_tokens": 2194611,
"total_tokens": 312242737,
"total_turns": 9067,
"avg_agent_execution_time": 211.2236,
"avg_input_tokens": 610330.9567,
"avg_output_tokens": 4320.1004,
"avg_total_tokens": 614651.0571,
"avg_turns": 17.8484,
"per_run_input_tokens": 77512031,
"per_run_output_tokens": 548652,
"per_run_cost": 240.7659,
"actual_model_name": "claude-sonnet-4-20250514",
"pass@1": {
"avg": 0.2763,
"std": 0.0623
},
"pass@4": 0.4385,
"pass^4": 0.1095
},
"deepseek-chat": {
"total_tasks": 127,
"total_agent_execution_time": 134924.74478578568,
"total_input_tokens": 246544002,
"total_output_tokens": 1346283,
"total_tokens": 247890285,
"total_turns": 9624,
"avg_agent_execution_time": 265.5999,
"avg_input_tokens": 485322.8386,
"avg_output_tokens": 2650.1634,
"avg_total_tokens": 487973.002,
"avg_turns": 18.9449,
"per_run_input_tokens": 61636000,
"per_run_output_tokens": 336570,
"per_run_cost": 17.0119,
"actual_model_name": "deepseek-v3.1-non-think",
"pass@1": {
"avg": 0.1631,
"std": 0.0425
},
"pass@4": 0.2757,
"pass^4": 0.0792
},
"gemini-2-5-pro": {
"total_tasks": 127,
"total_agent_execution_time": 54243.50524830818,
"total_input_tokens": 201670879,
"total_output_tokens": 3008536,
"total_tokens": 204679415,
"total_turns": 4411,
"avg_agent_execution_time": 106.7786,
"avg_input_tokens": 396989.9193,
"avg_output_tokens": 5922.315,
"avg_total_tokens": 402912.2343,
"avg_turns": 8.6831,
"per_run_input_tokens": 50417719,
"per_run_output_tokens": 752134,
"per_run_cost": 137.3263,
"actual_model_name": "gemini-2.5-pro",
"pass@1": {
"avg": 0.1222,
"std": 0.036
},
"pass@4": 0.2453,
"pass^4": 0.0404
},
"gpt-5": {
"total_tasks": 127,
"total_agent_execution_time": 195925.04424381256,
"total_input_tokens": 316223124,
"total_output_tokens": 9609603,
"total_tokens": 325832727,
"total_turns": 7131,
"avg_agent_execution_time": 385.6792,
"avg_input_tokens": 622486.4646,
"avg_output_tokens": 18916.5413,
"avg_total_tokens": 641403.0059,
"avg_turns": 14.0374,
"per_run_input_tokens": 79055781,
"per_run_output_tokens": 2402400,
"per_run_cost": 122.8437,
"actual_model_name": "gpt-5-2025-08-07",
"pass@1": {
"avg": 0.4696,
"std": 0.0507
},
"pass@4": 0.6345,
"pass^4": 0.2544
},
"grok-4": {
"total_tasks": 127,
"total_agent_execution_time": 55695.29113435747,
"total_input_tokens": 0,
"total_output_tokens": 0,
"total_tokens": 0,
"total_turns": 3580,
"avg_agent_execution_time": 109.6364,
"avg_input_tokens": 0,
"avg_output_tokens": 0,
"avg_total_tokens": 0,
"avg_turns": 7.0472,
"per_run_input_tokens": 0,
"per_run_output_tokens": 0,
"per_run_cost": 0,
"actual_model_name": "grok-4-0709",
"pass@1": {
"avg": 0.18,
"std": 0.0515
},
"pass@4": 0.295,
"pass^4": 0.0676
},
"k2": {
"total_tasks": 127,
"total_agent_execution_time": 108763.46826410294,
"total_input_tokens": 209937443,
"total_output_tokens": 1262180,
"total_tokens": 211199623,
"total_turns": 8711,
"avg_agent_execution_time": 214.1013,
"avg_input_tokens": 413262.6831,
"avg_output_tokens": 2484.6063,
"avg_total_tokens": 415747.2894,
"avg_turns": 17.1476,
"per_run_input_tokens": 52484360,
"per_run_output_tokens": 315545,
"per_run_cost": 8.5037,
"actual_model_name": "kimi-k2-0711-preview",
"pass@1": {
"avg": 0.1808,
"std": 0.0568
},
"pass@4": 0.3014,
"pass^4": 0.1061
},
"o3": {
"total_tasks": 127,
"total_agent_execution_time": 86026.92910456657,
"total_input_tokens": 210117262,
"total_output_tokens": 4358963,
"total_tokens": 214476225,
"total_turns": 8351,
"avg_agent_execution_time": 169.3443,
"avg_input_tokens": 413616.6575,
"avg_output_tokens": 8580.6358,
"avg_total_tokens": 422197.2933,
"avg_turns": 16.439,
"per_run_input_tokens": 52529315,
"per_run_output_tokens": 1089740,
"per_run_cost": 105.6035,
"actual_model_name": "o3-2025-04-16",
"pass@1": {
"avg": 0.2498,
"std": 0.0465
},
"pass@4": 0.4337,
"pass^4": 0.1209
},
"qwen-3-coder": {
"total_tasks": 127,
"total_agent_execution_time": 98039.19856882095,
"total_input_tokens": 431046337,
"total_output_tokens": 1557744,
"total_tokens": 432604081,
"total_turns": 10940,
"avg_agent_execution_time": 192.9905,
"avg_input_tokens": 848516.4114,
"avg_output_tokens": 3066.4252,
"avg_total_tokens": 851582.8366,
"avg_turns": 21.5354,
"per_run_input_tokens": 107761584,
"per_run_output_tokens": 389436,
"per_run_cost": 32.9126,
"actual_model_name": "qwen3-coder-480b-a35b-instruct",
"pass@1": {
"avg": 0.1871,
"std": 0.0585
},
"pass@4": 0.3211,
"pass^4": 0.0809
}
},
"filesystem": {
"claude-4-1-opus": {
"total_tasks": 30,
"pass@1": {
"avg": 0.3333,
"std": 0
},
"total_agent_execution_time": 8033.295060634613,
"total_input_tokens": 8164979,
"total_output_tokens": 130994,
"total_tokens": 8295973,
"total_turns": 491,
"avg_agent_execution_time": 267.7765,
"avg_input_tokens": 272165.9667,
"avg_output_tokens": 4366.4667,
"avg_total_tokens": 276532.4333,
"avg_turns": 16.3667,
"per_run_input_tokens": 8164979,
"per_run_output_tokens": 130994,
"per_run_cost": 132.2992,
"actual_model_name": "claude-opus-4-1-20250805"
},
"claude-4-sonnet": {
"total_tasks": 30,
"pass@1": {
"avg": 0.275,
"std": 0.0319
},
"pass@4": 0.5,
"pass^4": 0.0667,
"total_agent_execution_time": 23177.59760403633,
"total_input_tokens": 36265545,
"total_output_tokens": 479856,
"total_tokens": 36745401,
"total_turns": 1922,
"avg_agent_execution_time": 193.1466,
"avg_input_tokens": 302212.875,
"avg_output_tokens": 3998.8,
"avg_total_tokens": 306211.675,
"avg_turns": 16.0167,
"per_run_input_tokens": 9066386,
"per_run_output_tokens": 119964,
"per_run_cost": 28.9986,
"actual_model_name": "claude-sonnet-4-20250514"
},
"deepseek-chat": {
"total_tasks": 30,
"pass@1": {
"avg": 0.1583,
"std": 0.0167
},
"pass@4": 0.2667,
"pass^4": 0.0667,
"total_agent_execution_time": 35387.34510588646,
"total_input_tokens": 57387768,
"total_output_tokens": 413190,
"total_tokens": 57800958,
"total_turns": 2996,
"avg_agent_execution_time": 294.8945,
"avg_input_tokens": 478231.4,
"avg_output_tokens": 3443.25,
"avg_total_tokens": 481674.65,
"avg_turns": 24.9667,
"per_run_input_tokens": 14346942,
"per_run_output_tokens": 103297,
"per_run_cost": 3.9873,
"actual_model_name": "deepseek-v3.1-non-think"
},
"gemini-2-5-pro": {
"total_tasks": 30,
"pass@1": {
"avg": 0.1667,
"std": 0.0272
},
"pass@4": 0.3,
"pass^4": 0.0667,
"total_agent_execution_time": 14496.54878783226,
"total_input_tokens": 21693516,
"total_output_tokens": 816074,
"total_tokens": 22509590,
"total_turns": 1289,
"avg_agent_execution_time": 120.8046,
"avg_input_tokens": 180779.3,
"avg_output_tokens": 6800.6167,
"avg_total_tokens": 187579.9167,
"avg_turns": 10.7417,
"per_run_input_tokens": 5423379,
"per_run_output_tokens": 204018,
"per_run_cost": 16.6187,
"actual_model_name": "gemini-2.5-pro"
},
"gpt-5": {
"total_tasks": 30,
"pass@1": {
"avg": 0.5417,
"std": 0.0788
},
"pass@4": 0.7333,
"pass^4": 0.3333,
"total_agent_execution_time": 33070.53626990318,
"total_input_tokens": 34189054,
"total_output_tokens": 1920255,
"total_tokens": 36109309,
"total_turns": 1295,
"avg_agent_execution_time": 275.5878,
"avg_input_tokens": 284908.7833,
"avg_output_tokens": 16002.125,
"avg_total_tokens": 300910.9083,
"avg_turns": 10.7917,
"per_run_input_tokens": 8547263,
"per_run_output_tokens": 480063,
"per_run_cost": 15.4847,
"actual_model_name": "gpt-5-2025-08-07"
},
"grok-4": {
"total_tasks": 30,
"pass@1": {
"avg": 0.2167,
"std": 0.0694
},
"pass@4": 0.4333,
"pass^4": 0.1,
"total_agent_execution_time": 24664.243208885193,
"total_input_tokens": 0,
"total_output_tokens": 0,
"total_tokens": 0,
"total_turns": 998,
"avg_agent_execution_time": 205.5354,
"avg_input_tokens": 0,
"avg_output_tokens": 0,
"avg_total_tokens": 0,
"avg_turns": 8.3167,
"per_run_input_tokens": 0,
"per_run_output_tokens": 0,
"per_run_cost": 0,
"actual_model_name": "grok-4-0709"
},
"k2": {
"total_tasks": 30,
"pass@1": {
"avg": 0.2,
"std": 0.0272
},
"pass@4": 0.3,
"pass^4": 0.1333,
"total_agent_execution_time": 26445.638306856155,
"total_input_tokens": 43057637,
"total_output_tokens": 421500,
"total_tokens": 43479137,
"total_turns": 2406,
"avg_agent_execution_time": 220.3803,
"avg_input_tokens": 358813.6417,
"avg_output_tokens": 3512.5,
"avg_total_tokens": 362326.1417,
"avg_turns": 20.05,
"per_run_input_tokens": 10764409,
"per_run_output_tokens": 105375,
"per_run_cost": 1.8254,
"actual_model_name": "kimi-k2-0711-preview"
},
"o3": {
"total_tasks": 30,
"pass@1": {
"avg": 0.3583,
"std": 0.0319
},
"pass@4": 0.5,
"pass^4": 0.2667,
"total_agent_execution_time": 33353.133106946945,
"total_input_tokens": 82757119,
"total_output_tokens": 2135261,
"total_tokens": 84892380,
"total_turns": 3455,
"avg_agent_execution_time": 277.9428,
"avg_input_tokens": 689642.6583,
"avg_output_tokens": 17793.8417,
"avg_total_tokens": 707436.5,
"avg_turns": 28.7917,
"per_run_input_tokens": 20689279,
"per_run_output_tokens": 533815,
"per_run_cost": 41.6455,
"actual_model_name": "o3-2025-04-16"
},
"qwen-3-coder": {
"total_tasks": 30,
"pass@1": {
"avg": 0.1,
"std": 0.0272
},
"pass@4": 0.1667,
"pass^4": 0.0667,
"total_agent_execution_time": 17427.514757871628,
"total_input_tokens": 50805263,
"total_output_tokens": 434272,
"total_tokens": 51239535,
"total_turns": 2848,
"avg_agent_execution_time": 145.2293,
"avg_input_tokens": 423377.1917,
"avg_output_tokens": 3618.9333,
"avg_total_tokens": 426996.125,
"avg_turns": 23.7333,
"per_run_input_tokens": 12701315,
"per_run_output_tokens": 108568,
"per_run_cost": 3.9732,
"actual_model_name": "qwen-3-coder-480b"
}
},
"github": {
"claude-4-1-opus": {
"total_tasks": 23,
"pass@1": {
"avg": 0.2174,
"std": 0
},
"total_agent_execution_time": 6103.758730888367,
"total_input_tokens": 12759916,
"total_output_tokens": 58089,
"total_tokens": 12818005,
"total_turns": 201,
"avg_agent_execution_time": 265.3808,
"avg_input_tokens": 554778.9565,
"avg_output_tokens": 2525.6087,
"avg_total_tokens": 557304.5652,
"avg_turns": 8.7391,
"per_run_input_tokens": 12759916,
"per_run_output_tokens": 58089,
"per_run_cost": 195.7554,
"actual_model_name": "claude-opus-4-1-20250805"
},
"claude-4-sonnet": {
"total_tasks": 23,
"pass@1": {
"avg": 0.1413,
"std": 0.0743
},
"pass@4": 0.2609,
"pass^4": 0.0435,
"total_agent_execution_time": 15039.942813634872,
"total_input_tokens": 54684140,
"total_output_tokens": 295616,
"total_tokens": 54979756,
"total_turns": 856,
"avg_agent_execution_time": 163.4776,
"avg_input_tokens": 594392.8261,
"avg_output_tokens": 3213.2174,
"avg_total_tokens": 597606.0435,
"avg_turns": 9.3043,
"per_run_input_tokens": 13671035,
"per_run_output_tokens": 73904,
"per_run_cost": 42.1217,
"actual_model_name": "claude-sonnet-4-20250514"
},
"deepseek-chat": {
"total_tasks": 23,
"pass@1": {
"avg": 0.087,
"std": 0.0355
},
"pass@4": 0.1304,
"pass^4": 0.0435,
"total_agent_execution_time": 17183.911482810974,
"total_input_tokens": 28838553,
"total_output_tokens": 179346,
"total_tokens": 29017899,
"total_turns": 771,
"avg_agent_execution_time": 186.7816,
"avg_input_tokens": 313462.5326,
"avg_output_tokens": 1949.413,
"avg_total_tokens": 315411.9457,
"avg_turns": 8.3804,
"per_run_input_tokens": 7209638,
"per_run_output_tokens": 44836,
"per_run_cost": 1.9959,
"actual_model_name": "deepseek-v3.1-non-think"
},
"gemini-2-5-pro": {
"total_tasks": 23,
"pass@1": {
"avg": 0.0326,
"std": 0.0416
},
"pass@4": 0.1304,
"pass^4": 0,
"total_agent_execution_time": 6112.180836439133,
"total_input_tokens": 3034667,
"total_output_tokens": 418333,
"total_tokens": 3453000,
"total_turns": 263,
"avg_agent_execution_time": 66.4367,
"avg_input_tokens": 32985.5109,
"avg_output_tokens": 4547.0978,
"avg_total_tokens": 37532.6087,
"avg_turns": 2.8587,
"per_run_input_tokens": 758666,
"per_run_output_tokens": 104583,
"per_run_cost": 3.4654,
"actual_model_name": "gemini-2.5-pro"
},
"gpt-5": {
"total_tasks": 23,
"pass@1": {
"avg": 0.2609,
"std": 0.0355
},
"pass@4": 0.3913,
"pass^4": 0.1304,
"total_agent_execution_time": 24182.901379346848,
"total_input_tokens": 52951730,
"total_output_tokens": 1079307,
"total_tokens": 54031037,
"total_turns": 860,
"avg_agent_execution_time": 262.8576,
"avg_input_tokens": 575562.2826,
"avg_output_tokens": 11731.5978,
"avg_total_tokens": 587293.8804,
"avg_turns": 9.3478,
"per_run_input_tokens": 13237932,
"per_run_output_tokens": 269826,
"per_run_cost": 19.2457,
"actual_model_name": "gpt-5-2025-08-07"
},
"grok-4": {
"total_tasks": 23,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0,
"total_agent_execution_time": 2796.729387998581,
"total_input_tokens": 0,
"total_output_tokens": 0,
"total_tokens": 0,
"total_turns": 0,
"avg_agent_execution_time": 30.3992,
"avg_input_tokens": 0,
"avg_output_tokens": 0,
"avg_total_tokens": 0,
"avg_turns": 0,
"per_run_input_tokens": 0,
"per_run_output_tokens": 0,
"per_run_cost": 0,
"actual_model_name": "grok-4-0709"
},
"k2": {
"total_tasks": 23,
"pass@1": {
"avg": 0.0761,
"std": 0.0547
},
"pass@4": 0.1304,
"pass^4": 0,
"total_agent_execution_time": 15361.710830926895,
"total_input_tokens": 27315814,
"total_output_tokens": 125811,
"total_tokens": 27441625,
"total_turns": 640,
"avg_agent_execution_time": 166.9751,
"avg_input_tokens": 296911.0217,
"avg_output_tokens": 1367.5109,
"avg_total_tokens": 298278.5326,
"avg_turns": 6.9565,
"per_run_input_tokens": 6828953,
"per_run_output_tokens": 31452,
"per_run_cost": 1.0872,
"actual_model_name": "kimi-k2-0711-preview"
},
"o3": {
"total_tasks": 23,
"pass@1": {
"avg": 0.1304,
"std": 0.0502
},
"pass@4": 0.2174,
"pass^4": 0.0435,
"total_agent_execution_time": 11734.32154250145,
"total_input_tokens": 41199011,
"total_output_tokens": 320893,
"total_tokens": 41519904,
"total_turns": 829,
"avg_agent_execution_time": 127.547,
"avg_input_tokens": 447815.337,
"avg_output_tokens": 3487.9674,
"avg_total_tokens": 451303.3043,
"avg_turns": 9.0109,
"per_run_input_tokens": 10299752,
"per_run_output_tokens": 80223,
"per_run_cost": 20.6396,
"actual_model_name": "o3-2025-04-16"
},
"qwen-3-coder": {
"total_tasks": 23,
"pass@1": {
"avg": 0.1087,
"std": 0.0561
},
"pass@4": 0.1739,
"pass^4": 0.0435,
"total_agent_execution_time": 18877.776975631714,
"total_input_tokens": 91676725,
"total_output_tokens": 239313,
"total_tokens": 91916038,
"total_turns": 1123,
"avg_agent_execution_time": 205.1932,
"avg_input_tokens": 996486.1413,
"avg_output_tokens": 2601.2283,
"avg_total_tokens": 999087.3696,
"avg_turns": 12.2065,
"per_run_input_tokens": 22919181,
"per_run_output_tokens": 59828,
"per_run_cost": 6.9655,
"actual_model_name": "qwen3-coder-480b-a35b-instruct"
}
},
"notion": {
"claude-4-1-opus": {
"total_tasks": 28,
"pass@1": {
"avg": 0.3571,
"std": 0
},
"total_agent_execution_time": 8238.807499885559,
"total_input_tokens": 17865599,
"total_output_tokens": 110144,
"total_tokens": 17975743,
"total_turns": 477,
"avg_agent_execution_time": 294.2431,
"avg_input_tokens": 638057.1071,
"avg_output_tokens": 3933.7143,
"avg_total_tokens": 641990.8214,
"avg_turns": 17.0357,
"per_run_input_tokens": 17865599,
"per_run_output_tokens": 110144,
"per_run_cost": 276.2448,
"actual_model_name": "claude-opus-4-1-20250805"
},
"claude-4-sonnet": {
"total_tasks": 28,
"pass@1": {
"avg": 0.2054,
"std": 0.0449
},
"pass@4": 0.3571,
"pass^4": 0.0714,
"total_agent_execution_time": 21727.141927719116,
"total_input_tokens": 68866519,
"total_output_tokens": 459748,
"total_tokens": 69326267,
"total_turns": 2153,
"avg_agent_execution_time": 193.9923,
"avg_input_tokens": 614879.6339,
"avg_output_tokens": 4104.8929,
"avg_total_tokens": 618984.5268,
"avg_turns": 19.2232,
"per_run_input_tokens": 17216629,
"per_run_output_tokens": 114937,
"per_run_cost": 53.3739,
"actual_model_name": "claude-sonnet-4-20250514"
},
"deepseek-chat": {
"total_tasks": 28,
"pass@1": {
"avg": 0.1071,
"std": 0.0505
},
"pass@4": 0.25,
"pass^4": 0,
"total_agent_execution_time": 26992.861032247543,
"total_input_tokens": 53985039,
"total_output_tokens": 247605,
"total_tokens": 54232644,
"total_turns": 1981,
"avg_agent_execution_time": 241.0077,
"avg_input_tokens": 482009.2768,
"avg_output_tokens": 2210.7589,
"avg_total_tokens": 484220.0357,
"avg_turns": 17.6875,
"per_run_input_tokens": 13496259,
"per_run_output_tokens": 61901,
"per_run_cost": 3.7121,
"actual_model_name": "deepseek-v3.1-non-think"
},
"gemini-2-5-pro": {
"total_tasks": 28,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0,
"total_agent_execution_time": 8772.349657773972,
"total_input_tokens": 6688452,
"total_output_tokens": 514692,
"total_tokens": 7203144,
"total_turns": 417,
"avg_agent_execution_time": 78.3246,
"avg_input_tokens": 59718.3214,
"avg_output_tokens": 4595.4643,
"avg_total_tokens": 64313.7857,
"avg_turns": 3.7232,
"per_run_input_tokens": 1672113,
"per_run_output_tokens": 128673,
"per_run_cost": 6.1104,
"actual_model_name": "gemini-2.5-pro"
},
"gpt-5": {
"total_tasks": 28,
"pass@1": {
"avg": 0.3571,
"std": 0.0714
},
"pass@4": 0.5357,
"pass^4": 0.1071,
"total_agent_execution_time": 63109.95818090439,
"total_input_tokens": 47626092,
"total_output_tokens": 3369147,
"total_tokens": 50995239,
"total_turns": 1504,
"avg_agent_execution_time": 563.4818,
"avg_input_tokens": 425232.9643,
"avg_output_tokens": 30081.6696,
"avg_total_tokens": 455314.6339,
"avg_turns": 13.4286,
"per_run_input_tokens": 11906523,
"per_run_output_tokens": 842286,
"per_run_cost": 23.306,
"actual_model_name": "gpt-5-2025-08-07"
},
"grok-4": {
"total_tasks": 28,
"pass@1": {
"avg": 0,
"std": 0
},
"pass@4": 0,
"pass^4": 0,
"total_agent_execution_time": 792.0980796814058,
"total_input_tokens": 0,
"total_output_tokens": 0,
"total_tokens": 0,
"total_turns": 0,
"avg_agent_execution_time": 7.0723,
"avg_input_tokens": 0,
"avg_output_tokens": 0,
"avg_total_tokens": 0,
"avg_turns": 0,
"per_run_input_tokens": 0,
"per_run_output_tokens": 0,
"per_run_cost": 0,
"actual_model_name": "grok-4-0709"
},
"k2": {
"total_tasks": 28,
"pass@1": {
"avg": 0.125,
"std": 0.0619
},
"pass@4": 0.25,
"pass^4": 0.0714,
"total_agent_execution_time": 18588.57569885254,
"total_input_tokens": 46505937,
"total_output_tokens": 239141,
"total_tokens": 46745078,
"total_turns": 1798,
"avg_agent_execution_time": 165.9694,
"avg_input_tokens": 415231.5804,
"avg_output_tokens": 2135.1875,
"avg_total_tokens": 417366.7679,
"avg_turns": 16.0536,
"per_run_input_tokens": 11626484,
"per_run_output_tokens": 59785,
"per_run_cost": 1.8635,
"actual_model_name": "kimi-k2-0711-preview"
},
"o3": {
"total_tasks": 28,
"pass@1": {
"avg": 0.2411,
"std": 0.0449
},
"pass@4": 0.4643,
"pass^4": 0.0714,
"total_agent_execution_time": 19195.927572011948,
"total_input_tokens": 25191983,
"total_output_tokens": 1060378,
"total_tokens": 26252361,
"total_turns": 1537,
"avg_agent_execution_time": 171.3922,
"avg_input_tokens": 224928.4196,
"avg_output_tokens": 9467.6607,
"avg_total_tokens": 234396.0804,
"avg_turns": 13.7232,
"per_run_input_tokens": 6297995,
"per_run_output_tokens": 265094,
"per_run_cost": 12.7285,
"actual_model_name": "o3-2025-04-16"
},
"qwen-3-coder": {
"total_tasks": 28,
"pass@1": {
"avg": 0.1339,
"std": 0.0342
},
"pass@4": 0.2857,
"pass^4": 0.0714,
"total_agent_execution_time": 17790.901881217957,
"total_input_tokens": 77327639,
"total_output_tokens": 309166,
"total_tokens": 77636805,
"total_turns": 2288,
"avg_agent_execution_time": 158.8473,
"avg_input_tokens": 690425.3482,
"avg_output_tokens": 2760.4107,
"avg_total_tokens": 693185.7589,
"avg_turns": 20.4286,
"per_run_input_tokens": 19331909,
"per_run_output_tokens": 77291,
"per_run_cost": 5.9155,
"actual_model_name": "qwen3-coder-480b-a35b-instruct"
}
},
"playwright": {
"claude-4-1-opus": {
"total_tasks": 25,
"pass@1": {
"avg": 0.24,
"std": 0
},
"total_agent_execution_time": 9880.799889326096,
"total_input_tokens": 28651348,
"total_output_tokens": 72107,
"total_tokens": 28723455,
"total_turns": 476,
"avg_agent_execution_time": 395.232,
"avg_input_tokens": 1146053.92,
"avg_output_tokens": 2884.28,
"avg_total_tokens": 1148938.2,
"avg_turns": 19.04,
"per_run_input_tokens": 28651348,
"per_run_output_tokens": 72107,
"per_run_cost": 435.1782,
"actual_model_name": "claude-opus-4-1-20250805"
},
"claude-4-sonnet": {
"total_tasks": 25,
"pass@1": {
"avg": 0.26,
"std": 0.0693
},
"pass@4": 0.36,
"pass^4": 0.08,
"total_agent_execution_time": 27847.038078069687,
"total_input_tokens": 124000371,
"total_output_tokens": 351250,
"total_tokens": 124351621,
"total_turns": 1975,
"avg_agent_execution_time": 278.4704,
"avg_input_tokens": 1240003.71,
"avg_output_tokens": 3512.5,
"avg_total_tokens": 1243516.21,
"avg_turns": 19.75,
"per_run_input_tokens": 31000092,
"per_run_output_tokens": 87812,
"per_run_cost": 94.3175,
"actual_model_name": "claude-sonnet-4-20250514"
},
"deepseek-chat": {
"total_tasks": 25,
"pass@1": {
"avg": 0.07,
"std": 0.0383
},
"pass@4": 0.16,
"pass^4": 0,
"total_agent_execution_time": 28337.25964307785,
"total_input_tokens": 82582456,
"total_output_tokens": 175146,
"total_tokens": 82757602,
"total_turns": 1895,
"avg_agent_execution_time": 283.3726,
"avg_input_tokens": 825824.56,
"avg_output_tokens": 1751.46,
"avg_total_tokens": 827576.02,
"avg_turns": 18.95,
"per_run_input_tokens": 20645614,
"per_run_output_tokens": 43786,
"per_run_cost": 5.6225,
"actual_model_name": "deepseek-v3.1-non-think"
},
"gemini-2-5-pro": {
"total_tasks": 25,
"pass@1": {
"avg": 0.15,
"std": 0.02
},
"pass@4": 0.32,
"pass^4": 0.04,
"total_agent_execution_time": 17044.64262318611,
"total_input_tokens": 167489425,
"total_output_tokens": 548991,
"total_tokens": 168038416,
"total_turns": 1892,
"avg_agent_execution_time": 170.4464,
"avg_input_tokens": 1674894.25,
"avg_output_tokens": 5489.91,
"avg_total_tokens": 1680384.16,
"avg_turns": 18.92,
"per_run_input_tokens": 41872356,
"per_run_output_tokens": 137247,
"per_run_cost": 106.7396,
"actual_model_name": "gemini-2.5-pro"
},
"gpt-5": {
"total_tasks": 25,
"pass@1": {
"avg": 0.45,
"std": 0.02
},
"pass@4": 0.56,
"pass^4": 0.32,
"total_agent_execution_time": 52685.02177119255,
"total_input_tokens": 172852606,
"total_output_tokens": 1872354,
"total_tokens": 174724960,
"total_turns": 2421,
"avg_agent_execution_time": 526.8502,
"avg_input_tokens": 1728526.06,
"avg_output_tokens": 18723.54,
"avg_total_tokens": 1747249.6,
"avg_turns": 24.21,
"per_run_input_tokens": 43213151,
"per_run_output_tokens": 468088,
"per_run_cost": 58.6973,
"actual_model_name": "gpt-5-2025-08-07"
},
"grok-4": {
"total_tasks": 25,
"pass@1": {
"avg": 0.1,
"std": 0.0693
},
"pass@4": 0.28,
"pass^4": 0,
"total_agent_execution_time": 13024.376833677292,
"total_input_tokens": 0,
"total_output_tokens": 0,
"total_tokens": 0,
"total_turns": 1046,
"avg_agent_execution_time": 130.2438,
"avg_input_tokens": 0,
"avg_output_tokens": 0,
"avg_total_tokens": 0,
"avg_turns": 10.46,
"per_run_input_tokens": 0,
"per_run_output_tokens": 0,
"per_run_cost": 0,
"actual_model_name": "grok-4-0709"
},
"k2": {
"total_tasks": 25,
"pass@1": {
"avg": 0.11,
"std": 0.0503
},
"pass@4": 0.16,
"pass^4": 0.04,
"total_agent_execution_time": 22158.128977775574,
"total_input_tokens": 72964759,
"total_output_tokens": 163732,
"total_tokens": 73128491,
"total_turns": 1777,
"avg_agent_execution_time": 221.5813,
"avg_input_tokens": 729647.59,
"avg_output_tokens": 1637.32,
"avg_total_tokens": 731284.91,
"avg_turns": 17.77,
"per_run_input_tokens": 18241189,
"per_run_output_tokens": 40933,
"per_run_cost": 2.818,
"actual_model_name": "kimi-k2-0711-preview"
},
"o3": {
"total_tasks": 25,
"pass@1": {
"avg": 0.15,
"std": 0.06
},
"pass@4": 0.32,
"pass^4": 0.08,
"total_agent_execution_time": 15392.43521952629,
"total_input_tokens": 55630356,
"total_output_tokens": 446007,
"total_tokens": 56076363,
"total_turns": 1630,
"avg_agent_execution_time": 153.9244,
"avg_input_tokens": 556303.56,
"avg_output_tokens": 4460.07,
"avg_total_tokens": 560763.63,
"avg_turns": 16.3,
"per_run_input_tokens": 13907589,
"per_run_output_tokens": 111501,
"per_run_cost": 27.8709,
"actual_model_name": "o3-2025-04-16"
},
"qwen-3-coder": {
"total_tasks": 25,
"pass@1": {
"avg": 0.2,
"std": 0.0327
},
"pass@4": 0.36,
"pass^4": 0.08,
"total_agent_execution_time": 30588.364787101746,
"total_input_tokens": 185286192,
"total_output_tokens": 234766,
"total_tokens": 185520958,
"total_turns": 2384,
"avg_agent_execution_time": 305.8836,
"avg_input_tokens": 1852861.92,
"avg_output_tokens": 2347.66,
"avg_total_tokens": 1855209.58,
"avg_turns": 23.84,
"per_run_input_tokens": 46321548,
"per_run_output_tokens": 58691,
"per_run_cost": 13.9845,
"actual_model_name": "qwen3-coder-480b-a35b-instruct"
}
},
"postgres": {
"claude-4-1-opus": {
"total_tasks": 21,
"pass@1": {
"avg": 0.3333,
"std": 0
},
"total_agent_execution_time": 10822.950607061386,
"total_input_tokens": 5474339,
"total_output_tokens": 205732,
"total_tokens": 5680071,
"total_turns": 522,
"avg_agent_execution_time": 515.3786,
"avg_input_tokens": 260682.8095,
"avg_output_tokens": 9796.7619,
"avg_total_tokens": 270479.5714,
"avg_turns": 24.8571,
"per_run_input_tokens": 5474339,
"per_run_output_tokens": 205732,
"per_run_cost": 97.545,
"actual_model_name": "claude-opus-4-1-20250805"
},
"claude-4-sonnet": {
"total_tasks": 21,
"pass@1": {
"avg": 0.5,
"std": 0.0912
},
"pass@4": 0.7143,
"pass^4": 0.2857,
"total_agent_execution_time": 19509.87832570076,
"total_input_tokens": 26231551,
"total_output_tokens": 608141,
"total_tokens": 26839692,
"total_turns": 2161,
"avg_agent_execution_time": 232.2605,
"avg_input_tokens": 312280.369,
"avg_output_tokens": 7239.7738,
"avg_total_tokens": 319520.1429,
"avg_turns": 25.7262,
"per_run_input_tokens": 6557887,
"per_run_output_tokens": 152035,
"per_run_cost": 21.9542,
"actual_model_name": "claude-sonnet-4-20250514"
},
"deepseek-chat": {
"total_tasks": 21,
"pass@1": {
"avg": 0.3929,
"std": 0.0714
},
"pass@4": 0.5714,
"pass^4": 0.2857,
"total_agent_execution_time": 27023.367521762848,
"total_input_tokens": 23750186,
"total_output_tokens": 330996,
"total_tokens": 24081182,
"total_turns": 1981,
"avg_agent_execution_time": 321.7068,
"avg_input_tokens": 282740.3095,
"avg_output_tokens": 3940.4286,
"avg_total_tokens": 286680.7381,
"avg_turns": 23.5833,
"per_run_input_tokens": 5937546,
"per_run_output_tokens": 82749,
"per_run_cost": 1.6942,
"actual_model_name": "deepseek-v3.1-non-think"
},
"gemini-2-5-pro": {
"total_tasks": 21,
"pass@1": {
"avg": 0.2619,
"std": 0.0912
},
"pass@4": 0.4762,
"pass^4": 0.0952,
"total_agent_execution_time": 7817.783343076706,
"total_input_tokens": 2764819,
"total_output_tokens": 710446,
"total_tokens": 3475265,
"total_turns": 550,
"avg_agent_execution_time": 93.0688,
"avg_input_tokens": 32914.5119,
"avg_output_tokens": 8457.6905,
"avg_total_tokens": 41372.2024,
"avg_turns": 6.5476,
"per_run_input_tokens": 691204,
"per_run_output_tokens": 177611,
"per_run_cost": 4.3922,
"actual_model_name": "gemini-2.5-pro"
},
"gpt-5": {
"total_tasks": 21,
"pass@1": {
"avg": 0.7381,
"std": 0.0476
},
"pass@4": 0.9524,
"pass^4": 0.381,
"total_agent_execution_time": 22876.62664246559,
"total_input_tokens": 8603642,
"total_output_tokens": 1368540,
"total_tokens": 9972182,
"total_turns": 1051,
"avg_agent_execution_time": 272.3408,
"avg_input_tokens": 102424.3095,
"avg_output_tokens": 16292.1429,
"avg_total_tokens": 118716.4524,
"avg_turns": 12.5119,
"per_run_input_tokens": 2150910,
"per_run_output_tokens": 342135,
"per_run_cost": 6.11,
"actual_model_name": "gpt-5-2025-08-07"
},
"grok-4": {
"total_tasks": 21,
"pass@1": {
"avg": 0.5833,
"std": 0.119
},
"pass@4": 0.7619,
"pass^4": 0.2381,
"total_agent_execution_time": 14417.84362411499,
"total_input_tokens": 0,
"total_output_tokens": 0,
"total_tokens": 0,
"total_turns": 1536,
"avg_agent_execution_time": 171.641,
"avg_input_tokens": 0,
"avg_output_tokens": 0,
"avg_total_tokens": 0,
"avg_turns": 18.2857,
"per_run_input_tokens": 0,
"per_run_output_tokens": 0,
"per_run_cost": 0,
"actual_model_name": "grok-4-0709"
},
"k2": {
"total_tasks": 21,
"pass@1": {
"avg": 0.3929,
"std": 0.0901
},
"pass@4": 0.6667,
"pass^4": 0.2857,
"total_agent_execution_time": 26209.414449691772,
"total_input_tokens": 20093296,
"total_output_tokens": 311996,
"total_tokens": 20405292,
"total_turns": 2090,
"avg_agent_execution_time": 312.0168,
"avg_input_tokens": 239205.9048,
"avg_output_tokens": 3714.2381,
"avg_total_tokens": 242920.1429,
"avg_turns": 24.881,
"per_run_input_tokens": 5023324,
"per_run_output_tokens": 77999,
"per_run_cost": 0.9095,
"actual_model_name": "kimi-k2-0711-preview"
},
"o3": {
"total_tasks": 21,
"pass@1": {
"avg": 0.369,
"std": 0.0456
},
"pass@4": 0.6667,
"pass^4": 0.1429,
"total_agent_execution_time": 6351.111663579941,
"total_input_tokens": 5338793,
"total_output_tokens": 396424,
"total_tokens": 5735217,
"total_turns": 900,
"avg_agent_execution_time": 75.6085,
"avg_input_tokens": 63557.0595,
"avg_output_tokens": 4719.3333,
"avg_total_tokens": 68276.3929,
"avg_turns": 10.7143,
"per_run_input_tokens": 1334698,
"per_run_output_tokens": 99106,
"per_run_cost": 2.7189,
"actual_model_name": "o3-2025-04-16"
},
"qwen-3-coder": {
"total_tasks": 21,
"pass@1": {
"avg": 0.3929,
"std": 0.1422
},
"pass@4": 0.619,
"pass^4": 0.1429,
"total_agent_execution_time": 13354.64016699791,
"total_input_tokens": 25950518,
"total_output_tokens": 340227,
"total_tokens": 26290745,
"total_turns": 2297,
"avg_agent_execution_time": 158.9838,
"avg_input_tokens": 308934.7381,
"avg_output_tokens": 4050.3214,
"avg_total_tokens": 312985.0595,
"avg_turns": 27.3452,
"per_run_input_tokens": 6487629,
"per_run_output_tokens": 85056,
"per_run_cost": 2.0739,
"actual_model_name": "qwen3-coder-480b-a35b-instruct"
}
}
}