loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

main_effects_cost.json (8320B)


      1 {
      2   "model": {
      3     "values": {
      4       "gemma-4-26b": {
      5         "mean": 1.7918,
      6         "effect": 1.1071,
      7         "n": 43
      8       },
      9       "glm-4.5-air": {
     10         "mean": 0.6678,
     11         "effect": -0.0169,
     12         "n": 41
     13       },
     14       "glm-4.7": {
     15         "mean": 0.407,
     16         "effect": -0.2776,
     17         "n": 81
     18       },
     19       "glm-5.1": {
     20         "mean": 0.3566,
     21         "effect": -0.3281,
     22         "n": 123
     23       },
     24       "haiku-4.5": {
     25         "mean": 0.2769,
     26         "effect": -0.4078,
     27         "n": 89
     28       },
     29       "kimi-k2.5": {
     30         "mean": 1.1331,
     31         "effect": 0.4484,
     32         "n": 3
     33       },
     34       "minimax-m2.7": {
     35         "mean": 1.8167,
     36         "effect": 1.132,
     37         "n": 3
     38       },
     39       "opus-4.6": {
     40         "mean": 0.6445,
     41         "effect": -0.0402,
     42         "n": 52
     43       },
     44       "qwen-3.6-plus": {
     45         "mean": 1.3787,
     46         "effect": 0.694,
     47         "n": 22
     48       },
     49       "sonnet-4.6": {
     50         "mean": 1.3319,
     51         "effect": 0.6472,
     52         "n": 53
     53       }
     54     },
     55     "spread": 1.5398
     56   },
     57   "provider": {
     58     "values": {
     59       "anthropic": {
     60         "mean": 0.6636,
     61         "effect": -0.021,
     62         "n": 194
     63       },
     64       "openrouter": {
     65         "mean": 1.637,
     66         "effect": 0.9523,
     67         "n": 71
     68       },
     69       "zai": {
     70         "mean": 0.4254,
     71         "effect": -0.2593,
     72         "n": 245
     73       }
     74     },
     75     "spread": 1.2116
     76   },
     77   "strategy": {
     78     "values": {
     79       "creative_validate": {
     80         "mean": 1.2884,
     81         "effect": 0.6037,
     82         "n": 8
     83       },
     84       "delegate": {
     85         "mean": 0.8727,
     86         "effect": 0.188,
     87         "n": 7
     88       },
     89       "iterate": {
     90         "mean": 1.1211,
     91         "effect": 0.4364,
     92         "n": 11
     93       },
     94       "none": {
     95         "mean": 0.6073,
     96         "effect": -0.0774,
     97         "n": 300
     98       },
     99       "plan_first": {
    100         "mean": 0.9415,
    101         "effect": 0.2568,
    102         "n": 10
    103       },
    104       "review": {
    105         "mean": 1.0455,
    106         "effect": 0.3608,
    107         "n": 5
    108       },
    109       "split_work": {
    110         "mean": 1.032,
    111         "effect": 0.3473,
    112         "n": 5
    113       },
    114       "use_subagents": {
    115         "mean": 0.7223,
    116         "effect": 0.0376,
    117         "n": 164
    118       }
    119     },
    120     "spread": 0.6811
    121   },
    122   "playwright": {
    123     "values": {
    124       "available": {
    125         "mean": 0.7125,
    126         "effect": 0.0278,
    127         "n": 165
    128       },
    129       "instructed": {
    130         "mean": 1.1926,
    131         "effect": 0.5079,
    132         "n": 11
    133       },
    134       "off": {
    135         "mean": 0.6542,
    136         "effect": -0.0305,
    137         "n": 334
    138       }
    139     },
    140     "spread": 0.5384
    141   },
    142   "context_noise": {
    143     "values": {
    144       "clean": {
    145         "mean": 0.7076,
    146         "effect": 0.023,
    147         "n": 477
    148       },
    149       "lorem_100k": {
    150         "mean": 0.5024,
    151         "effect": -0.1823,
    152         "n": 6
    153       },
    154       "lorem_10k": {
    155         "mean": 0.265,
    156         "effect": -0.4197,
    157         "n": 6
    158       },
    159       "lorem_1k": {
    160         "mean": 0.3576,
    161         "effect": -0.3271,
    162         "n": 3
    163       },
    164       "lorem_50k": {
    165         "mean": 0.3994,
    166         "effect": -0.2852,
    167         "n": 6
    168       },
    169       "wikipedia_100k": {
    170         "mean": 0.3776,
    171         "effect": -0.3071,
    172         "n": 3
    173       },
    174       "wikipedia_10k": {
    175         "mean": 0.2867,
    176         "effect": -0.398,
    177         "n": 3
    178       },
    179       "wikipedia_1k": {
    180         "mean": 0.1992,
    181         "effect": -0.4855,
    182         "n": 3
    183       },
    184       "wikipedia_50k": {
    185         "mean": 0.3265,
    186         "effect": -0.3581,
    187         "n": 3
    188       }
    189     },
    190     "spread": 0.5084
    191   },
    192   "renderer": {
    193     "values": {
    194       "canvas": {
    195         "mean": 0.4155,
    196         "effect": -0.2692,
    197         "n": 7
    198       },
    199       "dom": {
    200         "mean": 0.3856,
    201         "effect": -0.2991,
    202         "n": 5
    203       },
    204       "none": {
    205         "mean": 0.6981,
    206         "effect": 0.0134,
    207         "n": 487
    208       },
    209       "svg": {
    210         "mean": 0.3176,
    211         "effect": -0.367,
    212         "n": 7
    213       },
    214       "webgl": {
    215         "mean": 0.5393,
    216         "effect": -0.1454,
    217         "n": 4
    218       }
    219     },
    220     "spread": 0.3805
    221   },
    222   "max_budget": {
    223     "values": {
    224       "high": {
    225         "mean": 0.4427,
    226         "effect": -0.242,
    227         "n": 24
    228       },
    229       "low": {
    230         "mean": 0.6966,
    231         "effect": 0.012,
    232         "n": 486
    233       }
    234     },
    235     "spread": 0.2539
    236   },
    237   "architecture": {
    238     "values": {
    239       "best_practices": {
    240         "mean": 0.7915,
    241         "effect": 0.1068,
    242         "n": 4
    243       },
    244       "none": {
    245         "mean": 0.6853,
    246         "effect": 0.0006,
    247         "n": 501
    248       },
    249       "separation": {
    250         "mean": 0.5392,
    251         "effect": -0.1455,
    252         "n": 5
    253       }
    254     },
    255     "spread": 0.2523
    256   },
    257   "human_language": {
    258     "values": {
    259       "en": {
    260         "mean": 0.6986,
    261         "effect": 0.0139,
    262         "n": 481
    263       },
    264       "es": {
    265         "mean": 0.4541,
    266         "effect": -0.2306,
    267         "n": 29
    268       }
    269     },
    270     "spread": 0.2445
    271   },
    272   "context_file": {
    273     "values": {
    274       "none": {
    275         "mean": 0.699,
    276         "effect": 0.0144,
    277         "n": 479
    278       },
    279       "provided": {
    280         "mean": 0.4627,
    281         "effect": -0.222,
    282         "n": 31
    283       }
    284     },
    285     "spread": 0.2363
    286   },
    287   "linter": {
    288     "values": {
    289       "off": {
    290         "mean": 0.4721,
    291         "effect": -0.2126,
    292         "n": 39
    293       },
    294       "on": {
    295         "mean": 0.7023,
    296         "effect": 0.0176,
    297         "n": 471
    298       }
    299     },
    300     "spread": 0.2302
    301   },
    302   "design_guidance": {
    303     "values": {
    304       "none": {
    305         "mean": 0.6829,
    306         "effect": -0.0017,
    307         "n": 500
    308       },
    309       "specific": {
    310         "mean": 0.689,
    311         "effect": 0.0043,
    312         "n": 5
    313       },
    314       "vague": {
    315         "mean": 0.8542,
    316         "effect": 0.1695,
    317         "n": 5
    318       }
    319     },
    320     "spread": 0.1713
    321   },
    322   "prompt_style": {
    323     "values": {
    324       "detailed": {
    325         "mean": 0.5406,
    326         "effect": -0.144,
    327         "n": 30
    328       },
    329       "simple": {
    330         "mean": 0.6937,
    331         "effect": 0.009,
    332         "n": 480
    333       }
    334     },
    335     "spread": 0.1531
    336   },
    337   "web_search": {
    338     "values": {
    339       "off": {
    340         "mean": 0.5497,
    341         "effect": -0.135,
    342         "n": 36
    343       },
    344       "on": {
    345         "mean": 0.6949,
    346         "effect": 0.0103,
    347         "n": 474
    348       }
    349     },
    350     "spread": 0.1452
    351   },
    352   "tool_edit": {
    353     "values": {
    354       "off": {
    355         "mean": 0.5573,
    356         "effect": -0.1274,
    357         "n": 35
    358       },
    359       "on": {
    360         "mean": 0.6941,
    361         "effect": 0.0094,
    362         "n": 475
    363       }
    364     },
    365     "spread": 0.1368
    366   },
    367   "language": {
    368     "values": {
    369       "javascript": {
    370         "mean": 0.617,
    371         "effect": -0.0677,
    372         "n": 21
    373       },
    374       "typescript": {
    375         "mean": 0.693,
    376         "effect": 0.0083,
    377         "n": 469
    378       },
    379       "unspecified": {
    380         "mean": 0.5603,
    381         "effect": -0.1244,
    382         "n": 20
    383       }
    384     },
    385     "spread": 0.1327
    386   },
    387   "tool_grep": {
    388     "values": {
    389       "off": {
    390         "mean": 0.5673,
    391         "effect": -0.1173,
    392         "n": 31
    393       },
    394       "on": {
    395         "mean": 0.6923,
    396         "effect": 0.0076,
    397         "n": 479
    398       }
    399     },
    400     "spread": 0.125
    401   },
    402   "tool_write": {
    403     "values": {
    404       "off": {
    405         "mean": 0.5712,
    406         "effect": -0.1134,
    407         "n": 33
    408       },
    409       "on": {
    410         "mean": 0.6925,
    411         "effect": 0.0078,
    412         "n": 477
    413       }
    414     },
    415     "spread": 0.1213
    416   },
    417   "tool_glob": {
    418     "values": {
    419       "off": {
    420         "mean": 0.5848,
    421         "effect": -0.0998,
    422         "n": 30
    423       },
    424       "on": {
    425         "mean": 0.6909,
    426         "effect": 0.0062,
    427         "n": 480
    428       }
    429     },
    430     "spread": 0.1061
    431   },
    432   "tool_read": {
    433     "values": {
    434       "off": {
    435         "mean": 0.5882,
    436         "effect": -0.0964,
    437         "n": 31
    438       },
    439       "on": {
    440         "mean": 0.6909,
    441         "effect": 0.0062,
    442         "n": 479
    443       }
    444     },
    445     "spread": 0.1027
    446   },
    447   "error_checking": {
    448     "values": {
    449       "none": {
    450         "mean": 0.6841,
    451         "effect": -0.0006,
    452         "n": 506
    453       },
    454       "self_verify": {
    455         "mean": 0.7612,
    456         "effect": 0.0765,
    457         "n": 4
    458       }
    459     },
    460     "spread": 0.0771
    461   },
    462   "effort": {
    463     "values": {
    464       "high": {
    465         "mean": 0.6818,
    466         "effect": -0.0029,
    467         "n": 491
    468       },
    469       "max": {
    470         "mean": 0.7587,
    471         "effect": 0.0741,
    472         "n": 19
    473       }
    474     },
    475     "spread": 0.0769
    476   }
    477 }

Impressum · Datenschutz