loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

main_effects_transcript.json (8268B)


      1 {
      2   "model": {
      3     "values": {
      4       "gemma-4-26b": {
      5         "mean": 0.9814,
      6         "effect": 0.0449,
      7         "n": 43
      8       },
      9       "glm-4.5-air": {
     10         "mean": 0.9488,
     11         "effect": 0.0123,
     12         "n": 41
     13       },
     14       "glm-4.7": {
     15         "mean": 0.8926,
     16         "effect": -0.0439,
     17         "n": 81
     18       },
     19       "glm-5.1": {
     20         "mean": 0.9898,
     21         "effect": 0.0534,
     22         "n": 123
     23       },
     24       "haiku-4.5": {
     25         "mean": 0.7949,
     26         "effect": -0.1415,
     27         "n": 89
     28       },
     29       "kimi-k2.5": {
     30         "mean": 0.9333,
     31         "effect": -0.0031,
     32         "n": 3
     33       },
     34       "minimax-m2.7": {
     35         "mean": 1.0,
     36         "effect": 0.0635,
     37         "n": 3
     38       },
     39       "opus-4.6": {
     40         "mean": 1.0,
     41         "effect": 0.0635,
     42         "n": 52
     43       },
     44       "qwen-3.6-plus": {
     45         "mean": 0.9864,
     46         "effect": 0.0499,
     47         "n": 22
     48       },
     49       "sonnet-4.6": {
     50         "mean": 0.9849,
     51         "effect": 0.0484,
     52         "n": 53
     53       }
     54     },
     55     "spread": 0.2051
     56   },
     57   "renderer": {
     58     "values": {
     59       "canvas": {
     60         "mean": 0.9071,
     61         "effect": -0.0293,
     62         "n": 7
     63       },
     64       "dom": {
     65         "mean": 1.0,
     66         "effect": 0.0635,
     67         "n": 5
     68       },
     69       "none": {
     70         "mean": 0.9368,
     71         "effect": 0.0003,
     72         "n": 487
     73       },
     74       "svg": {
     75         "mean": 0.9,
     76         "effect": -0.0365,
     77         "n": 7
     78       },
     79       "webgl": {
     80         "mean": 0.9375,
     81         "effect": 0.001,
     82         "n": 4
     83       }
     84     },
     85     "spread": 0.1
     86   },
     87   "context_noise": {
     88     "values": {
     89       "clean": {
     90         "mean": 0.9345,
     91         "effect": -0.002,
     92         "n": 477
     93       },
     94       "lorem_100k": {
     95         "mean": 0.9167,
     96         "effect": -0.0198,
     97         "n": 6
     98       },
     99       "lorem_10k": {
    100         "mean": 0.9333,
    101         "effect": -0.0031,
    102         "n": 6
    103       },
    104       "lorem_1k": {
    105         "mean": 1.0,
    106         "effect": 0.0635,
    107         "n": 3
    108       },
    109       "lorem_50k": {
    110         "mean": 0.9583,
    111         "effect": 0.0219,
    112         "n": 6
    113       },
    114       "wikipedia_100k": {
    115         "mean": 1.0,
    116         "effect": 0.0635,
    117         "n": 3
    118       },
    119       "wikipedia_10k": {
    120         "mean": 1.0,
    121         "effect": 0.0635,
    122         "n": 3
    123       },
    124       "wikipedia_1k": {
    125         "mean": 1.0,
    126         "effect": 0.0635,
    127         "n": 3
    128       },
    129       "wikipedia_50k": {
    130         "mean": 1.0,
    131         "effect": 0.0635,
    132         "n": 3
    133       }
    134     },
    135     "spread": 0.0833
    136   },
    137   "strategy": {
    138     "values": {
    139       "creative_validate": {
    140         "mean": 0.9688,
    141         "effect": 0.0323,
    142         "n": 8
    143       },
    144       "delegate": {
    145         "mean": 0.9286,
    146         "effect": -0.0079,
    147         "n": 7
    148       },
    149       "iterate": {
    150         "mean": 0.9455,
    151         "effect": 0.009,
    152         "n": 11
    153       },
    154       "none": {
    155         "mean": 0.9448,
    156         "effect": 0.0084,
    157         "n": 300
    158       },
    159       "plan_first": {
    160         "mean": 0.94,
    161         "effect": 0.0035,
    162         "n": 10
    163       },
    164       "review": {
    165         "mean": 0.92,
    166         "effect": -0.0165,
    167         "n": 5
    168       },
    169       "split_work": {
    170         "mean": 1.0,
    171         "effect": 0.0635,
    172         "n": 5
    173       },
    174       "use_subagents": {
    175         "mean": 0.9177,
    176         "effect": -0.0188,
    177         "n": 164
    178       }
    179     },
    180     "spread": 0.0823
    181   },
    182   "provider": {
    183     "values": {
    184       "anthropic": {
    185         "mean": 0.9018,
    186         "effect": -0.0347,
    187         "n": 194
    188       },
    189       "openrouter": {
    190         "mean": 0.9817,
    191         "effect": 0.0452,
    192         "n": 71
    193       },
    194       "zai": {
    195         "mean": 0.9508,
    196         "effect": 0.0143,
    197         "n": 245
    198       }
    199     },
    200     "spread": 0.0799
    201   },
    202   "tool_write": {
    203     "values": {
    204       "off": {
    205         "mean": 0.8621,
    206         "effect": -0.0743,
    207         "n": 33
    208       },
    209       "on": {
    210         "mean": 0.9416,
    211         "effect": 0.0051,
    212         "n": 477
    213       }
    214     },
    215     "spread": 0.0795
    216   },
    217   "tool_glob": {
    218     "values": {
    219       "off": {
    220         "mean": 0.8733,
    221         "effect": -0.0631,
    222         "n": 30
    223       },
    224       "on": {
    225         "mean": 0.9404,
    226         "effect": 0.0039,
    227         "n": 480
    228       }
    229     },
    230     "spread": 0.0671
    231   },
    232   "context_file": {
    233     "values": {
    234       "none": {
    235         "mean": 0.9404,
    236         "effect": 0.0039,
    237         "n": 479
    238       },
    239       "provided": {
    240         "mean": 0.8758,
    241         "effect": -0.0607,
    242         "n": 31
    243       }
    244     },
    245     "spread": 0.0646
    246   },
    247   "tool_read": {
    248     "values": {
    249       "off": {
    250         "mean": 0.8774,
    251         "effect": -0.0591,
    252         "n": 31
    253       },
    254       "on": {
    255         "mean": 0.9403,
    256         "effect": 0.0038,
    257         "n": 479
    258       }
    259     },
    260     "spread": 0.0629
    261   },
    262   "human_language": {
    263     "values": {
    264       "en": {
    265         "mean": 0.94,
    266         "effect": 0.0036,
    267         "n": 481
    268       },
    269       "es": {
    270         "mean": 0.8776,
    271         "effect": -0.0589,
    272         "n": 29
    273       }
    274     },
    275     "spread": 0.0624
    276   },
    277   "tool_edit": {
    278     "values": {
    279       "off": {
    280         "mean": 0.88,
    281         "effect": -0.0565,
    282         "n": 35
    283       },
    284       "on": {
    285         "mean": 0.9406,
    286         "effect": 0.0042,
    287         "n": 475
    288       }
    289     },
    290     "spread": 0.0606
    291   },
    292   "linter": {
    293     "values": {
    294       "off": {
    295         "mean": 0.8821,
    296         "effect": -0.0544,
    297         "n": 39
    298       },
    299       "on": {
    300         "mean": 0.941,
    301         "effect": 0.0045,
    302         "n": 471
    303       }
    304     },
    305     "spread": 0.0589
    306   },
    307   "language": {
    308     "values": {
    309       "javascript": {
    310         "mean": 0.9905,
    311         "effect": 0.054,
    312         "n": 21
    313       },
    314       "typescript": {
    315         "mean": 0.932,
    316         "effect": -0.0045,
    317         "n": 469
    318       },
    319       "unspecified": {
    320         "mean": 0.985,
    321         "effect": 0.0485,
    322         "n": 20
    323       }
    324     },
    325     "spread": 0.0585
    326   },
    327   "tool_grep": {
    328     "values": {
    329       "off": {
    330         "mean": 0.8823,
    331         "effect": -0.0542,
    332         "n": 31
    333       },
    334       "on": {
    335         "mean": 0.94,
    336         "effect": 0.0035,
    337         "n": 479
    338       }
    339     },
    340     "spread": 0.0577
    341   },
    342   "playwright": {
    343     "values": {
    344       "available": {
    345         "mean": 0.9155,
    346         "effect": -0.021,
    347         "n": 165
    348       },
    349       "instructed": {
    350         "mean": 0.9727,
    351         "effect": 0.0363,
    352         "n": 11
    353       },
    354       "off": {
    355         "mean": 0.9457,
    356         "effect": 0.0092,
    357         "n": 334
    358       }
    359     },
    360     "spread": 0.0572
    361   },
    362   "web_search": {
    363     "values": {
    364       "off": {
    365         "mean": 0.8833,
    366         "effect": -0.0531,
    367         "n": 36
    368       },
    369       "on": {
    370         "mean": 0.9405,
    371         "effect": 0.004,
    372         "n": 474
    373       }
    374     },
    375     "spread": 0.0572
    376   },
    377   "architecture": {
    378     "values": {
    379       "best_practices": {
    380         "mean": 0.975,
    381         "effect": 0.0385,
    382         "n": 4
    383       },
    384       "none": {
    385         "mean": 0.9356,
    386         "effect": -0.0008,
    387         "n": 501
    388       },
    389       "separation": {
    390         "mean": 0.99,
    391         "effect": 0.0535,
    392         "n": 5
    393       }
    394     },
    395     "spread": 0.0544
    396   },
    397   "prompt_style": {
    398     "values": {
    399       "detailed": {
    400         "mean": 0.89,
    401         "effect": -0.0465,
    402         "n": 30
    403       },
    404       "simple": {
    405         "mean": 0.9394,
    406         "effect": 0.0029,
    407         "n": 480
    408       }
    409     },
    410     "spread": 0.0494
    411   },
    412   "max_budget": {
    413     "values": {
    414       "high": {
    415         "mean": 0.8917,
    416         "effect": -0.0448,
    417         "n": 24
    418       },
    419       "low": {
    420         "mean": 0.9387,
    421         "effect": 0.0022,
    422         "n": 486
    423       }
    424     },
    425     "spread": 0.047
    426   },
    427   "design_guidance": {
    428     "values": {
    429       "none": {
    430         "mean": 0.936,
    431         "effect": -0.0005,
    432         "n": 500
    433       },
    434       "specific": {
    435         "mean": 0.95,
    436         "effect": 0.0135,
    437         "n": 5
    438       },
    439       "vague": {
    440         "mean": 0.97,
    441         "effect": 0.0335,
    442         "n": 5
    443       }
    444     },
    445     "spread": 0.034
    446   },
    447   "error_checking": {
    448     "values": {
    449       "none": {
    450         "mean": 0.9363,
    451         "effect": -0.0002,
    452         "n": 506
    453       },
    454       "self_verify": {
    455         "mean": 0.9625,
    456         "effect": 0.026,
    457         "n": 4
    458       }
    459     },
    460     "spread": 0.0262
    461   },
    462   "effort": {
    463     "values": {
    464       "high": {
    465         "mean": 0.9363,
    466         "effect": -0.0002,
    467         "n": 491
    468       },
    469       "max": {
    470         "mean": 0.9421,
    471         "effect": 0.0056,
    472         "n": 19
    473       }
    474     },
    475     "spread": 0.0058
    476   }
    477 }

Impressum · Datenschutz