loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

main_effects_structural.json (8300B)


      1 {
      2   "model": {
      3     "values": {
      4       "gemma-4-26b": {
      5         "mean": 0.7442,
      6         "effect": -0.144,
      7         "n": 43
      8       },
      9       "glm-4.5-air": {
     10         "mean": 0.7927,
     11         "effect": -0.0956,
     12         "n": 41
     13       },
     14       "glm-4.7": {
     15         "mean": 0.8827,
     16         "effect": -0.0055,
     17         "n": 81
     18       },
     19       "glm-5.1": {
     20         "mean": 0.8923,
     21         "effect": 0.004,
     22         "n": 123
     23       },
     24       "haiku-4.5": {
     25         "mean": 0.8736,
     26         "effect": -0.0146,
     27         "n": 89
     28       },
     29       "kimi-k2.5": {
     30         "mean": 0.75,
     31         "effect": -0.1382,
     32         "n": 3
     33       },
     34       "minimax-m2.7": {
     35         "mean": 0.9167,
     36         "effect": 0.0284,
     37         "n": 3
     38       },
     39       "opus-4.6": {
     40         "mean": 0.9904,
     41         "effect": 0.1021,
     42         "n": 52
     43       },
     44       "qwen-3.6-plus": {
     45         "mean": 0.9659,
     46         "effect": 0.0777,
     47         "n": 22
     48       },
     49       "sonnet-4.6": {
     50         "mean": 0.9764,
     51         "effect": 0.0882,
     52         "n": 53
     53       }
     54     },
     55     "spread": 0.2462
     56   },
     57   "strategy": {
     58     "values": {
     59       "creative_validate": {
     60         "mean": 0.8125,
     61         "effect": -0.0757,
     62         "n": 8
     63       },
     64       "delegate": {
     65         "mean": 0.8929,
     66         "effect": 0.0046,
     67         "n": 7
     68       },
     69       "iterate": {
     70         "mean": 0.8636,
     71         "effect": -0.0246,
     72         "n": 11
     73       },
     74       "none": {
     75         "mean": 0.8683,
     76         "effect": -0.0199,
     77         "n": 300
     78       },
     79       "plan_first": {
     80         "mean": 0.875,
     81         "effect": -0.0132,
     82         "n": 10
     83       },
     84       "review": {
     85         "mean": 0.85,
     86         "effect": -0.0382,
     87         "n": 5
     88       },
     89       "split_work": {
     90         "mean": 0.7,
     91         "effect": -0.1882,
     92         "n": 5
     93       },
     94       "use_subagents": {
     95         "mean": 0.9375,
     96         "effect": 0.0493,
     97         "n": 164
     98       }
     99     },
    100     "spread": 0.2375
    101   },
    102   "renderer": {
    103     "values": {
    104       "canvas": {
    105         "mean": 0.9286,
    106         "effect": 0.0403,
    107         "n": 7
    108       },
    109       "dom": {
    110         "mean": 0.9,
    111         "effect": 0.0118,
    112         "n": 5
    113       },
    114       "none": {
    115         "mean": 0.8886,
    116         "effect": 0.0004,
    117         "n": 487
    118       },
    119       "svg": {
    120         "mean": 0.8929,
    121         "effect": 0.0046,
    122         "n": 7
    123       },
    124       "webgl": {
    125         "mean": 0.75,
    126         "effect": -0.1382,
    127         "n": 4
    128       }
    129     },
    130     "spread": 0.1786
    131   },
    132   "context_noise": {
    133     "values": {
    134       "clean": {
    135         "mean": 0.8873,
    136         "effect": -0.0009,
    137         "n": 477
    138       },
    139       "lorem_100k": {
    140         "mean": 0.875,
    141         "effect": -0.0132,
    142         "n": 6
    143       },
    144       "lorem_10k": {
    145         "mean": 0.9167,
    146         "effect": 0.0284,
    147         "n": 6
    148       },
    149       "lorem_1k": {
    150         "mean": 0.9167,
    151         "effect": 0.0284,
    152         "n": 3
    153       },
    154       "lorem_50k": {
    155         "mean": 0.875,
    156         "effect": -0.0132,
    157         "n": 6
    158       },
    159       "wikipedia_100k": {
    160         "mean": 1.0,
    161         "effect": 0.1118,
    162         "n": 3
    163       },
    164       "wikipedia_10k": {
    165         "mean": 0.8333,
    166         "effect": -0.0549,
    167         "n": 3
    168       },
    169       "wikipedia_1k": {
    170         "mean": 0.9167,
    171         "effect": 0.0284,
    172         "n": 3
    173       },
    174       "wikipedia_50k": {
    175         "mean": 0.9167,
    176         "effect": 0.0284,
    177         "n": 3
    178       }
    179     },
    180     "spread": 0.1667
    181   },
    182   "error_checking": {
    183     "values": {
    184       "none": {
    185         "mean": 0.8893,
    186         "effect": 0.0011,
    187         "n": 506
    188       },
    189       "self_verify": {
    190         "mean": 0.75,
    191         "effect": -0.1382,
    192         "n": 4
    193       }
    194     },
    195     "spread": 0.1393
    196   },
    197   "language": {
    198     "values": {
    199       "javascript": {
    200         "mean": 1.0,
    201         "effect": 0.1118,
    202         "n": 21
    203       },
    204       "typescript": {
    205         "mean": 0.8785,
    206         "effect": -0.0098,
    207         "n": 469
    208       },
    209       "unspecified": {
    210         "mean": 1.0,
    211         "effect": 0.1118,
    212         "n": 20
    213       }
    214     },
    215     "spread": 0.1215
    216   },
    217   "provider": {
    218     "values": {
    219       "anthropic": {
    220         "mean": 0.933,
    221         "effect": 0.0448,
    222         "n": 194
    223       },
    224       "openrouter": {
    225         "mean": 0.8204,
    226         "effect": -0.0678,
    227         "n": 71
    228       },
    229       "zai": {
    230         "mean": 0.8724,
    231         "effect": -0.0158,
    232         "n": 245
    233       }
    234     },
    235     "spread": 0.1126
    236   },
    237   "playwright": {
    238     "values": {
    239       "available": {
    240         "mean": 0.9303,
    241         "effect": 0.0421,
    242         "n": 165
    243       },
    244       "instructed": {
    245         "mean": 0.8182,
    246         "effect": -0.0701,
    247         "n": 11
    248       },
    249       "off": {
    250         "mean": 0.8698,
    251         "effect": -0.0185,
    252         "n": 334
    253       }
    254     },
    255     "spread": 0.1121
    256   },
    257   "design_guidance": {
    258     "values": {
    259       "none": {
    260         "mean": 0.8895,
    261         "effect": 0.0013,
    262         "n": 500
    263       },
    264       "specific": {
    265         "mean": 0.85,
    266         "effect": -0.0382,
    267         "n": 5
    268       },
    269       "vague": {
    270         "mean": 0.8,
    271         "effect": -0.0882,
    272         "n": 5
    273       }
    274     },
    275     "spread": 0.0895
    276   },
    277   "architecture": {
    278     "values": {
    279       "best_practices": {
    280         "mean": 0.9375,
    281         "effect": 0.0493,
    282         "n": 4
    283       },
    284       "none": {
    285         "mean": 0.8882,
    286         "effect": -0.0,
    287         "n": 501
    288       },
    289       "separation": {
    290         "mean": 0.85,
    291         "effect": -0.0382,
    292         "n": 5
    293       }
    294     },
    295     "spread": 0.0875
    296   },
    297   "tool_write": {
    298     "values": {
    299       "off": {
    300         "mean": 0.8258,
    301         "effect": -0.0625,
    302         "n": 33
    303       },
    304       "on": {
    305         "mean": 0.8926,
    306         "effect": 0.0043,
    307         "n": 477
    308       }
    309     },
    310     "spread": 0.0668
    311   },
    312   "prompt_style": {
    313     "values": {
    314       "detailed": {
    315         "mean": 0.8333,
    316         "effect": -0.0549,
    317         "n": 30
    318       },
    319       "simple": {
    320         "mean": 0.8917,
    321         "effect": 0.0034,
    322         "n": 480
    323       }
    324     },
    325     "spread": 0.0584
    326   },
    327   "effort": {
    328     "values": {
    329       "high": {
    330         "mean": 0.887,
    331         "effect": -0.0013,
    332         "n": 491
    333       },
    334       "max": {
    335         "mean": 0.9211,
    336         "effect": 0.0328,
    337         "n": 19
    338       }
    339     },
    340     "spread": 0.0341
    341   },
    342   "max_budget": {
    343     "values": {
    344       "high": {
    345         "mean": 0.9167,
    346         "effect": 0.0284,
    347         "n": 24
    348       },
    349       "low": {
    350         "mean": 0.8868,
    351         "effect": -0.0014,
    352         "n": 486
    353       }
    354     },
    355     "spread": 0.0299
    356   },
    357   "human_language": {
    358     "values": {
    359       "en": {
    360         "mean": 0.8898,
    361         "effect": 0.0016,
    362         "n": 481
    363       },
    364       "es": {
    365         "mean": 0.8621,
    366         "effect": -0.0262,
    367         "n": 29
    368       }
    369     },
    370     "spread": 0.0277
    371   },
    372   "tool_grep": {
    373     "values": {
    374       "off": {
    375         "mean": 0.8629,
    376         "effect": -0.0253,
    377         "n": 31
    378       },
    379       "on": {
    380         "mean": 0.8899,
    381         "effect": 0.0016,
    382         "n": 479
    383       }
    384     },
    385     "spread": 0.027
    386   },
    387   "tool_glob": {
    388     "values": {
    389       "off": {
    390         "mean": 0.8667,
    391         "effect": -0.0216,
    392         "n": 30
    393       },
    394       "on": {
    395         "mean": 0.8896,
    396         "effect": 0.0013,
    397         "n": 480
    398       }
    399     },
    400     "spread": 0.0229
    401   },
    402   "linter": {
    403     "values": {
    404       "off": {
    405         "mean": 0.8718,
    406         "effect": -0.0164,
    407         "n": 39
    408       },
    409       "on": {
    410         "mean": 0.8896,
    411         "effect": 0.0014,
    412         "n": 471
    413       }
    414     },
    415     "spread": 0.0178
    416   },
    417   "tool_read": {
    418     "values": {
    419       "off": {
    420         "mean": 0.879,
    421         "effect": -0.0092,
    422         "n": 31
    423       },
    424       "on": {
    425         "mean": 0.8888,
    426         "effect": 0.0006,
    427         "n": 479
    428       }
    429     },
    430     "spread": 0.0098
    431   },
    432   "context_file": {
    433     "values": {
    434       "none": {
    435         "mean": 0.8878,
    436         "effect": -0.0004,
    437         "n": 479
    438       },
    439       "provided": {
    440         "mean": 0.8952,
    441         "effect": 0.0069,
    442         "n": 31
    443       }
    444     },
    445     "spread": 0.0074
    446   },
    447   "web_search": {
    448     "values": {
    449       "off": {
    450         "mean": 0.8819,
    451         "effect": -0.0063,
    452         "n": 36
    453       },
    454       "on": {
    455         "mean": 0.8887,
    456         "effect": 0.0005,
    457         "n": 474
    458       }
    459     },
    460     "spread": 0.0068
    461   },
    462   "tool_edit": {
    463     "values": {
    464       "off": {
    465         "mean": 0.8857,
    466         "effect": -0.0025,
    467         "n": 35
    468       },
    469       "on": {
    470         "mean": 0.8884,
    471         "effect": 0.0002,
    472         "n": 475
    473       }
    474     },
    475     "spread": 0.0027
    476   }
    477 }

Impressum · Datenschutz