loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

main_effects_code_quality.json (8282B)


      1 {
      2   "model": {
      3     "values": {
      4       "gemma-4-26b": {
      5         "mean": 0.4488,
      6         "effect": 0.0525,
      7         "n": 43
      8       },
      9       "glm-4.5-air": {
     10         "mean": 0.318,
     11         "effect": -0.0783,
     12         "n": 41
     13       },
     14       "glm-4.7": {
     15         "mean": 0.3222,
     16         "effect": -0.0741,
     17         "n": 81
     18       },
     19       "glm-5.1": {
     20         "mean": 0.393,
     21         "effect": -0.0033,
     22         "n": 123
     23       },
     24       "haiku-4.5": {
     25         "mean": 0.3464,
     26         "effect": -0.0499,
     27         "n": 89
     28       },
     29       "kimi-k2.5": {
     30         "mean": 0.3667,
     31         "effect": -0.0297,
     32         "n": 3
     33       },
     34       "minimax-m2.7": {
     35         "mean": 0.4167,
     36         "effect": 0.0203,
     37         "n": 3
     38       },
     39       "opus-4.6": {
     40         "mean": 0.5083,
     41         "effect": 0.1119,
     42         "n": 52
     43       },
     44       "qwen-3.6-plus": {
     45         "mean": 0.3973,
     46         "effect": 0.0009,
     47         "n": 22
     48       },
     49       "sonnet-4.6": {
     50         "mean": 0.5094,
     51         "effect": 0.1131,
     52         "n": 53
     53       }
     54     },
     55     "spread": 0.1914
     56   },
     57   "strategy": {
     58     "values": {
     59       "creative_validate": {
     60         "mean": 0.27,
     61         "effect": -0.1263,
     62         "n": 8
     63       },
     64       "delegate": {
     65         "mean": 0.3029,
     66         "effect": -0.0935,
     67         "n": 7
     68       },
     69       "iterate": {
     70         "mean": 0.3091,
     71         "effect": -0.0872,
     72         "n": 11
     73       },
     74       "none": {
     75         "mean": 0.3774,
     76         "effect": -0.0189,
     77         "n": 300
     78       },
     79       "plan_first": {
     80         "mean": 0.368,
     81         "effect": -0.0283,
     82         "n": 10
     83       },
     84       "review": {
     85         "mean": 0.38,
     86         "effect": -0.0163,
     87         "n": 5
     88       },
     89       "split_work": {
     90         "mean": 0.414,
     91         "effect": 0.0177,
     92         "n": 5
     93       },
     94       "use_subagents": {
     95         "mean": 0.4486,
     96         "effect": 0.0523,
     97         "n": 164
     98       }
     99     },
    100     "spread": 0.1786
    101   },
    102   "language": {
    103     "values": {
    104       "javascript": {
    105         "mean": 0.4838,
    106         "effect": 0.0875,
    107         "n": 21
    108       },
    109       "typescript": {
    110         "mean": 0.388,
    111         "effect": -0.0083,
    112         "n": 469
    113       },
    114       "unspecified": {
    115         "mean": 0.5,
    116         "effect": 0.1037,
    117         "n": 20
    118       }
    119     },
    120     "spread": 0.112
    121   },
    122   "renderer": {
    123     "values": {
    124       "canvas": {
    125         "mean": 0.3,
    126         "effect": -0.0963,
    127         "n": 7
    128       },
    129       "dom": {
    130         "mean": 0.4,
    131         "effect": 0.0037,
    132         "n": 5
    133       },
    134       "none": {
    135         "mean": 0.399,
    136         "effect": 0.0027,
    137         "n": 487
    138       },
    139       "svg": {
    140         "mean": 0.3029,
    141         "effect": -0.0935,
    142         "n": 7
    143       },
    144       "webgl": {
    145         "mean": 0.4,
    146         "effect": 0.0037,
    147         "n": 4
    148       }
    149     },
    150     "spread": 0.1
    151   },
    152   "playwright": {
    153     "values": {
    154       "available": {
    155         "mean": 0.4485,
    156         "effect": 0.0522,
    157         "n": 165
    158       },
    159       "instructed": {
    160         "mean": 0.3582,
    161         "effect": -0.0382,
    162         "n": 11
    163       },
    164       "off": {
    165         "mean": 0.3718,
    166         "effect": -0.0245,
    167         "n": 334
    168       }
    169     },
    170     "spread": 0.0903
    171   },
    172   "context_noise": {
    173     "values": {
    174       "clean": {
    175         "mean": 0.3978,
    176         "effect": 0.0015,
    177         "n": 477
    178       },
    179       "lorem_100k": {
    180         "mean": 0.3283,
    181         "effect": -0.068,
    182         "n": 6
    183       },
    184       "lorem_10k": {
    185         "mean": 0.3833,
    186         "effect": -0.013,
    187         "n": 6
    188       },
    189       "lorem_1k": {
    190         "mean": 0.35,
    191         "effect": -0.0463,
    192         "n": 3
    193       },
    194       "lorem_50k": {
    195         "mean": 0.3583,
    196         "effect": -0.038,
    197         "n": 6
    198       },
    199       "wikipedia_100k": {
    200         "mean": 0.4167,
    201         "effect": 0.0203,
    202         "n": 3
    203       },
    204       "wikipedia_10k": {
    205         "mean": 0.4,
    206         "effect": 0.0037,
    207         "n": 3
    208       },
    209       "wikipedia_1k": {
    210         "mean": 0.4167,
    211         "effect": 0.0203,
    212         "n": 3
    213       },
    214       "wikipedia_50k": {
    215         "mean": 0.4,
    216         "effect": 0.0037,
    217         "n": 3
    218       }
    219     },
    220     "spread": 0.0884
    221   },
    222   "error_checking": {
    223     "values": {
    224       "none": {
    225         "mean": 0.397,
    226         "effect": 0.0007,
    227         "n": 506
    228       },
    229       "self_verify": {
    230         "mean": 0.3125,
    231         "effect": -0.0838,
    232         "n": 4
    233       }
    234     },
    235     "spread": 0.0845
    236   },
    237   "provider": {
    238     "values": {
    239       "anthropic": {
    240         "mean": 0.4343,
    241         "effect": 0.038,
    242         "n": 194
    243       },
    244       "openrouter": {
    245         "mean": 0.428,
    246         "effect": 0.0317,
    247         "n": 71
    248       },
    249       "zai": {
    250         "mean": 0.3571,
    251         "effect": -0.0393,
    252         "n": 245
    253       }
    254     },
    255     "spread": 0.0772
    256   },
    257   "prompt_style": {
    258     "values": {
    259       "detailed": {
    260         "mean": 0.445,
    261         "effect": 0.0487,
    262         "n": 30
    263       },
    264       "simple": {
    265         "mean": 0.3933,
    266         "effect": -0.003,
    267         "n": 480
    268       }
    269     },
    270     "spread": 0.0517
    271   },
    272   "tool_grep": {
    273     "values": {
    274       "off": {
    275         "mean": 0.4326,
    276         "effect": 0.0362,
    277         "n": 31
    278       },
    279       "on": {
    280         "mean": 0.394,
    281         "effect": -0.0023,
    282         "n": 479
    283       }
    284     },
    285     "spread": 0.0386
    286   },
    287   "tool_read": {
    288     "values": {
    289       "off": {
    290         "mean": 0.431,
    291         "effect": 0.0346,
    292         "n": 31
    293       },
    294       "on": {
    295         "mean": 0.3941,
    296         "effect": -0.0022,
    297         "n": 479
    298       }
    299     },
    300     "spread": 0.0369
    301   },
    302   "linter": {
    303     "values": {
    304       "off": {
    305         "mean": 0.4231,
    306         "effect": 0.0267,
    307         "n": 39
    308       },
    309       "on": {
    310         "mean": 0.3941,
    311         "effect": -0.0022,
    312         "n": 471
    313       }
    314     },
    315     "spread": 0.029
    316   },
    317   "architecture": {
    318     "values": {
    319       "best_practices": {
    320         "mean": 0.375,
    321         "effect": -0.0213,
    322         "n": 4
    323       },
    324       "none": {
    325         "mean": 0.3968,
    326         "effect": 0.0004,
    327         "n": 501
    328       },
    329       "separation": {
    330         "mean": 0.37,
    331         "effect": -0.0263,
    332         "n": 5
    333       }
    334     },
    335     "spread": 0.0268
    336   },
    337   "human_language": {
    338     "values": {
    339       "en": {
    340         "mean": 0.3978,
    341         "effect": 0.0014,
    342         "n": 481
    343       },
    344       "es": {
    345         "mean": 0.3724,
    346         "effect": -0.0239,
    347         "n": 29
    348       }
    349     },
    350     "spread": 0.0254
    351   },
    352   "tool_edit": {
    353     "values": {
    354       "off": {
    355         "mean": 0.4089,
    356         "effect": 0.0125,
    357         "n": 35
    358       },
    359       "on": {
    360         "mean": 0.3954,
    361         "effect": -0.0009,
    362         "n": 475
    363       }
    364     },
    365     "spread": 0.0135
    366   },
    367   "tool_glob": {
    368     "values": {
    369       "off": {
    370         "mean": 0.4057,
    371         "effect": 0.0093,
    372         "n": 30
    373       },
    374       "on": {
    375         "mean": 0.3957,
    376         "effect": -0.0006,
    377         "n": 480
    378       }
    379     },
    380     "spread": 0.01
    381   },
    382   "design_guidance": {
    383     "values": {
    384       "none": {
    385         "mean": 0.3965,
    386         "effect": 0.0001,
    387         "n": 500
    388       },
    389       "specific": {
    390         "mean": 0.39,
    391         "effect": -0.0063,
    392         "n": 5
    393       },
    394       "vague": {
    395         "mean": 0.39,
    396         "effect": -0.0063,
    397         "n": 5
    398       }
    399     },
    400     "spread": 0.0065
    401   },
    402   "effort": {
    403     "values": {
    404       "high": {
    405         "mean": 0.3964,
    406         "effect": 0.0001,
    407         "n": 491
    408       },
    409       "max": {
    410         "mean": 0.3937,
    411         "effect": -0.0026,
    412         "n": 19
    413       }
    414     },
    415     "spread": 0.0027
    416   },
    417   "tool_write": {
    418     "values": {
    419       "off": {
    420         "mean": 0.3955,
    421         "effect": -0.0009,
    422         "n": 33
    423       },
    424       "on": {
    425         "mean": 0.3964,
    426         "effect": 0.0001,
    427         "n": 477
    428       }
    429     },
    430     "spread": 0.0009
    431   },
    432   "web_search": {
    433     "values": {
    434       "off": {
    435         "mean": 0.3956,
    436         "effect": -0.0008,
    437         "n": 36
    438       },
    439       "on": {
    440         "mean": 0.3964,
    441         "effect": 0.0001,
    442         "n": 474
    443       }
    444     },
    445     "spread": 0.0008
    446   },
    447   "context_file": {
    448     "values": {
    449       "none": {
    450         "mean": 0.3964,
    451         "effect": 0.0,
    452         "n": 479
    453       },
    454       "provided": {
    455         "mean": 0.3958,
    456         "effect": -0.0005,
    457         "n": 31
    458       }
    459     },
    460     "spread": 0.0006
    461   },
    462   "max_budget": {
    463     "values": {
    464       "high": {
    465         "mean": 0.3958,
    466         "effect": -0.0005,
    467         "n": 24
    468       },
    469       "low": {
    470         "mean": 0.3964,
    471         "effect": 0.0,
    472         "n": 486
    473       }
    474     },
    475     "spread": 0.0006
    476   }
    477 }

Impressum · Datenschutz