loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

main_effects_turns.json (8350B)


      1 {
      2   "architecture": {
      3     "values": {
      4       "best_practices": {
      5         "mean": 50.0,
      6         "effect": 24.502,
      7         "n": 4
      8       },
      9       "none": {
     10         "mean": 25.3413,
     11         "effect": -0.1567,
     12         "n": 501
     13       },
     14       "separation": {
     15         "mean": 21.6,
     16         "effect": -3.898,
     17         "n": 5
     18       }
     19     },
     20     "spread": 28.4
     21   },
     22   "strategy": {
     23     "values": {
     24       "creative_validate": {
     25         "mean": 41.875,
     26         "effect": 16.377,
     27         "n": 8
     28       },
     29       "delegate": {
     30         "mean": 38.5714,
     31         "effect": 13.0734,
     32         "n": 7
     33       },
     34       "iterate": {
     35         "mean": 40.0909,
     36         "effect": 14.5929,
     37         "n": 11
     38       },
     39       "none": {
     40         "mean": 24.5733,
     41         "effect": -0.9247,
     42         "n": 300
     43       },
     44       "plan_first": {
     45         "mean": 36.0,
     46         "effect": 10.502,
     47         "n": 10
     48       },
     49       "review": {
     50         "mean": 46.0,
     51         "effect": 20.502,
     52         "n": 5
     53       },
     54       "split_work": {
     55         "mean": 49.2,
     56         "effect": 23.702,
     57         "n": 5
     58       },
     59       "use_subagents": {
     60         "mean": 22.8659,
     61         "effect": -2.6322,
     62         "n": 164
     63       }
     64     },
     65     "spread": 26.3341
     66   },
     67   "model": {
     68     "values": {
     69       "gemma-4-26b": {
     70         "mean": 45.2558,
     71         "effect": 19.7578,
     72         "n": 43
     73       },
     74       "glm-4.5-air": {
     75         "mean": 23.6585,
     76         "effect": -1.8395,
     77         "n": 41
     78       },
     79       "glm-4.7": {
     80         "mean": 27.4938,
     81         "effect": 1.9958,
     82         "n": 81
     83       },
     84       "glm-5.1": {
     85         "mean": 22.0569,
     86         "effect": -3.4411,
     87         "n": 123
     88       },
     89       "haiku-4.5": {
     90         "mean": 27.5393,
     91         "effect": 2.0413,
     92         "n": 89
     93       },
     94       "kimi-k2.5": {
     95         "mean": 27.0,
     96         "effect": 1.502,
     97         "n": 3
     98       },
     99       "minimax-m2.7": {
    100         "mean": 32.6667,
    101         "effect": 7.1686,
    102         "n": 3
    103       },
    104       "opus-4.6": {
    105         "mean": 19.4808,
    106         "effect": -6.0173,
    107         "n": 52
    108       },
    109       "qwen-3.6-plus": {
    110         "mean": 20.3182,
    111         "effect": -5.1799,
    112         "n": 22
    113       },
    114       "sonnet-4.6": {
    115         "mean": 19.9623,
    116         "effect": -5.5358,
    117         "n": 53
    118       }
    119     },
    120     "spread": 25.775
    121   },
    122   "playwright": {
    123     "values": {
    124       "available": {
    125         "mean": 22.5697,
    126         "effect": -2.9283,
    127         "n": 165
    128       },
    129       "instructed": {
    130         "mean": 43.8182,
    131         "effect": 18.3201,
    132         "n": 11
    133       },
    134       "off": {
    135         "mean": 26.3413,
    136         "effect": 0.8433,
    137         "n": 334
    138       }
    139     },
    140     "spread": 21.2485
    141   },
    142   "error_checking": {
    143     "values": {
    144       "none": {
    145         "mean": 25.3715,
    146         "effect": -0.1265,
    147         "n": 506
    148       },
    149       "self_verify": {
    150         "mean": 41.5,
    151         "effect": 16.002,
    152         "n": 4
    153       }
    154     },
    155     "spread": 16.1285
    156   },
    157   "language": {
    158     "values": {
    159       "javascript": {
    160         "mean": 13.1905,
    161         "effect": -12.3076,
    162         "n": 21
    163       },
    164       "typescript": {
    165         "mean": 26.6716,
    166         "effect": 1.1736,
    167         "n": 469
    168       },
    169       "unspecified": {
    170         "mean": 10.9,
    171         "effect": -14.598,
    172         "n": 20
    173       }
    174     },
    175     "spread": 15.7716
    176   },
    177   "provider": {
    178     "values": {
    179       "anthropic": {
    180         "mean": 23.3093,
    181         "effect": -2.1888,
    182         "n": 194
    183       },
    184       "openrouter": {
    185         "mean": 36.2254,
    186         "effect": 10.7273,
    187         "n": 71
    188       },
    189       "zai": {
    190         "mean": 24.1224,
    191         "effect": -1.3756,
    192         "n": 245
    193       }
    194     },
    195     "spread": 12.9161
    196   },
    197   "context_noise": {
    198     "values": {
    199       "clean": {
    200         "mean": 25.9182,
    201         "effect": 0.4202,
    202         "n": 477
    203       },
    204       "lorem_100k": {
    205         "mean": 21.8333,
    206         "effect": -3.6647,
    207         "n": 6
    208       },
    209       "lorem_10k": {
    210         "mean": 20.5,
    211         "effect": -4.998,
    212         "n": 6
    213       },
    214       "lorem_1k": {
    215         "mean": 23.3333,
    216         "effect": -2.1647,
    217         "n": 3
    218       },
    219       "lorem_50k": {
    220         "mean": 19.3333,
    221         "effect": -6.1647,
    222         "n": 6
    223       },
    224       "wikipedia_100k": {
    225         "mean": 15.6667,
    226         "effect": -9.8314,
    227         "n": 3
    228       },
    229       "wikipedia_10k": {
    230         "mean": 15.0,
    231         "effect": -10.498,
    232         "n": 3
    233       },
    234       "wikipedia_1k": {
    235         "mean": 16.0,
    236         "effect": -9.498,
    237         "n": 3
    238       },
    239       "wikipedia_50k": {
    240         "mean": 20.3333,
    241         "effect": -5.1647,
    242         "n": 3
    243       }
    244     },
    245     "spread": 10.9182
    246   },
    247   "design_guidance": {
    248     "values": {
    249       "none": {
    250         "mean": 25.572,
    251         "effect": 0.074,
    252         "n": 500
    253       },
    254       "specific": {
    255         "mean": 18.0,
    256         "effect": -7.498,
    257         "n": 5
    258       },
    259       "vague": {
    260         "mean": 25.6,
    261         "effect": 0.102,
    262         "n": 5
    263       }
    264     },
    265     "spread": 7.6
    266   },
    267   "prompt_style": {
    268     "values": {
    269       "detailed": {
    270         "mean": 18.9,
    271         "effect": -6.598,
    272         "n": 30
    273       },
    274       "simple": {
    275         "mean": 25.9104,
    276         "effect": 0.4124,
    277         "n": 480
    278       }
    279     },
    280     "spread": 7.0104
    281   },
    282   "linter": {
    283     "values": {
    284       "off": {
    285         "mean": 20.6667,
    286         "effect": -4.8314,
    287         "n": 39
    288       },
    289       "on": {
    290         "mean": 25.8981,
    291         "effect": 0.4,
    292         "n": 471
    293       }
    294     },
    295     "spread": 5.2314
    296   },
    297   "renderer": {
    298     "values": {
    299       "canvas": {
    300         "mean": 28.1429,
    301         "effect": 2.6448,
    302         "n": 7
    303       },
    304       "dom": {
    305         "mean": 27.4,
    306         "effect": 1.902,
    307         "n": 5
    308       },
    309       "none": {
    310         "mean": 25.3778,
    311         "effect": -0.1202,
    312         "n": 487
    313       },
    314       "svg": {
    315         "mean": 27.0,
    316         "effect": 1.502,
    317         "n": 7
    318       },
    319       "webgl": {
    320         "mean": 30.5,
    321         "effect": 5.002,
    322         "n": 4
    323       }
    324     },
    325     "spread": 5.1222
    326   },
    327   "max_budget": {
    328     "values": {
    329       "high": {
    330         "mean": 21.7083,
    331         "effect": -3.7897,
    332         "n": 24
    333       },
    334       "low": {
    335         "mean": 25.6852,
    336         "effect": 0.1871,
    337         "n": 486
    338       }
    339     },
    340     "spread": 3.9769
    341   },
    342   "context_file": {
    343     "values": {
    344       "none": {
    345         "mean": 25.6743,
    346         "effect": 0.1763,
    347         "n": 479
    348       },
    349       "provided": {
    350         "mean": 22.7742,
    351         "effect": -2.7238,
    352         "n": 31
    353       }
    354     },
    355     "spread": 2.9001
    356   },
    357   "human_language": {
    358     "values": {
    359       "en": {
    360         "mean": 25.657,
    361         "effect": 0.1589,
    362         "n": 481
    363       },
    364       "es": {
    365         "mean": 22.8621,
    366         "effect": -2.636,
    367         "n": 29
    368       }
    369     },
    370     "spread": 2.7949
    371   },
    372   "tool_glob": {
    373     "values": {
    374       "off": {
    375         "mean": 27.9333,
    376         "effect": 2.4353,
    377         "n": 30
    378       },
    379       "on": {
    380         "mean": 25.3458,
    381         "effect": -0.1522,
    382         "n": 480
    383       }
    384     },
    385     "spread": 2.5875
    386   },
    387   "tool_grep": {
    388     "values": {
    389       "off": {
    390         "mean": 27.4516,
    391         "effect": 1.9536,
    392         "n": 31
    393       },
    394       "on": {
    395         "mean": 25.3716,
    396         "effect": -0.1264,
    397         "n": 479
    398       }
    399     },
    400     "spread": 2.08
    401   },
    402   "web_search": {
    403     "values": {
    404       "off": {
    405         "mean": 24.0278,
    406         "effect": -1.4703,
    407         "n": 36
    408       },
    409       "on": {
    410         "mean": 25.6097,
    411         "effect": 0.1117,
    412         "n": 474
    413       }
    414     },
    415     "spread": 1.5819
    416   },
    417   "tool_write": {
    418     "values": {
    419       "off": {
    420         "mean": 26.6364,
    421         "effect": 1.1383,
    422         "n": 33
    423       },
    424       "on": {
    425         "mean": 25.4193,
    426         "effect": -0.0788,
    427         "n": 477
    428       }
    429     },
    430     "spread": 1.2171
    431   },
    432   "tool_read": {
    433     "values": {
    434       "off": {
    435         "mean": 26.2903,
    436         "effect": 0.7923,
    437         "n": 31
    438       },
    439       "on": {
    440         "mean": 25.4468,
    441         "effect": -0.0513,
    442         "n": 479
    443       }
    444     },
    445     "spread": 0.8435
    446   },
    447   "effort": {
    448     "values": {
    449       "high": {
    450         "mean": 25.5132,
    451         "effect": 0.0152,
    452         "n": 491
    453       },
    454       "max": {
    455         "mean": 25.1053,
    456         "effect": -0.3928,
    457         "n": 19
    458       }
    459     },
    460     "spread": 0.4079
    461   },
    462   "tool_edit": {
    463     "values": {
    464       "off": {
    465         "mean": 25.1429,
    466         "effect": -0.3552,
    467         "n": 35
    468       },
    469       "on": {
    470         "mean": 25.5242,
    471         "effect": 0.0262,
    472         "n": 475
    473       }
    474     },
    475     "spread": 0.3813
    476   }
    477 }

Impressum · Datenschutz