loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

main_effects_gameplay.json (8302B)


      1 {
      2   "model": {
      3     "values": {
      4       "gemma-4-26b": {
      5         "mean": 0.0926,
      6         "effect": -0.5699,
      7         "n": 43
      8       },
      9       "glm-4.5-air": {
     10         "mean": 0.5924,
     11         "effect": -0.0701,
     12         "n": 41
     13       },
     14       "glm-4.7": {
     15         "mean": 0.7019,
     16         "effect": 0.0394,
     17         "n": 81
     18       },
     19       "glm-5.1": {
     20         "mean": 0.6854,
     21         "effect": 0.023,
     22         "n": 123
     23       },
     24       "haiku-4.5": {
     25         "mean": 0.7642,
     26         "effect": 0.1017,
     27         "n": 89
     28       },
     29       "kimi-k2.5": {
     30         "mean": 0.67,
     31         "effect": 0.0075,
     32         "n": 3
     33       },
     34       "minimax-m2.7": {
     35         "mean": 0.4667,
     36         "effect": -0.1958,
     37         "n": 3
     38       },
     39       "opus-4.6": {
     40         "mean": 0.8331,
     41         "effect": 0.1706,
     42         "n": 52
     43       },
     44       "qwen-3.6-plus": {
     45         "mean": 0.5236,
     46         "effect": -0.1389,
     47         "n": 22
     48       },
     49       "sonnet-4.6": {
     50         "mean": 0.7958,
     51         "effect": 0.1334,
     52         "n": 53
     53       }
     54     },
     55     "spread": 0.7405
     56   },
     57   "strategy": {
     58     "values": {
     59       "creative_validate": {
     60         "mean": 0.46,
     61         "effect": -0.2025,
     62         "n": 8
     63       },
     64       "delegate": {
     65         "mean": 0.6071,
     66         "effect": -0.0553,
     67         "n": 7
     68       },
     69       "iterate": {
     70         "mean": 0.59,
     71         "effect": -0.0725,
     72         "n": 11
     73       },
     74       "none": {
     75         "mean": 0.625,
     76         "effect": -0.0375,
     77         "n": 300
     78       },
     79       "plan_first": {
     80         "mean": 0.563,
     81         "effect": -0.0995,
     82         "n": 10
     83       },
     84       "review": {
     85         "mean": 0.536,
     86         "effect": -0.1265,
     87         "n": 5
     88       },
     89       "split_work": {
     90         "mean": 0.2,
     91         "effect": -0.4625,
     92         "n": 5
     93       },
     94       "use_subagents": {
     95         "mean": 0.7723,
     96         "effect": 0.1098,
     97         "n": 164
     98       }
     99     },
    100     "spread": 0.5723
    101   },
    102   "provider": {
    103     "values": {
    104       "anthropic": {
    105         "mean": 0.7913,
    106         "effect": 0.1288,
    107         "n": 194
    108       },
    109       "openrouter": {
    110         "mean": 0.2663,
    111         "effect": -0.3962,
    112         "n": 71
    113       },
    114       "zai": {
    115         "mean": 0.6753,
    116         "effect": 0.0128,
    117         "n": 245
    118       }
    119     },
    120     "spread": 0.525
    121   },
    122   "playwright": {
    123     "values": {
    124       "available": {
    125         "mean": 0.7867,
    126         "effect": 0.1242,
    127         "n": 165
    128       },
    129       "instructed": {
    130         "mean": 0.4691,
    131         "effect": -0.1934,
    132         "n": 11
    133       },
    134       "off": {
    135         "mean": 0.6075,
    136         "effect": -0.055,
    137         "n": 334
    138       }
    139     },
    140     "spread": 0.3176
    141   },
    142   "renderer": {
    143     "values": {
    144       "canvas": {
    145         "mean": 0.6729,
    146         "effect": 0.0104,
    147         "n": 7
    148       },
    149       "dom": {
    150         "mean": 0.838,
    151         "effect": 0.1755,
    152         "n": 5
    153       },
    154       "none": {
    155         "mean": 0.6631,
    156         "effect": 0.0007,
    157         "n": 487
    158       },
    159       "svg": {
    160         "mean": 0.5257,
    161         "effect": -0.1368,
    162         "n": 7
    163       },
    164       "webgl": {
    165         "mean": 0.585,
    166         "effect": -0.0775,
    167         "n": 4
    168       }
    169     },
    170     "spread": 0.3123
    171   },
    172   "architecture": {
    173     "values": {
    174       "best_practices": {
    175         "mean": 0.665,
    176         "effect": 0.0025,
    177         "n": 4
    178       },
    179       "none": {
    180         "mean": 0.6652,
    181         "effect": 0.0028,
    182         "n": 501
    183       },
    184       "separation": {
    185         "mean": 0.384,
    186         "effect": -0.2785,
    187         "n": 5
    188       }
    189     },
    190     "spread": 0.2812
    191   },
    192   "design_guidance": {
    193     "values": {
    194       "none": {
    195         "mean": 0.6647,
    196         "effect": 0.0022,
    197         "n": 500
    198       },
    199       "specific": {
    200         "mean": 0.46,
    201         "effect": -0.2025,
    202         "n": 5
    203       },
    204       "vague": {
    205         "mean": 0.646,
    206         "effect": -0.0165,
    207         "n": 5
    208       }
    209     },
    210     "spread": 0.2047
    211   },
    212   "prompt_style": {
    213     "values": {
    214       "detailed": {
    215         "mean": 0.8383,
    216         "effect": 0.1758,
    217         "n": 30
    218       },
    219       "simple": {
    220         "mean": 0.6515,
    221         "effect": -0.011,
    222         "n": 480
    223       }
    224     },
    225     "spread": 0.1868
    226   },
    227   "context_noise": {
    228     "values": {
    229       "clean": {
    230         "mean": 0.6618,
    231         "effect": -0.0007,
    232         "n": 477
    233       },
    234       "lorem_100k": {
    235         "mean": 0.605,
    236         "effect": -0.0575,
    237         "n": 6
    238       },
    239       "lorem_10k": {
    240         "mean": 0.725,
    241         "effect": 0.0625,
    242         "n": 6
    243       },
    244       "lorem_1k": {
    245         "mean": 0.67,
    246         "effect": 0.0075,
    247         "n": 3
    248       },
    249       "lorem_50k": {
    250         "mean": 0.6133,
    251         "effect": -0.0492,
    252         "n": 6
    253       },
    254       "wikipedia_100k": {
    255         "mean": 0.5833,
    256         "effect": -0.0792,
    257         "n": 3
    258       },
    259       "wikipedia_10k": {
    260         "mean": 0.7633,
    261         "effect": 0.1008,
    262         "n": 3
    263       },
    264       "wikipedia_1k": {
    265         "mean": 0.7333,
    266         "effect": 0.0708,
    267         "n": 3
    268       },
    269       "wikipedia_50k": {
    270         "mean": 0.7633,
    271         "effect": 0.1008,
    272         "n": 3
    273       }
    274     },
    275     "spread": 0.18
    276   },
    277   "human_language": {
    278     "values": {
    279       "en": {
    280         "mean": 0.6555,
    281         "effect": -0.007,
    282         "n": 481
    283       },
    284       "es": {
    285         "mean": 0.7783,
    286         "effect": 0.1158,
    287         "n": 29
    288       }
    289     },
    290     "spread": 0.1228
    291   },
    292   "max_budget": {
    293     "values": {
    294       "high": {
    295         "mean": 0.7746,
    296         "effect": 0.1121,
    297         "n": 24
    298       },
    299       "low": {
    300         "mean": 0.657,
    301         "effect": -0.0055,
    302         "n": 486
    303       }
    304     },
    305     "spread": 0.1176
    306   },
    307   "web_search": {
    308     "values": {
    309       "off": {
    310         "mean": 0.7697,
    311         "effect": 0.1072,
    312         "n": 36
    313       },
    314       "on": {
    315         "mean": 0.6543,
    316         "effect": -0.0081,
    317         "n": 474
    318       }
    319     },
    320     "spread": 0.1154
    321   },
    322   "language": {
    323     "values": {
    324       "javascript": {
    325         "mean": 0.7552,
    326         "effect": 0.0927,
    327         "n": 21
    328       },
    329       "typescript": {
    330         "mean": 0.6577,
    331         "effect": -0.0048,
    332         "n": 469
    333       },
    334       "unspecified": {
    335         "mean": 0.6775,
    336         "effect": 0.015,
    337         "n": 20
    338       }
    339     },
    340     "spread": 0.0975
    341   },
    342   "context_file": {
    343     "values": {
    344       "none": {
    345         "mean": 0.6577,
    346         "effect": -0.0048,
    347         "n": 479
    348       },
    349       "provided": {
    350         "mean": 0.7365,
    351         "effect": 0.074,
    352         "n": 31
    353       }
    354     },
    355     "spread": 0.0788
    356   },
    357   "tool_edit": {
    358     "values": {
    359       "off": {
    360         "mean": 0.722,
    361         "effect": 0.0595,
    362         "n": 35
    363       },
    364       "on": {
    365         "mean": 0.6581,
    366         "effect": -0.0044,
    367         "n": 475
    368       }
    369     },
    370     "spread": 0.0639
    371   },
    372   "tool_read": {
    373     "values": {
    374       "off": {
    375         "mean": 0.7087,
    376         "effect": 0.0462,
    377         "n": 31
    378       },
    379       "on": {
    380         "mean": 0.6595,
    381         "effect": -0.003,
    382         "n": 479
    383       }
    384     },
    385     "spread": 0.0492
    386   },
    387   "tool_grep": {
    388     "values": {
    389       "off": {
    390         "mean": 0.6971,
    391         "effect": 0.0346,
    392         "n": 31
    393       },
    394       "on": {
    395         "mean": 0.6603,
    396         "effect": -0.0022,
    397         "n": 479
    398       }
    399     },
    400     "spread": 0.0368
    401   },
    402   "tool_glob": {
    403     "values": {
    404       "off": {
    405         "mean": 0.692,
    406         "effect": 0.0295,
    407         "n": 30
    408       },
    409       "on": {
    410         "mean": 0.6606,
    411         "effect": -0.0018,
    412         "n": 480
    413       }
    414     },
    415     "spread": 0.0314
    416   },
    417   "tool_write": {
    418     "values": {
    419       "off": {
    420         "mean": 0.6858,
    421         "effect": 0.0233,
    422         "n": 33
    423       },
    424       "on": {
    425         "mean": 0.6609,
    426         "effect": -0.0016,
    427         "n": 477
    428       }
    429     },
    430     "spread": 0.0249
    431   },
    432   "linter": {
    433     "values": {
    434       "off": {
    435         "mean": 0.6828,
    436         "effect": 0.0203,
    437         "n": 39
    438       },
    439       "on": {
    440         "mean": 0.6608,
    441         "effect": -0.0017,
    442         "n": 471
    443       }
    444     },
    445     "spread": 0.022
    446   },
    447   "error_checking": {
    448     "values": {
    449       "none": {
    450         "mean": 0.6624,
    451         "effect": -0.0001,
    452         "n": 506
    453       },
    454       "self_verify": {
    455         "mean": 0.67,
    456         "effect": 0.0075,
    457         "n": 4
    458       }
    459     },
    460     "spread": 0.0076
    461   },
    462   "effort": {
    463     "values": {
    464       "high": {
    465         "mean": 0.6624,
    466         "effect": -0.0001,
    467         "n": 491
    468       },
    469       "max": {
    470         "mean": 0.6653,
    471         "effect": 0.0028,
    472         "n": 19
    473       }
    474     },
    475     "spread": 0.0029
    476   }
    477 }

Impressum · Datenschutz