loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

main_effects_score.json (8316B)


      1 {
      2   "model": {
      3     "values": {
      4       "gemma-4-26b": {
      5         "mean": 0.4963,
      6         "effect": -0.2352,
      7         "n": 43
      8       },
      9       "glm-4.5-air": {
     10         "mean": 0.67,
     11         "effect": -0.0615,
     12         "n": 41
     13       },
     14       "glm-4.7": {
     15         "mean": 0.7107,
     16         "effect": -0.0207,
     17         "n": 81
     18       },
     19       "glm-5.1": {
     20         "mean": 0.754,
     21         "effect": 0.0225,
     22         "n": 123
     23       },
     24       "haiku-4.5": {
     25         "mean": 0.7558,
     26         "effect": 0.0243,
     27         "n": 89
     28       },
     29       "kimi-k2.5": {
     30         "mean": 0.5667,
     31         "effect": -0.1648,
     32         "n": 3
     33       },
     34       "minimax-m2.7": {
     35         "mean": 0.725,
     36         "effect": -0.0065,
     37         "n": 3
     38       },
     39       "opus-4.6": {
     40         "mean": 0.8256,
     41         "effect": 0.0941,
     42         "n": 52
     43       },
     44       "qwen-3.6-plus": {
     45         "mean": 0.7148,
     46         "effect": -0.0167,
     47         "n": 22
     48       },
     49       "sonnet-4.6": {
     50         "mean": 0.8327,
     51         "effect": 0.1013,
     52         "n": 53
     53       }
     54     },
     55     "spread": 0.3364
     56   },
     57   "strategy": {
     58     "values": {
     59       "creative_validate": {
     60         "mean": 0.5981,
     61         "effect": -0.1333,
     62         "n": 8
     63       },
     64       "delegate": {
     65         "mean": 0.7086,
     66         "effect": -0.0229,
     67         "n": 7
     68       },
     69       "iterate": {
     70         "mean": 0.7318,
     71         "effect": 0.0003,
     72         "n": 11
     73       },
     74       "none": {
     75         "mean": 0.7085,
     76         "effect": -0.023,
     77         "n": 300
     78       },
     79       "plan_first": {
     80         "mean": 0.7115,
     81         "effect": -0.02,
     82         "n": 10
     83       },
     84       "review": {
     85         "mean": 0.705,
     86         "effect": -0.0265,
     87         "n": 5
     88       },
     89       "split_work": {
     90         "mean": 0.536,
     91         "effect": -0.1955,
     92         "n": 5
     93       },
     94       "use_subagents": {
     95         "mean": 0.7889,
     96         "effect": 0.0574,
     97         "n": 164
     98       }
     99     },
    100     "spread": 0.2529
    101   },
    102   "provider": {
    103     "values": {
    104       "anthropic": {
    105         "mean": 0.7955,
    106         "effect": 0.064,
    107         "n": 194
    108       },
    109       "openrouter": {
    110         "mean": 0.5766,
    111         "effect": -0.1549,
    112         "n": 71
    113       },
    114       "zai": {
    115         "mean": 0.7256,
    116         "effect": -0.0058,
    117         "n": 245
    118       }
    119     },
    120     "spread": 0.2189
    121   },
    122   "playwright": {
    123     "values": {
    124       "available": {
    125         "mean": 0.7907,
    126         "effect": 0.0592,
    127         "n": 165
    128       },
    129       "instructed": {
    130         "mean": 0.5918,
    131         "effect": -0.1397,
    132         "n": 11
    133       },
    134       "off": {
    135         "mean": 0.7068,
    136         "effect": -0.0246,
    137         "n": 334
    138       }
    139     },
    140     "spread": 0.1989
    141   },
    142   "context_noise": {
    143     "values": {
    144       "clean": {
    145         "mean": 0.731,
    146         "effect": -0.0004,
    147         "n": 477
    148       },
    149       "lorem_100k": {
    150         "mean": 0.6842,
    151         "effect": -0.0473,
    152         "n": 6
    153       },
    154       "lorem_10k": {
    155         "mean": 0.7492,
    156         "effect": 0.0177,
    157         "n": 6
    158       },
    159       "lorem_1k": {
    160         "mean": 0.7783,
    161         "effect": 0.0469,
    162         "n": 3
    163       },
    164       "lorem_50k": {
    165         "mean": 0.6742,
    166         "effect": -0.0573,
    167         "n": 6
    168       },
    169       "wikipedia_100k": {
    170         "mean": 0.69,
    171         "effect": -0.0415,
    172         "n": 3
    173       },
    174       "wikipedia_10k": {
    175         "mean": 0.7483,
    176         "effect": 0.0169,
    177         "n": 3
    178       },
    179       "wikipedia_1k": {
    180         "mean": 0.85,
    181         "effect": 0.1185,
    182         "n": 3
    183       },
    184       "wikipedia_50k": {
    185         "mean": 0.835,
    186         "effect": 0.1035,
    187         "n": 3
    188       }
    189     },
    190     "spread": 0.1758
    191   },
    192   "architecture": {
    193     "values": {
    194       "best_practices": {
    195         "mean": 0.795,
    196         "effect": 0.0635,
    197         "n": 4
    198       },
    199       "none": {
    200         "mean": 0.7319,
    201         "effect": 0.0004,
    202         "n": 501
    203       },
    204       "separation": {
    205         "mean": 0.638,
    206         "effect": -0.0935,
    207         "n": 5
    208       }
    209     },
    210     "spread": 0.157
    211   },
    212   "design_guidance": {
    213     "values": {
    214       "none": {
    215         "mean": 0.7323,
    216         "effect": 0.0008,
    217         "n": 500
    218       },
    219       "specific": {
    220         "mean": 0.62,
    221         "effect": -0.1115,
    222         "n": 5
    223       },
    224       "vague": {
    225         "mean": 0.762,
    226         "effect": 0.0305,
    227         "n": 5
    228       }
    229     },
    230     "spread": 0.142
    231   },
    232   "renderer": {
    233     "values": {
    234       "canvas": {
    235         "mean": 0.6971,
    236         "effect": -0.0343,
    237         "n": 7
    238       },
    239       "dom": {
    240         "mean": 0.749,
    241         "effect": 0.0175,
    242         "n": 5
    243       },
    244       "none": {
    245         "mean": 0.7333,
    246         "effect": 0.0018,
    247         "n": 487
    248       },
    249       "svg": {
    250         "mean": 0.6364,
    251         "effect": -0.095,
    252         "n": 7
    253       },
    254       "webgl": {
    255         "mean": 0.7137,
    256         "effect": -0.0177,
    257         "n": 4
    258       }
    259     },
    260     "spread": 0.1126
    261   },
    262   "prompt_style": {
    263     "values": {
    264       "detailed": {
    265         "mean": 0.8182,
    266         "effect": 0.0867,
    267         "n": 30
    268       },
    269       "simple": {
    270         "mean": 0.7261,
    271         "effect": -0.0054,
    272         "n": 480
    273       }
    274     },
    275     "spread": 0.0921
    276   },
    277   "language": {
    278     "values": {
    279       "javascript": {
    280         "mean": 0.8033,
    281         "effect": 0.0719,
    282         "n": 21
    283       },
    284       "typescript": {
    285         "mean": 0.7257,
    286         "effect": -0.0058,
    287         "n": 469
    288       },
    289       "unspecified": {
    290         "mean": 0.7923,
    291         "effect": 0.0608,
    292         "n": 20
    293       }
    294     },
    295     "spread": 0.0776
    296   },
    297   "human_language": {
    298     "values": {
    299       "en": {
    300         "mean": 0.7282,
    301         "effect": -0.0032,
    302         "n": 481
    303       },
    304       "es": {
    305         "mean": 0.7853,
    306         "effect": 0.0539,
    307         "n": 29
    308       }
    309     },
    310     "spread": 0.0571
    311   },
    312   "context_file": {
    313     "values": {
    314       "none": {
    315         "mean": 0.7284,
    316         "effect": -0.0031,
    317         "n": 479
    318       },
    319       "provided": {
    320         "mean": 0.7792,
    321         "effect": 0.0477,
    322         "n": 31
    323       }
    324     },
    325     "spread": 0.0508
    326   },
    327   "web_search": {
    328     "values": {
    329       "off": {
    330         "mean": 0.7747,
    331         "effect": 0.0433,
    332         "n": 36
    333       },
    334       "on": {
    335         "mean": 0.7282,
    336         "effect": -0.0033,
    337         "n": 474
    338       }
    339     },
    340     "spread": 0.0465
    341   },
    342   "tool_edit": {
    343     "values": {
    344       "off": {
    345         "mean": 0.7679,
    346         "effect": 0.0364,
    347         "n": 35
    348       },
    349       "on": {
    350         "mean": 0.7288,
    351         "effect": -0.0027,
    352         "n": 475
    353       }
    354     },
    355     "spread": 0.0391
    356   },
    357   "tool_grep": {
    358     "values": {
    359       "off": {
    360         "mean": 0.7668,
    361         "effect": 0.0353,
    362         "n": 31
    363       },
    364       "on": {
    365         "mean": 0.7292,
    366         "effect": -0.0023,
    367         "n": 479
    368       }
    369     },
    370     "spread": 0.0376
    371   },
    372   "max_budget": {
    373     "values": {
    374       "high": {
    375         "mean": 0.7583,
    376         "effect": 0.0269,
    377         "n": 24
    378       },
    379       "low": {
    380         "mean": 0.7301,
    381         "effect": -0.0013,
    382         "n": 486
    383       }
    384     },
    385     "spread": 0.0282
    386   },
    387   "tool_read": {
    388     "values": {
    389       "off": {
    390         "mean": 0.7573,
    391         "effect": 0.0258,
    392         "n": 31
    393       },
    394       "on": {
    395         "mean": 0.7298,
    396         "effect": -0.0017,
    397         "n": 479
    398       }
    399     },
    400     "spread": 0.0275
    401   },
    402   "error_checking": {
    403     "values": {
    404       "none": {
    405         "mean": 0.7316,
    406         "effect": 0.0002,
    407         "n": 506
    408       },
    409       "self_verify": {
    410         "mean": 0.71,
    411         "effect": -0.0215,
    412         "n": 4
    413       }
    414     },
    415     "spread": 0.0216
    416   },
    417   "effort": {
    418     "values": {
    419       "high": {
    420         "mean": 0.7323,
    421         "effect": 0.0008,
    422         "n": 491
    423       },
    424       "max": {
    425         "mean": 0.7111,
    426         "effect": -0.0204,
    427         "n": 19
    428       }
    429     },
    430     "spread": 0.0212
    431   },
    432   "linter": {
    433     "values": {
    434       "off": {
    435         "mean": 0.7396,
    436         "effect": 0.0081,
    437         "n": 39
    438       },
    439       "on": {
    440         "mean": 0.7308,
    441         "effect": -0.0007,
    442         "n": 471
    443       }
    444     },
    445     "spread": 0.0088
    446   },
    447   "tool_glob": {
    448     "values": {
    449       "off": {
    450         "mean": 0.7267,
    451         "effect": -0.0048,
    452         "n": 30
    453       },
    454       "on": {
    455         "mean": 0.7318,
    456         "effect": 0.0003,
    457         "n": 480
    458       }
    459     },
    460     "spread": 0.0051
    461   },
    462   "tool_write": {
    463     "values": {
    464       "off": {
    465         "mean": 0.7341,
    466         "effect": 0.0026,
    467         "n": 33
    468       },
    469       "on": {
    470         "mean": 0.7313,
    471         "effect": -0.0002,
    472         "n": 477
    473       }
    474     },
    475     "spread": 0.0028
    476   }
    477 }

Impressum · Datenschutz