grid.yaml - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

grid.yaml (7560B)
      1 version: 1  # NOTE(review): presumably the schema version of this grid file -- confirm with the loader
      2 
      3 defaults:  # baseline settings; every profile in this file also sets its own runs_per_cell
      4   runs_per_cell: 3
      5   timeout_seconds: 1200  # 20 minutes
      6   budget:  # keys match the values of the max_budget axis (low, high)
      7     low: 2.00  # NOTE(review): unit is not stated in this file -- presumably USD; confirm
      8     high: 10.00
      9 
     10 axes:  # every experiment dimension with the complete list of values it may take
     11   model:  # display names; the providers section refers to these same strings
     12     values: ["haiku-4.5", "sonnet-4.6", "opus-4.6", "glm-4.5-air", "glm-4.7", "glm-5.1", "qwen-3.6-plus", "gemma-4-26b", "minimax-m2.7", "kimi-k2.5"]
     13   effort:
     14     values: ["high", "max"]
     15   prompt_style:
     16     values: ["simple", "detailed"]
     17   language:
     18     values: ["typescript", "javascript", "unspecified"]
     19   human_language:
     20     values: ["en", "es"]
     21   tool_read:
     22     values: ["on", "off"]  # quoted: bare on/off are booleans to YAML 1.1 loaders
     23   tool_write:
     24     values: ["on", "off"]
     25   tool_edit:
     26     values: ["on", "off"]
     27   tool_glob:
     28     values: ["on", "off"]
     29   tool_grep:
     30     values: ["on", "off"]
     31   linter:
     32     values: ["on", "off"]
     33   playwright:  # three states, not a plain on/off toggle
     34     values: ["off", "available", "instructed"]
     35   context_file:
     36     values: ["none", "provided"]
     37   web_search:
     38     values: ["on", "off"]
     39   max_budget:  # values are the keys of defaults.budget
     40     values: ["low", "high"]
     41   tests_provided:
     42     values: ["none", "a_few", "many"]
     43   strategy:  # "compete" is listed here but matched by an unconditional rule under exclusions
     44     values: ["none", "plan_first", "iterate", "creative_validate", "use_subagents", "delegate", "review", "compete", "split_work"]
     45   design_guidance:
     46     values: ["none", "vague", "specific"]
     47   architecture:
     48     values: ["none", "separation", "best_practices"]
     49   error_checking:
     50     values: ["none", "self_verify"]
     51   context_noise:  # NOTE(review): suffixes look like sizes (1k..100k) and percentages (25/50/75) -- confirm
     52     values: ["clean", "wikipedia_1k", "wikipedia_10k", "wikipedia_50k", "wikipedia_100k", "wikipedia_25", "wikipedia_50", "wikipedia_75", "lorem_1k", "lorem_10k", "lorem_50k", "lorem_100k", "lorem_25", "lorem_50", "lorem_75"]
     53   renderer:
     54     values: ["none", "canvas", "svg", "dom", "webgl"]
     55   provider:  # values are the keys of the providers section
     56     values: ["anthropic", "zai", "openrouter"]
     57 
     58 providers:  # one entry per value of the provider axis
     59   anthropic:
     60     # Maps display names to CLI --model args
     61     cli_model_map:
     62       "haiku-4.5": "haiku"
     63       "sonnet-4.6": "sonnet"
     64       "opus-4.6": "opus"
     65   zai:
     66     base_url: "https://api.z.ai/api/anthropic"
     67     api_key_env: "ZAI_API_KEY"  # an environment variable name, not the key itself
     68     models: ["glm-4.5-air", "glm-4.7", "glm-5.1"]  # plain list; unlike the other providers there is no cli_model_map
     69   openrouter:
     70     base_url: "http://localhost:4000"  # NOTE(review): plain-HTTP localhost endpoint -- presumably a local proxy in front of OpenRouter; confirm
     71     auth_token: "dummy"  # NOTE(review): literal placeholder, unlike zai's api_key_env -- confirm the real credential is supplied elsewhere
     72     cli_model_map:
     73       "qwen-3.6-plus": "openrouter/qwen/qwen3.6-plus"
     74       "gemma-4-26b": "openrouter/google/gemma-4-26b-a4b-it"
     75       "minimax-m2.7": "openrouter/minimax/minimax-m2.7"
     76       "kimi-k2.5": "openrouter/moonshotai/kimi-k2.5"
     77 
     78 exclusions:  # NOTE(review): presumably a cell is dropped when it matches every key of any one rule -- confirm
     79   # Haiku does not support extended thinking
     80   - when:
     81       model: "haiku-4.5"
     82       effort: max
     83   - when:  # NOTE(review): presumably the provided tests need Playwright to run -- confirm
     84       tests_provided: a_few
     85       playwright: "off"
     86   - when:
     87       tests_provided: many
     88       playwright: "off"
     89   - when:  # matches every cell whose strategy is compete; the reason is not recorded here
     90       strategy: compete
     91   # GLM models only with zai provider
     92   - when: { provider: anthropic, model: "glm-4.5-air" }
     93   - when: { provider: anthropic, model: "glm-4.7" }
     94   - when: { provider: anthropic, model: "glm-5.1" }
     95   - when: { provider: openrouter, model: "glm-4.5-air" }
     96   - when: { provider: openrouter, model: "glm-4.7" }
     97   - when: { provider: openrouter, model: "glm-5.1" }
     98   # Anthropic models only with anthropic provider
     99   - when: { provider: zai, model: "haiku-4.5" }
    100   - when: { provider: zai, model: "sonnet-4.6" }
    101   - when: { provider: zai, model: "opus-4.6" }
    102   - when: { provider: openrouter, model: "haiku-4.5" }
    103   - when: { provider: openrouter, model: "sonnet-4.6" }
    104   - when: { provider: openrouter, model: "opus-4.6" }
    105   # OpenRouter models only with openrouter
    106   - when: { provider: anthropic, model: "qwen-3.6-plus" }
    107   - when: { provider: anthropic, model: "gemma-4-26b" }
    108   - when: { provider: anthropic, model: "minimax-m2.7" }
    109   - when: { provider: anthropic, model: "kimi-k2.5" }
    110   - when: { provider: zai, model: "qwen-3.6-plus" }
    111   - when: { provider: zai, model: "gemma-4-26b" }
    112   - when: { provider: zai, model: "minimax-m2.7" }
    113   - when: { provider: zai, model: "kimi-k2.5" }
    114 
    115 tasks:  # a single task is listed
    116   - tetris
    117 
    118 profiles:  # named subsets of the grid; each one narrows the top-level axes
    119   smoke:
    120     description: "Quick validation -- minimal grid"
    121     axes:
    122       model: ["haiku-4.5"]
    123       effort: ["high"]
    124       prompt_style: ["simple", "detailed"]  # the only axis here with more than one value
    125       language: ["typescript"]
    126       human_language: ["en"]
    127       tool_read: ["on"]
    128       tool_write: ["on"]
    129       tool_edit: ["on"]
    130       tool_glob: ["on"]
    131       tool_grep: ["on"]
    132       linter: ["off"]
    133       playwright: ["off"]
    134       context_file: ["none"]
    135       web_search: ["off"]
    136       max_budget: ["low"]
    137       tests_provided: ["none"]
    138       strategy: ["none"]
    139       design_guidance: ["none"]
    140       architecture: ["none"]
    141       error_checking: ["none"]
    142       context_noise: ["clean"]
    143       renderer: ["none"]
    144       provider: ["anthropic"]  # "zai" removed: the exclusions forbid zai + haiku-4.5, so it could never yield a cell
    145     runs_per_cell: 1
    146 
    147   zai-smoke:
    148     description: "Quick validation for Z.AI GLM models"
    149     axes:
    150       model: ["glm-4.5-air", "glm-4.7", "glm-5.1"]  # the same three names as providers.zai.models
    151       effort: ["high"]
    152       prompt_style: ["simple", "detailed"]
    153       language: ["typescript"]
    154       human_language: ["en"]
    155       tool_read: ["on"]
    156       tool_write: ["on"]
    157       tool_edit: ["on"]
    158       tool_glob: ["on"]
    159       tool_grep: ["on"]
    160       linter: ["off"]
    161       playwright: ["off"]
    162       context_file: ["none"]
    163       web_search: ["off"]
    164       max_budget: ["low"]
    165       tests_provided: ["none"]
    166       strategy: ["none"]
    167       design_guidance: ["none"]
    168       architecture: ["none"]
    169       error_checking: ["none"]
    170       context_noise: ["clean"]
    171       renderer: ["none"]
    172       provider: ["zai"]
    173     runs_per_cell: 1  # one run per cell, versus 3 in defaults
    174 
    175   core:
    176     description: "Core comparison -- models and effort levels"
    177     axes:
    178       model: ["haiku-4.5", "sonnet-4.6", "opus-4.6"]
    179       effort: ["high", "max"]  # the exclusions list haiku-4.5 + max, so haiku has no max cell
    180       prompt_style: ["simple", "detailed"]
    181       language: ["typescript"]
    182       human_language: ["en"]
    183       tool_read: ["on"]
    184       tool_write: ["on"]
    185       tool_edit: ["on"]
    186       tool_glob: ["on"]
    187       tool_grep: ["on"]
    188       linter: ["off"]
    189       playwright: ["off"]
    190       context_file: ["none"]
    191       web_search: ["off"]
    192       max_budget: ["high"]
    193       tests_provided: ["none"]
    194       strategy: ["none"]
    195       design_guidance: ["none"]
    196       architecture: ["none"]
    197       error_checking: ["none"]
    198       context_noise: ["clean"]
    199       renderer: ["none"]
    200       provider: ["anthropic"]  # "zai" removed: the exclusions forbid zai with all three models above
    201     runs_per_cell: 3
    202 
    203   all-on:
    204     description: "Everything enabled -- max tooling"
    205     axes:
    206       model: ["haiku-4.5"]
    207       effort: ["high"]
    208       prompt_style: ["simple"]
    209       language: ["typescript"]
    210       human_language: ["en"]
    211       tool_read: ["on"]
    212       tool_write: ["on"]
    213       tool_edit: ["on"]
    214       tool_glob: ["on"]
    215       tool_grep: ["on"]
    216       linter: ["on"]
    217       playwright: ["instructed"]  # must not be "off": the exclusions forbid tests_provided many + playwright off
    218       context_file: ["provided"]
    219       web_search: ["on"]
    220       max_budget: ["high"]
    221       tests_provided: ["many"]
    222       strategy: ["delegate"]
    223       design_guidance: ["specific"]
    224       architecture: ["best_practices"]
    225       error_checking: ["self_verify"]
    226       context_noise: ["clean"]
    227       renderer: ["canvas"]
    228       provider: ["anthropic"]  # "zai" removed: the exclusions forbid zai + haiku-4.5, so it could never yield a cell
    229     runs_per_cell: 3
    230 
    231   all-off:
    232     description: "Everything disabled -- bare minimum (Bash only)"
    233     axes:
    234       model: ["haiku-4.5"]
    235       effort: ["high"]
    236       prompt_style: ["simple"]
    237       language: ["typescript"]
    238       human_language: ["en"]
    239       tool_read: ["off"]  # all five tool_* axes are off in this profile
    240       tool_write: ["off"]
    241       tool_edit: ["off"]
    242       tool_glob: ["off"]
    243       tool_grep: ["off"]
    244       linter: ["off"]
    245       playwright: ["off"]
    246       context_file: ["none"]
    247       web_search: ["off"]
    248       max_budget: ["low"]
    249       tests_provided: ["none"]
    250       strategy: ["none"]
    251       design_guidance: ["none"]
    252       architecture: ["none"]
    253       error_checking: ["none"]
    254       context_noise: ["clean"]
    255       renderer: ["none"]
    256       provider: ["anthropic"]  # "zai" removed: the exclusions forbid zai + haiku-4.5, so it could never yield a cell
    257     runs_per_cell: 3
    258 
    259   full:
    260     description: "Full grid -- all dimensions"
    261     # No axes key here: this profile uses the top-level axes definition
    262     runs_per_cell: 3
	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README