loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit f801efc9b7f7880049fdeeeed53d55f0ecae5ecc
parent 2fae566a4db8aae4b68eb9a4ff587a6a8e4245a2
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Thu, 16 Apr 2026 09:54:00 +0200

Add human trial labels for 4 calibration runs

Covers qwen-3.6-plus, haiku-4.5, opus-4.6, glm-5.1 (each with
strat=usub or strat=none). All four are reported playable by the
human tester, but the bot currently scores them near zero because
renderer detection fails (renderer=unknown, grid_detected=false),
so aggregate bot-vs-human agreement drops to 47.6%.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M tasks/tetris/eval/gameplay-bot/calibration/2763232d.json | 46 +++++++++++++++++++++++-----------------------
M tasks/tetris/eval/gameplay-bot/calibration/5ae88633.json | 39 +++++++++++++++++++--------------------
M tasks/tetris/eval/gameplay-bot/calibration/6f157de1.json | 46 +++++++++++++++++++++++-----------------------
M tasks/tetris/eval/gameplay-bot/calibration/7c167ef9.json | 44 ++++++++++++++++++++++----------------------
4 files changed, 87 insertions(+), 88 deletions(-)

diff --git a/tasks/tetris/eval/gameplay-bot/calibration/2763232d.json b/tasks/tetris/eval/gameplay-bot/calibration/2763232d.json @@ -3,33 +3,34 @@ "short_id": "2763232d", "label": "Calibration (qwen-3.6-plus, en)", "notes": "", - "human_tested_at": "", + "human_tested_at": "2026-04-16", "human_tests": { - "game_loads": null, - "game_starts": null, - "auto_drop": null, - "move_left": null, - "move_right": null, - "move_down": null, - "rotate": null, - "hard_drop": null, + "game_loads": true, + "game_starts": true, + "auto_drop": true, + "move_left": true, + "move_right": true, + "move_down": true, + "rotate": true, + "hard_drop": true, "all_pieces_rotate": null, - "piece_locks": null, - "new_piece_spawns": null, - "multiple_pieces": null, - "line_clear": null, + "piece_locks": true, + "new_piece_spawns": true, + "multiple_pieces": true, + "line_clear": true, "score_increases_on_clear": null, "score_element_visible": null, - "game_over": null, - "playable_30s": null, - "multi_line_clear": null, + "game_over": true, + "playable_30s": true, + "multi_line_clear": true, "score_scaling": null, "level_progression": null, "speed_progression": null, - "next_piece_preview": null, - "game_over_display": null, - "counter_clockwise_rotation": null, - "soft_drop_distinct": null, - "rendering_clean": null + "next_piece_preview": true, + "game_over_display": true, + "counter_clockwise_rotation": true, + "soft_drop_distinct": true, + "rendering_clean": null, + "score_changes": true } -} -\ No newline at end of file +} diff --git a/tasks/tetris/eval/gameplay-bot/calibration/5ae88633.json b/tasks/tetris/eval/gameplay-bot/calibration/5ae88633.json @@ -3,33 +3,33 @@ "short_id": "5ae88633", "label": "Calibration (haiku-4.5, en)", "notes": "", - "human_tested_at": "", + "human_tested_at": "2026-04-16", "human_tests": { - "game_loads": null, - "game_starts": null, - "auto_drop": null, - "move_left": null, - "move_right": null, - "move_down": null, - "rotate": null, - "hard_drop": null, - 
"all_pieces_rotate": null, - "piece_locks": null, - "new_piece_spawns": null, - "multiple_pieces": null, - "line_clear": null, + "game_loads": true, + "game_starts": true, + "auto_drop": false, + "move_left": true, + "move_right": true, + "move_down": true, + "rotate": true, + "hard_drop": true, + "all_pieces_rotate": true, + "piece_locks": true, + "new_piece_spawns": true, + "multiple_pieces": true, + "line_clear": true, "score_increases_on_clear": null, "score_element_visible": null, - "game_over": null, - "playable_30s": null, + "game_over": true, + "playable_30s": true, "multi_line_clear": null, "score_scaling": null, "level_progression": null, "speed_progression": null, - "next_piece_preview": null, - "game_over_display": null, + "next_piece_preview": true, + "game_over_display": true, "counter_clockwise_rotation": null, "soft_drop_distinct": null, "rendering_clean": null } -} -\ No newline at end of file +} diff --git a/tasks/tetris/eval/gameplay-bot/calibration/6f157de1.json b/tasks/tetris/eval/gameplay-bot/calibration/6f157de1.json @@ -3,33 +3,34 @@ "short_id": "6f157de1", "label": "Calibration (opus-4.6, en)", "notes": "", - "human_tested_at": "", + "human_tested_at": "2026-04-16", "human_tests": { - "game_loads": null, - "game_starts": null, - "auto_drop": null, - "move_left": null, - "move_right": null, - "move_down": null, - "rotate": null, - "hard_drop": null, - "all_pieces_rotate": null, - "piece_locks": null, - "new_piece_spawns": null, - "multiple_pieces": null, - "line_clear": null, + "game_loads": true, + "game_starts": true, + "auto_drop": true, + "move_left": true, + "move_right": true, + "move_down": true, + "rotate": true, + "hard_drop": true, + "all_pieces_rotate": true, + "piece_locks": true, + "new_piece_spawns": true, + "multiple_pieces": true, + "line_clear": true, "score_increases_on_clear": null, "score_element_visible": null, - "game_over": null, - "playable_30s": null, - "multi_line_clear": null, + "game_over": true, + "playable_30s": 
true, + "multi_line_clear": true, "score_scaling": null, "level_progression": null, "speed_progression": null, - "next_piece_preview": null, - "game_over_display": null, - "counter_clockwise_rotation": null, + "next_piece_preview": true, + "game_over_display": true, + "counter_clockwise_rotation": true, "soft_drop_distinct": null, - "rendering_clean": null + "rendering_clean": null, + "score_changes": true } -} -\ No newline at end of file +} diff --git a/tasks/tetris/eval/gameplay-bot/calibration/7c167ef9.json b/tasks/tetris/eval/gameplay-bot/calibration/7c167ef9.json @@ -3,33 +3,34 @@ "short_id": "7c167ef9", "label": "Calibration (glm-5.1, en)", "notes": "", - "human_tested_at": "", + "human_tested_at": "2026-04-16", "human_tests": { - "game_loads": null, - "game_starts": null, - "auto_drop": null, - "move_left": null, - "move_right": null, - "move_down": null, - "rotate": null, - "hard_drop": null, - "all_pieces_rotate": null, - "piece_locks": null, - "new_piece_spawns": null, - "multiple_pieces": null, - "line_clear": null, + "game_loads": true, + "game_starts": true, + "auto_drop": true, + "move_left": true, + "move_right": true, + "move_down": true, + "rotate": true, + "hard_drop": true, + "all_pieces_rotate": true, + "piece_locks": true, + "new_piece_spawns": true, + "multiple_pieces": true, + "line_clear": true, "score_increases_on_clear": null, "score_element_visible": null, - "game_over": null, - "playable_30s": null, - "multi_line_clear": null, + "game_over": true, + "playable_30s": true, + "multi_line_clear": true, "score_scaling": null, "level_progression": null, "speed_progression": null, - "next_piece_preview": null, - "game_over_display": null, + "next_piece_preview": true, + "game_over_display": true, "counter_clockwise_rotation": null, "soft_drop_distinct": null, - "rendering_clean": null + "rendering_clean": null, + "score_changes": true } -} -\ No newline at end of file +}

Impressum · Datenschutz