commit f801efc9b7f7880049fdeeeed53d55f0ecae5ecc
parent 2fae566a4db8aae4b68eb9a4ff587a6a8e4245a2
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Thu, 16 Apr 2026 09:54:00 +0200
Add human trial labels for 4 calibration runs
Covers qwen-3.6-plus, haiku-4.5, opus-4.6, glm-5.1 (each with
strat=usub or strat=none). All four are reported playable by the
human tester, but the bot currently scores them near zero because
renderer detection fails (renderer=unknown, grid_detected=false),
so aggregate bot-vs-human agreement drops to 47.6%.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
4 files changed, 87 insertions(+), 88 deletions(-)
diff --git a/tasks/tetris/eval/gameplay-bot/calibration/2763232d.json b/tasks/tetris/eval/gameplay-bot/calibration/2763232d.json
@@ -3,33 +3,34 @@
"short_id": "2763232d",
"label": "Calibration (qwen-3.6-plus, en)",
"notes": "",
- "human_tested_at": "",
+ "human_tested_at": "2026-04-16",
"human_tests": {
- "game_loads": null,
- "game_starts": null,
- "auto_drop": null,
- "move_left": null,
- "move_right": null,
- "move_down": null,
- "rotate": null,
- "hard_drop": null,
+ "game_loads": true,
+ "game_starts": true,
+ "auto_drop": true,
+ "move_left": true,
+ "move_right": true,
+ "move_down": true,
+ "rotate": true,
+ "hard_drop": true,
"all_pieces_rotate": null,
- "piece_locks": null,
- "new_piece_spawns": null,
- "multiple_pieces": null,
- "line_clear": null,
+ "piece_locks": true,
+ "new_piece_spawns": true,
+ "multiple_pieces": true,
+ "line_clear": true,
"score_increases_on_clear": null,
"score_element_visible": null,
- "game_over": null,
- "playable_30s": null,
- "multi_line_clear": null,
+ "game_over": true,
+ "playable_30s": true,
+ "multi_line_clear": true,
"score_scaling": null,
"level_progression": null,
"speed_progression": null,
- "next_piece_preview": null,
- "game_over_display": null,
- "counter_clockwise_rotation": null,
- "soft_drop_distinct": null,
- "rendering_clean": null
+ "next_piece_preview": true,
+ "game_over_display": true,
+ "counter_clockwise_rotation": true,
+ "soft_drop_distinct": true,
+ "rendering_clean": null,
+ "score_changes": true
}
-}
-\ No newline at end of file
+}
diff --git a/tasks/tetris/eval/gameplay-bot/calibration/5ae88633.json b/tasks/tetris/eval/gameplay-bot/calibration/5ae88633.json
@@ -3,33 +3,33 @@
"short_id": "5ae88633",
"label": "Calibration (haiku-4.5, en)",
"notes": "",
- "human_tested_at": "",
+ "human_tested_at": "2026-04-16",
"human_tests": {
- "game_loads": null,
- "game_starts": null,
- "auto_drop": null,
- "move_left": null,
- "move_right": null,
- "move_down": null,
- "rotate": null,
- "hard_drop": null,
- "all_pieces_rotate": null,
- "piece_locks": null,
- "new_piece_spawns": null,
- "multiple_pieces": null,
- "line_clear": null,
+ "game_loads": true,
+ "game_starts": true,
+ "auto_drop": false,
+ "move_left": true,
+ "move_right": true,
+ "move_down": true,
+ "rotate": true,
+ "hard_drop": true,
+ "all_pieces_rotate": true,
+ "piece_locks": true,
+ "new_piece_spawns": true,
+ "multiple_pieces": true,
+ "line_clear": true,
"score_increases_on_clear": null,
"score_element_visible": null,
- "game_over": null,
- "playable_30s": null,
+ "game_over": true,
+ "playable_30s": true,
"multi_line_clear": null,
"score_scaling": null,
"level_progression": null,
"speed_progression": null,
- "next_piece_preview": null,
- "game_over_display": null,
+ "next_piece_preview": true,
+ "game_over_display": true,
"counter_clockwise_rotation": null,
"soft_drop_distinct": null,
"rendering_clean": null
}
-}
-\ No newline at end of file
+}
diff --git a/tasks/tetris/eval/gameplay-bot/calibration/6f157de1.json b/tasks/tetris/eval/gameplay-bot/calibration/6f157de1.json
@@ -3,33 +3,34 @@
"short_id": "6f157de1",
"label": "Calibration (opus-4.6, en)",
"notes": "",
- "human_tested_at": "",
+ "human_tested_at": "2026-04-16",
"human_tests": {
- "game_loads": null,
- "game_starts": null,
- "auto_drop": null,
- "move_left": null,
- "move_right": null,
- "move_down": null,
- "rotate": null,
- "hard_drop": null,
- "all_pieces_rotate": null,
- "piece_locks": null,
- "new_piece_spawns": null,
- "multiple_pieces": null,
- "line_clear": null,
+ "game_loads": true,
+ "game_starts": true,
+ "auto_drop": true,
+ "move_left": true,
+ "move_right": true,
+ "move_down": true,
+ "rotate": true,
+ "hard_drop": true,
+ "all_pieces_rotate": true,
+ "piece_locks": true,
+ "new_piece_spawns": true,
+ "multiple_pieces": true,
+ "line_clear": true,
"score_increases_on_clear": null,
"score_element_visible": null,
- "game_over": null,
- "playable_30s": null,
- "multi_line_clear": null,
+ "game_over": true,
+ "playable_30s": true,
+ "multi_line_clear": true,
"score_scaling": null,
"level_progression": null,
"speed_progression": null,
- "next_piece_preview": null,
- "game_over_display": null,
- "counter_clockwise_rotation": null,
+ "next_piece_preview": true,
+ "game_over_display": true,
+ "counter_clockwise_rotation": true,
"soft_drop_distinct": null,
- "rendering_clean": null
+ "rendering_clean": null,
+ "score_changes": true
}
-}
-\ No newline at end of file
+}
diff --git a/tasks/tetris/eval/gameplay-bot/calibration/7c167ef9.json b/tasks/tetris/eval/gameplay-bot/calibration/7c167ef9.json
@@ -3,33 +3,34 @@
"short_id": "7c167ef9",
"label": "Calibration (glm-5.1, en)",
"notes": "",
- "human_tested_at": "",
+ "human_tested_at": "2026-04-16",
"human_tests": {
- "game_loads": null,
- "game_starts": null,
- "auto_drop": null,
- "move_left": null,
- "move_right": null,
- "move_down": null,
- "rotate": null,
- "hard_drop": null,
- "all_pieces_rotate": null,
- "piece_locks": null,
- "new_piece_spawns": null,
- "multiple_pieces": null,
- "line_clear": null,
+ "game_loads": true,
+ "game_starts": true,
+ "auto_drop": true,
+ "move_left": true,
+ "move_right": true,
+ "move_down": true,
+ "rotate": true,
+ "hard_drop": true,
+ "all_pieces_rotate": true,
+ "piece_locks": true,
+ "new_piece_spawns": true,
+ "multiple_pieces": true,
+ "line_clear": true,
"score_increases_on_clear": null,
"score_element_visible": null,
- "game_over": null,
- "playable_30s": null,
- "multi_line_clear": null,
+ "game_over": true,
+ "playable_30s": true,
+ "multi_line_clear": true,
"score_scaling": null,
"level_progression": null,
"speed_progression": null,
- "next_piece_preview": null,
- "game_over_display": null,
+ "next_piece_preview": true,
+ "game_over_display": true,
"counter_clockwise_rotation": null,
"soft_drop_distinct": null,
- "rendering_clean": null
+ "rendering_clean": null,
+ "score_changes": true
}
-}
-\ No newline at end of file
+}