loop-benchmarking

Controlled experiments across agentic coding configurations: the same task, one variable changed at a time, to see what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit c7d67a0208000a33a1957c4a7c045a94d3bf9427
parent 1cc1aa1c2d533cc2b3737893c84456794cc80568
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 12:42:39 +0200

Re-eval 159 runs (57 haiku, 51 opus, 51 sonnet)

Diffstat:
Mresults/analysis/main_effects_build_quality.json | 292++++++++++++++++++++++++++++++++++++++++----------------------------------------
Mresults/analysis/main_effects_code_quality.json | 298++++++++++++++++++++++++++++++++++++++++----------------------------------------
Mresults/analysis/main_effects_gameplay.json | 304++++++++++++++++++++++++++++++++++++++++----------------------------------------
Mresults/analysis/main_effects_score.json | 308++++++++++++++++++++++++++++++++++++++++----------------------------------------
Mresults/analysis/main_effects_sonarqube.json | 320++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
Mresults/analysis/main_effects_structural.json | 324++++++++++++++++++++++++++++++++++++++++----------------------------------------
Mresults/analysis/main_effects_transcript.json | 206++++++++++++++++++++++++++++++++++++++++----------------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 162++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 26+++++++++++++-------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 152+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 2+-
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 19++++++++++++++-----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 2+-
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 161++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 16++++++++--------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 17+++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 17+++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 17+++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 152+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 2+-
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 17+++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=detailed_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/eval_results.json | 17+++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=detailed_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/gameplay-bot-report.json | 2+-
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=off_tool_glob=off_tool_grep=off_tool_read=off_tool_write=off_web_search=off_run1/eval_results.json | 161++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=off_tool_glob=off_tool_grep=off_tool_read=off_tool_write=off_web_search=off_run1/gameplay-bot-report.json | 30+++++++++++++++---------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=off_tool_glob=off_tool_grep=off_tool_read=off_tool_write=off_web_search=off_run2/eval_results.json | 152+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=off_tool_glob=off_tool_grep=off_tool_read=off_tool_write=off_web_search=off_run2/gameplay-bot-report.json | 2+-
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=off_tool_glob=off_tool_grep=off_tool_read=off_tool_write=off_web_search=off_run3/eval_results.json | 152+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=off_tool_glob=off_tool_grep=off_tool_read=off_tool_write=off_web_search=off_run3/gameplay-bot-report.json | 2+-
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/eval_results.json | 96++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/gameplay-bot-report.json | 73+++++++++++++++++++++++++++++++++++++++++++------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 160++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 85+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 152+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 64+++++++++++++++++++++++++++++++++++++---------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 77++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 54++++++++++++++++++++++++++++++++++--------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 17+++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 220+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 95++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 217++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 74++++++++++++++++++++++++++++++++++++++++++++------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 89++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 66++++++++++++++++++++++++++++++++++++++----------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 217++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 62++++++++++++++++++++++++++++++++++++--------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 220++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 48+++++++++++++++++++++++++++++++-----------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 85++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 62++++++++++++++++++++++++++++++++++++--------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 129+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 106++++++++++++++++++++++++++++++++++++++++++-------------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 152+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 73++++++++++++++++++++++++++++++++++++++++---------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 218++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 62+++++++++++++++++++++++++++++++++++++-------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 162++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 56++++++++++++++++++++++++++++++++++----------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 218++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 58+++++++++++++++++++++++++++++++++++-----------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 217++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 60+++++++++++++++++++++++++++++++++++-------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 217++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 60+++++++++++++++++++++++++++++++++++-------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 85+++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 62+++++++++++++++++++++++++++++++++++++-------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 163++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 55++++++++++++++++++++++++++++++++++---------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 162++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 48++++++++++++++++++++++++++++++------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 160++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 52+++++++++++++++++++++++++++++++---------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 152+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 93+++++++++++++++++++++++++++++++++++++++++--------------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 69+++++++++++++++++++++++++++++++++++++++++++++------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 48++++++++++++++++++++++++++++++------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 143+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 120+++++++++++++++++++++++++++++++++++++++++++------------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 160++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 65+++++++++++++++++++++++++++++++++++++++--------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 161++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 62+++++++++++++++++++++++++++++++++++++-------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 164+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 63++++++++++++++++++++++++++++++++++++++-------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 147+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 120+++++++++++++++++++++++++++++++++++++++++++------------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 159++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 58+++++++++++++++++++++++++++++++++++-----------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 162++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 67+++++++++++++++++++++++++++++++++++++++++--------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 87++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 64+++++++++++++++++++++++++++++++++++++---------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 104+++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 81+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 160++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 52+++++++++++++++++++++++++++++++---------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 161++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 68+++++++++++++++++++++++++++++++++++++++++---------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 235+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 66++++++++++++++++++++++++++++++++++++++++--------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 235+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 78+++++++++++++++++++++++++++++++++++++++++++++---------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run1/eval_results.json | 240+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 92+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run2/eval_results.json | 233+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 69+++++++++++++++++++++++++++++++++++++--------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run3/eval_results.json | 233+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 72++++++++++++++++++++++++++++++++++++++++--------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run1/eval_results.json | 250+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run1/gameplay-bot-report.json | 105+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run2/eval_results.json | 245++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run2/gameplay-bot-report.json | 66++++++++++++++++++++++++++++++++++++++++--------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run3/eval_results.json | 243+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run3/gameplay-bot-report.json | 42++++++++++++++++++++++++++----------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/eval_results.json | 238+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/gameplay-bot-report.json | 69+++++++++++++++++++++++++++++++++++++--------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run2/eval_results.json | 154++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run2/gameplay-bot-report.json | 56++++++++++++++++++++++++++++++++++----------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 232+++++++++++++++++++++++++++++++++----------------------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 241+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 61+++++++++++++++++++++++++++++++++++++------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 245+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 46+++++++++++++++++++++++++++++-----------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 246+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 56++++++++++++++++++++++++++++++++++----------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 244+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 50+++++++++++++++++++++++++++++++-------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 181++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 58++++++++++++++++++++++++++++++++++------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 179+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 51++++++++++++++++++++++++++++++++-------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 34++++++++++++++++++++++++++++------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 98++++++++++++++++++++++++++++++++++++++++---------------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 183++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 46+++++++++++++++++++++++++++++-----------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 203+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 58++++++++++++++++++++++++++++++++++------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 128+++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 61+++++++++++++++++++++++++++++++++++++------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 208++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 58++++++++++++++++++++++++++++++++++++----------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 203+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 60+++++++++++++++++++++++++++++++++++-------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 203+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 58++++++++++++++++++++++++++++++++++------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 205+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 53+++++++++++++++++++++++++++++++++--------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 205+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 57+++++++++++++++++++++++++++++++++++----------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 209++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 31++++++++++++++++---------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 204+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 54+++++++++++++++++++++++++++++++++---------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 206+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 49++++++++++++++++++++++++++++++++-----------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 204+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 54+++++++++++++++++++++++++++++++++---------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 182+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 52++++++++++++++++++++++++++++++++++------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run1/eval_results.json | 204+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 48++++++++++++++++++++++++++++++------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run2/eval_results.json | 203+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 60+++++++++++++++++++++++++++++++++++-------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run3/eval_results.json | 205+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 61+++++++++++++++++++++++++++++++++++++------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run1/eval_results.json | 205+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run1/gameplay-bot-report.json | 59++++++++++++++++++++++++++++++++++++-----------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run2/eval_results.json | 206++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run2/gameplay-bot-report.json | 56++++++++++++++++++++++++++++++++++----------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run3/eval_results.json | 204+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run3/gameplay-bot-report.json | 62++++++++++++++++++++++++++++++++++++--------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/eval_results.json | 129++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/gameplay-bot-report.json | 60+++++++++++++++++++++++++++++++++++-------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run2/eval_results.json | 245+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run2/gameplay-bot-report.json | 49+++++++++++++++++++++++++++++++------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run3/eval_results.json | 178+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run3/gameplay-bot-report.json | 60++++++++++++++++++++++++++++++++++++------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 243+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 62++++++++++++++++++++++++++++++++++++--------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 243+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 62++++++++++++++++++++++++++++++++++++--------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 248+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 60++++++++++++++++++++++++++++++++++++++----------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 33+++++++++++++++++++++++++++------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 244+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 66++++++++++++++++++++++++++++++++++++++----------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 247+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 66++++++++++++++++++++++++++++++++++++++----------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 138++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 98++++++++++++++++++++++++++++++++++++++++---------------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 166++++++++++++++-----------------------------------------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 110++++++++++++++++++++++++++++++++++++++++---------------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 219++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 17+++++++++--------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 236+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 43++++++++++++++++++++++++-------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 181++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 66++++++++++++++++++++++++++++++++++++++----------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 178+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 58+++++++++++++++++++++++++++++++++++-----------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 33+++++++++++++++++++++++++++------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 183++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 51++++++++++++++++++++++++++++++++-------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 203+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 72+++++++++++++++++++++++++++++++++++++++++++-----------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 183++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 49+++++++++++++++++++++++++++++++------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 59++++++++++++++++++++++++++++++++++++++++++++---------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 197+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 67+++++++++++++++++++++++++++++++++++++++++--------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 190++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 59+++++++++++++++++++++++++++++++++++++----------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 162++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 56++++++++++++++++++++++++++++++++++----------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 99+++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 76+++++++++++++++++++++++++++++++++++++++++++++-------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run1/eval_results.json | 161++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 64+++++++++++++++++++++++++++++++++++++++-------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run2/eval_results.json | 161++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 80++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run3/eval_results.json | 160++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 83++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run1/eval_results.json | 17+++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run2/eval_results.json | 161++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run2/gameplay-bot-report.json | 84++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run3/eval_results.json | 141+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run3/gameplay-bot-report.json | 118+++++++++++++++++++++++++++++++++++++++++++------------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/eval_results.json | 165++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/gameplay-bot-report.json | 75+++++++++++++++++++++++++++++++++++++++++++++------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run2/eval_results.json | 118+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run2/gameplay-bot-report.json | 95+++++++++++++++++++++++++++++++++++++++++++------------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run3/eval_results.json | 162++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run3/gameplay-bot-report.json | 92++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 161++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 74++++++++++++++++++++++++++++++++++++++++++++------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 21+++++++++++++++------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 153+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 102+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 152+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 58++++++++++++++++++++++++++++++++++------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 152+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 64+++++++++++++++++++++++++++++++++++++---------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 152+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 74++++++++++++++++++++++++++++++++++++++++----------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 17+++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 17+++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 17+++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 152+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 79++++++++++++++++++++++++++++++++++++++++++-------------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 152+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 52+++++++++++++++++++++++++++++++---------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 21+++++++++++++++------
Mresults/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 152+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 64+++++++++++++++++++++++++++++++++++++---------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 152+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 64+++++++++++++++++++++++++++++++++++++---------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 164++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 71++++++++++++++++++++++++++++++++++++++++++++---------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 17+++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 17+++++++++++++----
Mresults/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 162++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 89++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 148++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 93+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Mresults/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 162++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 96++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
Mresults/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 161++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 113++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
Mresults/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 17+++++++++++++----
Mresults/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 90+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Mresults/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 160++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 73+++++++++++++++++++++++++++++++++++++++++++------------------------------
Mresults/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 161++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 80++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
Mresults/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 152+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 95+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Mresults/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 153+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mresults/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 32++++++++++++++++----------------
Mresults/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 54++++++++++++++++++++++++++----------------------------
Mresults/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 44+++++++++++++++++++++-----------------------
Mresults/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 106++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
Mresults/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 21+++++++++++----------
Mresults/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 21+++++++++++----------
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 73++++++++++++++++++++++++++++++++++++-------------------------------------
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 49++++++++++++++++++++++++-------------------------
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 16++++++++--------
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 16++++++++--------
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 169++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 40++++++++++++++++++++--------------------
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 56++++++++++++++++++++++++++++----------------------------
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 30+++++++++++++++---------------
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 144++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 4++--
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 32++++++++++++++++----------------
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 12++++++------
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 18+++++++++---------
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 8++++----
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 55+++++++++++++++++++++++++++----------------------------
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 35+++++++++++++++++------------------
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 52++++++++++++++++++++++++++--------------------------
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 32++++++++++++++++----------------
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json | 14+++++++-------
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json | 6+++---
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json | 18+++++++++---------
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json | 18+++++++++---------
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json | 8++++----
Mresults/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json | 8++++----
307 files changed, 27096 insertions(+), 6732 deletions(-)

diff --git a/results/analysis/main_effects_build_quality.json b/results/analysis/main_effects_build_quality.json @@ -3,265 +3,265 @@ "values": { "javascript": { "mean": 1.0, - "effect": 0.1164, + "effect": 0.0913, "n": 9 }, "typescript": { - "mean": 0.8899, - "effect": 0.0063, - "n": 105 + "mean": 0.9163, + "effect": 0.0077, + "n": 142 }, "unspecified": { "mean": 0.67, - "effect": -0.2136, + "effect": -0.2387, "n": 8 } }, "spread": 0.33 }, - "tool_edit": { - "values": { - "off": { - "mean": 0.67, - "effect": -0.2136, - "n": 6 - }, - "on": { - "mean": 0.8947, - "effect": 0.011, - "n": 116 - } - }, - "spread": 0.2247 - }, "prompt_style": { "values": { "detailed": { - "mean": 0.67, - "effect": -0.2136, - "n": 5 + "mean": 0.736, + "effect": -0.1727, + "n": 10 }, "simple": { - "mean": 0.8927, - "effect": 0.0091, - "n": 117 + "mean": 0.9203, + "effect": 0.0116, + "n": 149 } }, - "spread": 0.2227 + "spread": 0.1843 }, "model": { "values": { "haiku": { - "mean": 0.8167, - "effect": -0.0669, - "n": 54 + "mean": 0.8205, + "effect": -0.0882, + "n": 57 }, "opus": { - "mean": 0.9633, - "effect": 0.0797, - "n": 27 + "mean": 0.9676, + "effect": 0.059, + "n": 51 }, "sonnet": { - "mean": 0.9193, - "effect": 0.0357, - "n": 41 + "mean": 0.9482, + "effect": 0.0396, + "n": 51 } }, - "spread": 0.1466 + "spread": 0.1471 }, - "tool_glob": { + "claude_version": { "values": { - "off": { - "mean": 0.7512, - "effect": -0.1324, - "n": 8 + "2.1.91 (Claude Code)": { + "mean": 0.8188, + "effect": -0.0899, + "n": 51 }, - "on": { - "mean": 0.8929, - "effect": 0.0093, - "n": 114 + "2.1.92 (Claude Code)": { + "mean": 0.9511, + "effect": 0.0424, + "n": 108 } }, - "spread": 0.1417 + "spread": 0.1323 }, - "sub_agents": { + "tool_edit": { "values": { "off": { - "mean": 0.7525, - "effect": -0.1311, - "n": 8 + "mean": 0.835, + "effect": -0.0737, + "n": 12 }, "on": { - "mean": 0.8928, - "effect": 0.0092, - "n": 114 + "mean": 0.9147, + "effect": 0.006, + "n": 147 } }, - "spread": 0.1403 + "spread": 0.0797 }, 
- "claude_version": { + "tool_read": { "values": { - "2.1.91 (Claude Code)": { - "mean": 0.8144, - "effect": -0.0692, - "n": 48 + "off": { + "mean": 0.835, + "effect": -0.0737, + "n": 12 }, - "2.1.92 (Claude Code)": { - "mean": 0.9285, - "effect": 0.0449, - "n": 74 + "on": { + "mean": 0.9147, + "effect": 0.006, + "n": 147 } }, - "spread": 0.1141 + "spread": 0.0797 }, - "tool_read": { + "sub_agents": { "values": { "off": { - "mean": 0.78, - "effect": -0.1036, - "n": 9 + "mean": 0.8586, + "effect": -0.0501, + "n": 14 }, "on": { - "mean": 0.8919, - "effect": 0.0083, - "n": 113 + "mean": 0.9135, + "effect": 0.0048, + "n": 145 } }, - "spread": 0.1119 + "spread": 0.0549 }, "tool_write": { "values": { "off": { - "mean": 0.8167, - "effect": -0.0669, - "n": 9 + "mean": 0.8625, + "effect": -0.0462, + "n": 12 }, "on": { - "mean": 0.8889, - "effect": 0.0053, - "n": 113 + "mean": 0.9124, + "effect": 0.0038, + "n": 147 } }, - "spread": 0.0722 + "spread": 0.0499 }, "context_file": { "values": { "none": { - "mean": 0.8769, - "effect": -0.0067, - "n": 110 + "mean": 0.9057, + "effect": -0.003, + "n": 147 }, "provided": { "mean": 0.945, - "effect": 0.0614, + "effect": 0.0363, "n": 12 } }, - "spread": 0.0681 + "spread": 0.0393 }, - "effort": { - "values": { - "high": { - "mean": 0.8802, - "effect": -0.0034, - "n": 113 - }, - "max": { - "mean": 0.9267, - "effect": 0.0431, - "n": 9 - } - }, - "spread": 0.0465 - }, - "tool_grep": { + "linter": { "values": { "off": { - "mean": 0.8586, - "effect": -0.025, - "n": 7 + "mean": 0.8821, + "effect": -0.0265, + "n": 14 }, "on": { - "mean": 0.8851, - "effect": 0.0015, - "n": 115 + "mean": 0.9112, + "effect": 0.0026, + "n": 145 } }, - "spread": 0.0265 + "spread": 0.0291 }, "playwright": { "values": { "off": { - "mean": 0.8625, - "effect": -0.0211, - "n": 12 + "mean": 0.8821, + "effect": -0.0265, + "n": 14 }, "on": { - "mean": 0.8859, - "effect": 0.0023, - "n": 110 + "mean": 0.9112, + "effect": 0.0026, + "n": 145 } }, - "spread": 0.0234 + "spread": 
0.0291 }, - "web_search": { + "max_budget": { "values": { - "off": { - "mean": 0.868, - "effect": -0.0156, - "n": 10 + "high": { + "mean": 0.89, + "effect": -0.0187, + "n": 12 }, - "on": { - "mean": 0.885, - "effect": 0.0014, - "n": 112 + "low": { + "mean": 0.9102, + "effect": 0.0015, + "n": 147 } }, - "spread": 0.017 + "spread": 0.0202 }, - "max_budget": { + "tool_grep": { "values": { - "high": { + "off": { "mean": 0.89, - "effect": 0.0064, + "effect": -0.0187, "n": 12 }, - "low": { - "mean": 0.8829, - "effect": -0.0007, - "n": 110 + "on": { + "mean": 0.9102, + "effect": 0.0015, + "n": 147 } }, - "spread": 0.0071 + "spread": 0.0202 }, "human_language": { "values": { "en": { - "mean": 0.8831, - "effect": -0.0005, - "n": 113 + "mean": 0.9098, + "effect": 0.0011, + "n": 150 }, "es": { "mean": 0.89, - "effect": 0.0064, + "effect": -0.0187, "n": 9 } }, - "spread": 0.0069 + "spread": 0.0198 }, - "linter": { + "effort": { + "values": { + "high": { + "mean": 0.9076, + "effect": -0.0011, + "n": 150 + }, + "max": { + "mean": 0.9267, + "effect": 0.018, + "n": 9 + } + }, + "spread": 0.0191 + }, + "web_search": { "values": { "off": { - "mean": 0.8821, - "effect": -0.0015, - "n": 14 + "mean": 0.8985, + "effect": -0.0102, + "n": 13 }, "on": { - "mean": 0.8838, - "effect": 0.0002, - "n": 108 + "mean": 0.9096, + "effect": 0.0009, + "n": 146 + } + }, + "spread": 0.0111 + }, + "tool_glob": { + "values": { + "off": { + "mean": 0.9175, + "effect": 0.0088, + "n": 12 + }, + "on": { + "mean": 0.908, + "effect": -0.0007, + "n": 147 } }, - "spread": 0.0017 + "spread": 0.0095 } } \ No newline at end of file diff --git a/results/analysis/main_effects_code_quality.json b/results/analysis/main_effects_code_quality.json @@ -1,267 +1,267 @@ { - "max_budget": { + "model": { "values": { - "high": { - "mean": 0.365, - "effect": -0.1907, - "n": 12 + "haiku": { + "mean": 0.606, + "effect": -0.1094, + "n": 57 }, - "low": { - "mean": 0.5713, - "effect": 0.0156, - "n": 147 + "opus": { + "mean": 0.8235, 
+ "effect": 0.1082, + "n": 51 + }, + "sonnet": { + "mean": 0.7294, + "effect": 0.0141, + "n": 51 } }, - "spread": 0.2063 + "spread": 0.2175 + }, + "prompt_style": { + "values": { + "detailed": { + "mean": 0.56, + "effect": -0.1553, + "n": 10 + }, + "simple": { + "mean": 0.7258, + "effect": 0.0104, + "n": 149 + } + }, + "spread": 0.1658 }, "language": { "values": { "javascript": { "mean": 0.7389, - "effect": 0.1832, + "effect": 0.0235, "n": 9 }, "typescript": { - "mean": 0.543, - "effect": -0.0127, + "mean": 0.7218, + "effect": 0.0064, "n": 142 }, "unspecified": { "mean": 0.575, - "effect": 0.0193, + "effect": -0.1403, "n": 8 } }, - "spread": 0.1959 - }, - "tool_edit": { - "values": { - "off": { - "mean": 0.7217, - "effect": 0.1659, - "n": 12 - }, - "on": { - "mean": 0.5422, - "effect": -0.0135, - "n": 147 - } - }, - "spread": 0.1795 - }, - "effort": { - "values": { - "high": { - "mean": 0.5471, - "effect": -0.0086, - "n": 150 - }, - "max": { - "mean": 0.6989, - "effect": 0.1432, - "n": 9 - } - }, - "spread": 0.1518 + "spread": 0.1639 }, "claude_version": { "values": { "2.1.91 (Claude Code)": { - "mean": 0.4559, - "effect": -0.0998, + "mean": 0.6094, + "effect": -0.1059, "n": 51 }, "2.1.92 (Claude Code)": { - "mean": 0.6029, - "effect": 0.0471, + "mean": 0.7654, + "effect": 0.05, "n": 108 } }, - "spread": 0.147 + "spread": 0.156 }, - "model": { + "human_language": { "values": { - "haiku": { - "mean": 0.4686, - "effect": -0.0871, - "n": 57 - }, - "opus": { - "mean": 0.6108, - "effect": 0.0551, - "n": 51 + "en": { + "mean": 0.722, + "effect": 0.0067, + "n": 150 }, - "sonnet": { - "mean": 0.598, - "effect": 0.0423, - "n": 51 + "es": { + "mean": 0.6044, + "effect": -0.1109, + "n": 9 } }, - "spread": 0.1422 + "spread": 0.1176 }, - "playwright": { + "effort": { "values": { - "off": { - "mean": 0.4286, - "effect": -0.1272, - "n": 14 + "high": { + "mean": 0.7117, + "effect": -0.0037, + "n": 150 }, - "on": { - "mean": 0.568, - "effect": 0.0123, - "n": 145 + "max": { + 
"mean": 0.7767, + "effect": 0.0613, + "n": 9 } }, - "spread": 0.1394 + "spread": 0.065 }, - "sub_agents": { + "tool_grep": { "values": { "off": { - "mean": 0.6807, - "effect": 0.125, - "n": 14 + "mean": 0.7675, + "effect": 0.0522, + "n": 12 }, "on": { - "mean": 0.5437, - "effect": -0.0121, - "n": 145 + "mean": 0.7111, + "effect": -0.0043, + "n": 147 } }, - "spread": 0.137 + "spread": 0.0564 }, - "tool_grep": { + "tool_read": { "values": { "off": { - "mean": 0.65, - "effect": 0.0943, + "mean": 0.7667, + "effect": 0.0513, "n": 12 }, "on": { - "mean": 0.548, - "effect": -0.0077, + "mean": 0.7112, + "effect": -0.0042, + "n": 147 + } + }, + "spread": 0.0555 + }, + "context_file": { + "values": { + "none": { + "mean": 0.7193, + "effect": 0.0039, "n": 147 + }, + "provided": { + "mean": 0.6675, + "effect": -0.0478, + "n": 12 } }, - "spread": 0.102 + "spread": 0.0518 }, "tool_glob": { "values": { "off": { - "mean": 0.6458, - "effect": 0.0901, + "mean": 0.7625, + "effect": 0.0472, "n": 12 }, "on": { - "mean": 0.5484, - "effect": -0.0074, + "mean": 0.7115, + "effect": -0.0038, "n": 147 } }, - "spread": 0.0974 + "spread": 0.051 }, - "human_language": { + "playwright": { "values": { - "en": { - "mean": 0.5505, - "effect": -0.0053, - "n": 150 + "off": { + "mean": 0.6714, + "effect": -0.0439, + "n": 14 }, - "es": { - "mean": 0.6433, - "effect": 0.0876, - "n": 9 + "on": { + "mean": 0.7196, + "effect": 0.0042, + "n": 145 } }, - "spread": 0.0928 + "spread": 0.0482 }, - "context_file": { + "max_budget": { "values": { - "none": { - "mean": 0.549, - "effect": -0.0067, - "n": 147 - }, - "provided": { - "mean": 0.6383, - "effect": 0.0826, + "high": { + "mean": 0.6775, + "effect": -0.0378, "n": 12 + }, + "low": { + "mean": 0.7184, + "effect": 0.0031, + "n": 147 } }, - "spread": 0.0893 + "spread": 0.0409 }, - "linter": { + "sub_agents": { "values": { "off": { - "mean": 0.4893, - "effect": -0.0664, + "mean": 0.6914, + "effect": -0.0239, "n": 14 }, "on": { - "mean": 0.5621, - "effect": 
0.0064, + "mean": 0.7177, + "effect": 0.0023, "n": 145 } }, - "spread": 0.0728 + "spread": 0.0263 }, - "prompt_style": { + "web_search": { "values": { - "detailed": { - "mean": 0.505, - "effect": -0.0507, - "n": 10 + "off": { + "mean": 0.7246, + "effect": 0.0093, + "n": 13 }, - "simple": { - "mean": 0.5591, - "effect": 0.0034, - "n": 149 + "on": { + "mean": 0.7145, + "effect": -0.0008, + "n": 146 } }, - "spread": 0.0541 + "spread": 0.0101 }, - "tool_read": { + "tool_edit": { "values": { "off": { - "mean": 0.575, - "effect": 0.0193, + "mean": 0.7217, + "effect": 0.0063, "n": 12 }, "on": { - "mean": 0.5541, - "effect": -0.0016, + "mean": 0.7148, + "effect": -0.0005, "n": 147 } }, - "spread": 0.0209 + "spread": 0.0069 }, - "web_search": { + "linter": { "values": { "off": { - "mean": 0.5423, - "effect": -0.0134, - "n": 13 + "mean": 0.7107, + "effect": -0.0046, + "n": 14 }, "on": { - "mean": 0.5569, - "effect": 0.0012, - "n": 146 + "mean": 0.7158, + "effect": 0.0004, + "n": 145 } }, - "spread": 0.0146 + "spread": 0.0051 }, "tool_write": { "values": { "off": { - "mean": 0.5542, - "effect": -0.0016, + "mean": 0.7192, + "effect": 0.0038, "n": 12 }, "on": { - "mean": 0.5559, - "effect": 0.0001, + "mean": 0.715, + "effect": -0.0003, "n": 147 } }, - "spread": 0.0017 + "spread": 0.0042 } } \ No newline at end of file diff --git a/results/analysis/main_effects_gameplay.json b/results/analysis/main_effects_gameplay.json @@ -1,267 +1,267 @@ { - "context_file": { + "language": { "values": { - "none": { - "mean": 0.0802, - "effect": -0.0396, - "n": 147 + "javascript": { + "mean": 0.1956, + "effect": -0.313, + "n": 9 }, - "provided": { - "mean": 0.605, - "effect": 0.4852, - "n": 12 + "typescript": { + "mean": 0.5504, + "effect": 0.0418, + "n": 142 + }, + "unspecified": { + "mean": 0.1187, + "effect": -0.3898, + "n": 8 } }, - "spread": 0.5248 + "spread": 0.4317 }, - "prompt_style": { + "human_language": { "values": { - "detailed": { - "mean": 0.27, - "effect": 0.1502, - "n": 10 + 
"en": { + "mean": 0.5249, + "effect": 0.0163, + "n": 150 }, - "simple": { - "mean": 0.1097, - "effect": -0.0101, - "n": 149 + "es": { + "mean": 0.2367, + "effect": -0.2719, + "n": 9 } }, - "spread": 0.1603 + "spread": 0.2882 }, - "max_budget": { + "tool_grep": { "values": { - "high": { - "mean": 0.25, - "effect": 0.1302, + "off": { + "mean": 0.6842, + "effect": 0.1756, "n": 12 }, - "low": { - "mean": 0.1092, - "effect": -0.0106, + "on": { + "mean": 0.4942, + "effect": -0.0143, "n": 147 } }, - "spread": 0.1408 + "spread": 0.19 }, - "language": { + "model": { "values": { - "javascript": { - "mean": 0.0211, - "effect": -0.0987, - "n": 9 + "haiku": { + "mean": 0.5023, + "effect": -0.0063, + "n": 57 }, - "typescript": { - "mean": 0.1328, - "effect": 0.013, - "n": 142 + "opus": { + "mean": 0.5976, + "effect": 0.0891, + "n": 51 }, - "unspecified": { - "mean": 0.0, - "effect": -0.1198, - "n": 8 + "sonnet": { + "mean": 0.4265, + "effect": -0.0821, + "n": 51 } }, - "spread": 0.1328 + "spread": 0.1711 }, - "tool_edit": { + "max_budget": { "values": { - "off": { - "mean": 0.0, - "effect": -0.1198, + "high": { + "mean": 0.6583, + "effect": 0.1498, "n": 12 }, - "on": { - "mean": 0.1296, - "effect": 0.0098, + "low": { + "mean": 0.4963, + "effect": -0.0122, "n": 147 } }, - "spread": 0.1296 + "spread": 0.162 }, - "tool_read": { + "context_file": { "values": { - "off": { - "mean": 0.0, - "effect": -0.1198, - "n": 12 - }, - "on": { - "mean": 0.1296, - "effect": 0.0098, + "none": { + "mean": 0.4964, + "effect": -0.0122, "n": 147 + }, + "provided": { + "mean": 0.6575, + "effect": 0.1489, + "n": 12 } }, - "spread": 0.1296 + "spread": 0.1611 }, - "tool_write": { + "web_search": { "values": { "off": { - "mean": 0.0, - "effect": -0.1198, - "n": 12 + "mean": 0.4054, + "effect": -0.1032, + "n": 13 }, "on": { - "mean": 0.1296, - "effect": 0.0098, - "n": 147 + "mean": 0.5177, + "effect": 0.0092, + "n": 146 } }, - "spread": 0.1296 + "spread": 0.1123 }, - "human_language": { + "tool_glob": { 
"values": { - "en": { - "mean": 0.127, - "effect": 0.0072, - "n": 150 + "off": { + "mean": 0.6058, + "effect": 0.0973, + "n": 12 }, - "es": { - "mean": 0.0, - "effect": -0.1198, - "n": 9 + "on": { + "mean": 0.5006, + "effect": -0.0079, + "n": 147 } }, - "spread": 0.127 + "spread": 0.1052 }, - "playwright": { + "linter": { "values": { "off": { - "mean": 0.0179, - "effect": -0.102, + "mean": 0.4214, + "effect": -0.0871, "n": 14 }, "on": { - "mean": 0.1297, - "effect": 0.0098, + "mean": 0.517, + "effect": 0.0084, "n": 145 } }, - "spread": 0.1118 + "spread": 0.0956 }, - "effort": { + "tool_read": { "values": { - "high": { - "mean": 0.1145, - "effect": -0.0053, - "n": 150 + "off": { + "mean": 0.4392, + "effect": -0.0694, + "n": 12 }, - "max": { - "mean": 0.2089, - "effect": 0.0891, - "n": 9 + "on": { + "mean": 0.5142, + "effect": 0.0057, + "n": 147 } }, - "spread": 0.0944 + "spread": 0.075 }, - "tool_grep": { + "tool_write": { "values": { "off": { - "mean": 0.0467, - "effect": -0.0731, + "mean": 0.5742, + "effect": 0.0656, "n": 12 }, "on": { - "mean": 0.1258, - "effect": 0.006, + "mean": 0.5032, + "effect": -0.0054, "n": 147 } }, - "spread": 0.0791 + "spread": 0.071 }, - "model": { + "playwright": { "values": { - "haiku": { - "mean": 0.1395, - "effect": 0.0197, - "n": 57 - }, - "opus": { - "mean": 0.0739, - "effect": -0.0459, - "n": 51 + "off": { + "mean": 0.4507, + "effect": -0.0578, + "n": 14 }, - "sonnet": { - "mean": 0.1437, - "effect": 0.0239, - "n": 51 + "on": { + "mean": 0.5141, + "effect": 0.0056, + "n": 145 } }, - "spread": 0.0698 + "spread": 0.0634 }, - "tool_glob": { + "tool_edit": { "values": { "off": { - "mean": 0.0625, - "effect": -0.0573, + "mean": 0.5425, + "effect": 0.0339, "n": 12 }, "on": { - "mean": 0.1245, - "effect": 0.0047, + "mean": 0.5058, + "effect": -0.0028, "n": 147 } }, - "spread": 0.062 + "spread": 0.0367 }, - "sub_agents": { + "effort": { "values": { - "off": { - "mean": 0.0807, - "effect": -0.0391, - "n": 14 + "high": { + "mean": 0.5099, 
+ "effect": 0.0014, + "n": 150 }, - "on": { - "mean": 0.1236, - "effect": 0.0038, - "n": 145 + "max": { + "mean": 0.4856, + "effect": -0.023, + "n": 9 } }, - "spread": 0.0429 + "spread": 0.0243 }, "claude_version": { "values": { "2.1.91 (Claude Code)": { - "mean": 0.1363, - "effect": 0.0165, + "mean": 0.5245, + "effect": 0.016, "n": 51 }, "2.1.92 (Claude Code)": { - "mean": 0.112, - "effect": -0.0078, + "mean": 0.501, + "effect": -0.0075, "n": 108 } }, - "spread": 0.0243 + "spread": 0.0235 }, - "web_search": { + "prompt_style": { "values": { - "off": { - "mean": 0.14, - "effect": 0.0202, - "n": 13 + "detailed": { + "mean": 0.49, + "effect": -0.0186, + "n": 10 }, - "on": { - "mean": 0.118, - "effect": -0.0018, - "n": 146 + "simple": { + "mean": 0.5098, + "effect": 0.0012, + "n": 149 } }, - "spread": 0.022 + "spread": 0.0198 }, - "linter": { + "sub_agents": { "values": { "off": { - "mean": 0.1343, - "effect": 0.0145, + "mean": 0.5093, + "effect": 0.0007, "n": 14 }, "on": { - "mean": 0.1184, - "effect": -0.0014, + "mean": 0.5085, + "effect": -0.0001, "n": 145 } }, - "spread": 0.0159 + "spread": 0.0008 } } \ No newline at end of file diff --git a/results/analysis/main_effects_score.json b/results/analysis/main_effects_score.json @@ -1,267 +1,267 @@ { - "context_file": { - "values": { - "none": { - "mean": 0.0638, - "effect": -0.0408, - "n": 147 - }, - "provided": { - "mean": 0.6046, - "effect": 0.4999, - "n": 12 - } - }, - "spread": 0.5408 - }, - "effort": { + "human_language": { "values": { - "high": { - "mean": 0.0955, - "effect": -0.0092, + "en": { + "mean": 0.5721, + "effect": 0.0092, "n": 150 }, - "max": { - "mean": 0.2572, - "effect": 0.1526, + "es": { + "mean": 0.4089, + "effect": -0.154, "n": 9 } }, - "spread": 0.1617 + "spread": 0.1632 }, - "language": { + "model": { "values": { - "javascript": { - "mean": 0.0211, - "effect": -0.0835, - "n": 9 + "haiku": { + "mean": 0.5382, + "effect": -0.0247, + "n": 57 }, - "typescript": { - "mean": 0.1158, - "effect": 
0.0112, - "n": 142 + "opus": { + "mean": 0.6375, + "effect": 0.0746, + "n": 51 }, - "unspecified": { - "mean": 0.0, - "effect": -0.1047, - "n": 8 + "sonnet": { + "mean": 0.5159, + "effect": -0.047, + "n": 51 } }, - "spread": 0.1158 + "spread": 0.1216 }, - "tool_edit": { + "tool_grep": { "values": { "off": { - "mean": 0.0, - "effect": -0.1047, + "mean": 0.6617, + "effect": 0.0988, "n": 12 }, "on": { - "mean": 0.1132, - "effect": 0.0085, + "mean": 0.5548, + "effect": -0.0081, "n": 147 } }, - "spread": 0.1132 + "spread": 0.1069 }, "tool_read": { "values": { "off": { - "mean": 0.0, - "effect": -0.1047, + "mean": 0.4846, + "effect": -0.0783, "n": 12 }, "on": { - "mean": 0.1132, - "effect": 0.0085, + "mean": 0.5693, + "effect": 0.0064, "n": 147 } }, - "spread": 0.1132 + "spread": 0.0847 }, - "tool_write": { + "context_file": { "values": { - "off": { - "mean": 0.0, - "effect": -0.1047, - "n": 12 - }, - "on": { - "mean": 0.1132, - "effect": 0.0085, + "none": { + "mean": 0.558, + "effect": -0.0049, "n": 147 + }, + "provided": { + "mean": 0.6225, + "effect": 0.0596, + "n": 12 } }, - "spread": 0.1132 + "spread": 0.0645 }, - "max_budget": { + "language": { "values": { - "high": { - "mean": 0.2063, - "effect": 0.1016, - "n": 12 + "javascript": { + "mean": 0.5228, + "effect": -0.0401, + "n": 9 }, - "low": { - "mean": 0.0964, - "effect": -0.0083, - "n": 147 + "typescript": { + "mean": 0.5686, + "effect": 0.0057, + "n": 142 + }, + "unspecified": { + "mean": 0.5069, + "effect": -0.056, + "n": 8 } }, - "spread": 0.1099 + "spread": 0.0617 }, - "playwright": { + "web_search": { "values": { "off": { - "mean": 0.0179, - "effect": -0.0868, - "n": 14 + "mean": 0.5169, + "effect": -0.0459, + "n": 13 }, "on": { - "mean": 0.113, - "effect": 0.0084, - "n": 145 + "mean": 0.567, + "effect": 0.0041, + "n": 146 } }, - "spread": 0.0951 + "spread": 0.0501 }, - "human_language": { + "effort": { "values": { - "en": { - "mean": 0.1092, - "effect": 0.0045, + "high": { + "mean": 0.5654, + "effect": 
0.0025, "n": 150 }, - "es": { - "mean": 0.0289, - "effect": -0.0758, + "max": { + "mean": 0.5206, + "effect": -0.0423, "n": 9 } }, - "spread": 0.0803 + "spread": 0.0448 }, - "tool_glob": { + "linter": { "values": { "off": { - "mean": 0.0312, - "effect": -0.0734, - "n": 12 + "mean": 0.5254, + "effect": -0.0375, + "n": 14 }, "on": { - "mean": 0.1106, - "effect": 0.006, + "mean": 0.5665, + "effect": 0.0036, + "n": 145 + } + }, + "spread": 0.0411 + }, + "max_budget": { + "values": { + "high": { + "mean": 0.6, + "effect": 0.0371, + "n": 12 + }, + "low": { + "mean": 0.5598, + "effect": -0.003, "n": 147 } }, - "spread": 0.0794 + "spread": 0.0402 }, - "prompt_style": { + "claude_version": { "values": { - "detailed": { - "mean": 0.179, - "effect": 0.0743, - "n": 10 + "2.1.91 (Claude Code)": { + "mean": 0.5387, + "effect": -0.0241, + "n": 51 }, - "simple": { - "mean": 0.0997, - "effect": -0.005, - "n": 149 + "2.1.92 (Claude Code)": { + "mean": 0.5743, + "effect": 0.0114, + "n": 108 } }, - "spread": 0.0793 + "spread": 0.0356 }, - "tool_grep": { + "tool_edit": { "values": { "off": { - "mean": 0.0467, - "effect": -0.058, + "mean": 0.5358, + "effect": -0.027, "n": 12 }, "on": { - "mean": 0.1094, - "effect": 0.0047, + "mean": 0.5651, + "effect": 0.0022, "n": 147 } }, - "spread": 0.0627 + "spread": 0.0293 }, - "sub_agents": { + "tool_glob": { "values": { "off": { - "mean": 0.0493, - "effect": -0.0554, - "n": 14 + "mean": 0.5812, + "effect": 0.0184, + "n": 12 }, "on": { - "mean": 0.11, - "effect": 0.0053, - "n": 145 + "mean": 0.5614, + "effect": -0.0015, + "n": 147 } }, - "spread": 0.0607 + "spread": 0.0198 }, - "model": { + "tool_write": { "values": { - "haiku": { - "mean": 0.1214, - "effect": 0.0167, - "n": 57 - }, - "opus": { - "mean": 0.0674, - "effect": -0.0373, - "n": 51 + "off": { + "mean": 0.5783, + "effect": 0.0155, + "n": 12 }, - "sonnet": { - "mean": 0.1232, - "effect": 0.0186, - "n": 51 + "on": { + "mean": 0.5616, + "effect": -0.0013, + "n": 147 } }, - "spread": 0.0558 
+ "spread": 0.0167 }, - "linter": { + "sub_agents": { "values": { "off": { - "mean": 0.0761, - "effect": -0.0286, + "mean": 0.5507, + "effect": -0.0121, "n": 14 }, "on": { - "mean": 0.1074, - "effect": 0.0028, + "mean": 0.564, + "effect": 0.0012, "n": 145 } }, - "spread": 0.0313 + "spread": 0.0133 }, - "claude_version": { + "prompt_style": { "values": { - "2.1.91 (Claude Code)": { - "mean": 0.1234, - "effect": 0.0188, - "n": 51 + "detailed": { + "mean": 0.565, + "effect": 0.0021, + "n": 10 }, - "2.1.92 (Claude Code)": { - "mean": 0.0958, - "effect": -0.0089, - "n": 108 + "simple": { + "mean": 0.5627, + "effect": -0.0001, + "n": 149 } }, - "spread": 0.0276 + "spread": 0.0023 }, - "web_search": { + "playwright": { "values": { "off": { - "mean": 0.1158, - "effect": 0.0111, - "n": 13 + "mean": 0.5646, + "effect": 0.0018, + "n": 14 }, "on": { - "mean": 0.1037, - "effect": -0.001, - "n": 146 + "mean": 0.5627, + "effect": -0.0002, + "n": 145 } }, - "spread": 0.0121 + "spread": 0.0019 } } \ No newline at end of file diff --git a/results/analysis/main_effects_sonarqube.json b/results/analysis/main_effects_sonarqube.json @@ -1,217 +1,267 @@ { - "context_file": { + "language": { "values": { - "none": { - "mean": 0.1089, - "effect": -0.1213, - "n": 37 + "javascript": { + "mean": 0.85, + "effect": 0.2328, + "n": 9 }, - "provided": { - "mean": 0.6042, - "effect": 0.374, - "n": 12 + "typescript": { + "mean": 0.5868, + "effect": -0.0304, + "n": 142 + }, + "unspecified": { + "mean": 0.895, + "effect": 0.2778, + "n": 8 } }, - "spread": 0.4953 + "spread": 0.3082 }, - "effort": { + "model": { "values": { - "high": { - "mean": 0.1939, - "effect": -0.0363, - "n": 44 + "haiku": { + "mean": 0.574, + "effect": -0.0431, + "n": 57 }, - "max": { - "mean": 0.55, - "effect": 0.3198, - "n": 5 + "opus": { + "mean": 0.6773, + "effect": 0.0601, + "n": 51 + }, + "sonnet": { + "mean": 0.6053, + "effect": -0.0119, + "n": 51 } }, - "spread": 0.3561 + "spread": 0.1033 }, - "human_language": { + 
"tool_edit": { "values": { - "en": { - "mean": 0.2242, - "effect": -0.006, - "n": 48 + "off": { + "mean": 0.5292, + "effect": -0.088, + "n": 12 }, - "es": { - "mean": 0.52, - "effect": 0.2898, - "n": 1 + "on": { + "mean": 0.6244, + "effect": 0.0072, + "n": 147 } }, - "spread": 0.2958 + "spread": 0.0952 }, - "linter": { + "claude_version": { "values": { - "off": { - "mean": 0.0, - "effect": -0.2302, - "n": 8 + "2.1.91 (Claude Code)": { + "mean": 0.5529, + "effect": -0.0642, + "n": 51 }, - "on": { - "mean": 0.2751, - "effect": 0.0449, - "n": 41 + "2.1.92 (Claude Code)": { + "mean": 0.6475, + "effect": 0.0303, + "n": 108 } }, - "spread": 0.2751 + "spread": 0.0946 }, - "language": { + "tool_read": { "values": { - "javascript": { - "mean": 0.0, - "effect": -0.2302, - "n": 6 + "off": { + "mean": 0.53, + "effect": -0.0872, + "n": 12 }, - "typescript": { - "mean": 0.2623, - "effect": 0.0321, - "n": 43 + "on": { + "mean": 0.6243, + "effect": 0.0071, + "n": 147 } }, - "spread": 0.2623 + "spread": 0.0943 }, - "prompt_style": { + "max_budget": { "values": { - "detailed": { - "mean": 0.0, - "effect": -0.2302, - "n": 4 + "high": { + "mean": 0.5417, + "effect": -0.0755, + "n": 12 }, - "simple": { - "mean": 0.2507, - "effect": 0.0205, - "n": 45 + "low": { + "mean": 0.6233, + "effect": 0.0062, + "n": 147 } }, - "spread": 0.2507 + "spread": 0.0816 }, "playwright": { "values": { "off": { - "mean": 0.0, - "effect": -0.2302, - "n": 2 + "mean": 0.6786, + "effect": 0.0614, + "n": 14 }, "on": { - "mean": 0.24, - "effect": 0.0098, - "n": 47 + "mean": 0.6112, + "effect": -0.0059, + "n": 145 } }, - "spread": 0.24 + "spread": 0.0674 }, - "sub_agents": { + "tool_glob": { "values": { "off": { - "mean": 0.0, - "effect": -0.2302, - "n": 2 + "mean": 0.5567, + "effect": -0.0605, + "n": 12 }, "on": { - "mean": 0.24, - "effect": 0.0098, - "n": 47 + "mean": 0.6221, + "effect": 0.0049, + "n": 147 } }, - "spread": 0.24 + "spread": 0.0654 }, - "tool_glob": { + "effort": { "values": { - "off": { - "mean": 
0.0, - "effect": -0.2302, - "n": 2 + "high": { + "mean": 0.6209, + "effect": 0.0037, + "n": 150 }, - "on": { - "mean": 0.24, - "effect": 0.0098, - "n": 47 + "max": { + "mean": 0.5556, + "effect": -0.0616, + "n": 9 } }, - "spread": 0.24 + "spread": 0.0653 }, - "web_search": { + "human_language": { "values": { - "off": { - "mean": 0.0, - "effect": -0.2302, - "n": 2 + "en": { + "mean": 0.6193, + "effect": 0.0022, + "n": 150 }, - "on": { - "mean": 0.24, - "effect": 0.0098, - "n": 47 + "es": { + "mean": 0.5811, + "effect": -0.0361, + "n": 9 } }, - "spread": 0.24 + "spread": 0.0382 }, - "tool_edit": { + "tool_write": { "values": { "off": { - "mean": 0.0, - "effect": -0.2302, - "n": 1 + "mean": 0.5825, + "effect": -0.0347, + "n": 12 }, "on": { - "mean": 0.235, - "effect": 0.0048, - "n": 48 + "mean": 0.62, + "effect": 0.0028, + "n": 147 } }, - "spread": 0.235 + "spread": 0.0375 }, - "claude_version": { + "context_file": { "values": { - "2.1.91 (Claude Code)": { - "mean": 0.2692, - "effect": 0.039, - "n": 13 + "none": { + "mean": 0.6196, + "effect": 0.0024, + "n": 147 }, - "2.1.92 (Claude Code)": { - "mean": 0.2161, - "effect": -0.0141, - "n": 36 + "provided": { + "mean": 0.5875, + "effect": -0.0297, + "n": 12 } }, - "spread": 0.0531 + "spread": 0.0321 }, - "max_budget": { + "sub_agents": { "values": { - "high": { - "mean": 0.195, - "effect": -0.0352, + "off": { + "mean": 0.5921, + "effect": -0.025, + "n": 14 + }, + "on": { + "mean": 0.6196, + "effect": 0.0024, + "n": 145 + } + }, + "spread": 0.0275 + }, + "prompt_style": { + "values": { + "detailed": { + "mean": 0.64, + "effect": 0.0228, "n": 10 }, - "low": { - "mean": 0.2392, - "effect": 0.009, - "n": 39 + "simple": { + "mean": 0.6156, + "effect": -0.0015, + "n": 149 } }, - "spread": 0.0442 + "spread": 0.0244 }, - "model": { + "tool_grep": { "values": { - "haiku": { - "mean": 0.2188, - "effect": -0.0115, - "n": 16 + "off": { + "mean": 0.6392, + "effect": 0.022, + "n": 12 }, - "opus": { - "mean": 0.2214, - "effect": 
-0.0088, + "on": { + "mean": 0.6154, + "effect": -0.0018, + "n": 147 + } + }, + "spread": 0.0238 + }, + "linter": { + "values": { + "off": { + "mean": 0.6293, + "effect": 0.0121, "n": 14 }, - "sonnet": { - "mean": 0.2463, - "effect": 0.0161, - "n": 19 + "on": { + "mean": 0.616, + "effect": -0.0012, + "n": 145 } }, - "spread": 0.0275 + "spread": 0.0133 + }, + "web_search": { + "values": { + "off": { + "mean": 0.6285, + "effect": 0.0113, + "n": 13 + }, + "on": { + "mean": 0.6162, + "effect": -0.001, + "n": 146 + } + }, + "spread": 0.0123 } } \ No newline at end of file diff --git a/results/analysis/main_effects_structural.json b/results/analysis/main_effects_structural.json @@ -1,267 +1,267 @@ { - "tool_read": { - "values": { - "off": { - "mean": 0.6944, - "effect": -0.1649, - "n": 9 - }, - "on": { - "mean": 0.8724, - "effect": 0.013, - "n": 114 - } - }, - "spread": 0.178 - }, "model": { "values": { "haiku": { - "mean": 0.7798, - "effect": -0.0795, - "n": 51 + "mean": 0.7986, + "effect": -0.0895, + "n": 57 }, "opus": { - "mean": 0.9507, - "effect": 0.0914, - "n": 27 + "mean": 0.9739, + "effect": 0.0859, + "n": 51 }, "sonnet": { - "mean": 0.8947, - "effect": 0.0353, - "n": 45 + "mean": 0.9022, + "effect": 0.0141, + "n": 51 } }, - "spread": 0.1709 + "spread": 0.1753 }, - "tool_edit": { + "prompt_style": { + "values": { + "detailed": { + "mean": 0.759, + "effect": -0.1291, + "n": 10 + }, + "simple": { + "mean": 0.8967, + "effect": 0.0087, + "n": 149 + } + }, + "spread": 0.1377 + }, + "tool_read": { "values": { "off": { - "mean": 0.7188, - "effect": -0.1406, - "n": 8 + "mean": 0.7708, + "effect": -0.1172, + "n": 12 }, "on": { - "mean": 0.8691, - "effect": 0.0098, - "n": 115 + "mean": 0.8976, + "effect": 0.0096, + "n": 147 } }, - "spread": 0.1503 + "spread": 0.1268 }, "claude_version": { "values": { "2.1.91 (Claude Code)": { - "mean": 0.7836, - "effect": -0.0758, - "n": 45 + "mean": 0.8041, + "effect": -0.0839, + "n": 51 }, "2.1.92 (Claude Code)": { - "mean": 0.9031, - 
"effect": 0.0437, - "n": 78 + "mean": 0.9277, + "effect": 0.0396, + "n": 108 } }, - "spread": 0.1195 + "spread": 0.1236 }, "tool_grep": { "values": { "off": { - "mean": 0.75, - "effect": -0.1093, - "n": 9 + "mean": 0.7917, + "effect": -0.0964, + "n": 12 }, "on": { - "mean": 0.868, - "effect": 0.0086, - "n": 114 + "mean": 0.8959, + "effect": 0.0079, + "n": 147 } }, - "spread": 0.118 + "spread": 0.1042 }, - "tool_glob": { + "language": { + "values": { + "javascript": { + "mean": 0.89, + "effect": 0.0019, + "n": 9 + }, + "typescript": { + "mean": 0.8932, + "effect": 0.0052, + "n": 142 + }, + "unspecified": { + "mean": 0.7938, + "effect": -0.0943, + "n": 8 + } + }, + "spread": 0.0994 + }, + "tool_edit": { "values": { "off": { - "mean": 0.75, - "effect": -0.1093, - "n": 7 + "mean": 0.8125, + "effect": -0.0756, + "n": 12 }, "on": { - "mean": 0.8659, - "effect": 0.0066, - "n": 116 + "mean": 0.8942, + "effect": 0.0062, + "n": 147 } }, - "spread": 0.1159 + "spread": 0.0817 }, - "prompt_style": { + "linter": { "values": { - "detailed": { - "mean": 0.759, - "effect": -0.1003, - "n": 10 + "off": { + "mean": 0.8157, + "effect": -0.0723, + "n": 14 }, - "simple": { - "mean": 0.8682, - "effect": 0.0089, - "n": 113 + "on": { + "mean": 0.895, + "effect": 0.007, + "n": 145 } }, - "spread": 0.1092 + "spread": 0.0793 }, "playwright": { "values": { "off": { - "mean": 0.767, - "effect": -0.0923, - "n": 10 + "mean": 0.8336, + "effect": -0.0545, + "n": 14 }, "on": { - "mean": 0.8675, - "effect": 0.0082, - "n": 113 + "mean": 0.8933, + "effect": 0.0053, + "n": 145 } }, - "spread": 0.1005 + "spread": 0.0597 }, - "language": { + "human_language": { "values": { - "javascript": { - "mean": 0.89, - "effect": 0.0307, - "n": 9 - }, - "typescript": { - "mean": 0.8617, - "effect": 0.0023, - "n": 106 + "en": { + "mean": 0.8913, + "effect": 0.0033, + "n": 150 }, - "unspecified": { - "mean": 0.7938, - "effect": -0.0656, - "n": 8 + "es": { + "mean": 0.8333, + "effect": -0.0547, + "n": 9 } }, - "spread": 
0.0962 + "spread": 0.058 }, - "tool_write": { + "sub_agents": { "values": { "off": { - "mean": 0.7917, - "effect": -0.0677, - "n": 6 + "mean": 0.8514, + "effect": -0.0366, + "n": 14 }, "on": { - "mean": 0.8628, + "mean": 0.8916, "effect": 0.0035, - "n": 117 + "n": 145 } }, - "spread": 0.0711 + "spread": 0.0402 }, - "effort": { + "max_budget": { "values": { "high": { - "mean": 0.8548, - "effect": -0.0045, - "n": 114 + "mean": 0.8542, + "effect": -0.0339, + "n": 12 }, - "max": { - "mean": 0.9167, - "effect": 0.0573, - "n": 9 + "low": { + "mean": 0.8908, + "effect": 0.0028, + "n": 147 } }, - "spread": 0.0619 + "spread": 0.0366 }, - "sub_agents": { + "tool_glob": { "values": { "off": { - "mean": 0.8109, - "effect": -0.0484, - "n": 11 + "mean": 0.8542, + "effect": -0.0339, + "n": 12 }, "on": { - "mean": 0.8641, - "effect": 0.0048, - "n": 112 + "mean": 0.8908, + "effect": 0.0028, + "n": 147 } }, - "spread": 0.0532 + "spread": 0.0366 }, - "linter": { + "web_search": { "values": { "off": { - "mean": 0.8157, - "effect": -0.0436, - "n": 14 + "mean": 0.8592, + "effect": -0.0288, + "n": 13 }, "on": { - "mean": 0.865, - "effect": 0.0056, - "n": 109 + "mean": 0.8906, + "effect": 0.0026, + "n": 146 } }, - "spread": 0.0493 + "spread": 0.0314 }, - "web_search": { + "effort": { + "values": { + "high": { + "mean": 0.8863, + "effect": -0.0017, + "n": 150 + }, + "max": { + "mean": 0.9167, + "effect": 0.0286, + "n": 9 + } + }, + "spread": 0.0304 + }, + "tool_write": { "values": { "off": { - "mean": 0.817, - "effect": -0.0423, - "n": 10 + "mean": 0.875, + "effect": -0.0131, + "n": 12 }, "on": { - "mean": 0.8631, - "effect": 0.0037, - "n": 113 + "mean": 0.8891, + "effect": 0.0011, + "n": 147 } }, - "spread": 0.0461 + "spread": 0.0141 }, "context_file": { "values": { "none": { - "mean": 0.8554, - "effect": -0.0039, - "n": 111 + "mean": 0.8874, + "effect": -0.0006, + "n": 147 }, "provided": { "mean": 0.8958, - "effect": 0.0365, + "effect": 0.0078, "n": 12 } }, - "spread": 0.0404 - }, - 
"human_language": { - "values": { - "en": { - "mean": 0.8614, - "effect": 0.0021, - "n": 114 - }, - "es": { - "mean": 0.8333, - "effect": -0.026, - "n": 9 - } - }, - "spread": 0.0281 - }, - "max_budget": { - "values": { - "high": { - "mean": 0.8542, - "effect": -0.0052, - "n": 12 - }, - "low": { - "mean": 0.8599, - "effect": 0.0006, - "n": 111 - } - }, - "spread": 0.0057 + "spread": 0.0084 } } \ No newline at end of file diff --git a/results/analysis/main_effects_transcript.json b/results/analysis/main_effects_transcript.json @@ -2,266 +2,266 @@ "model": { "values": { "haiku": { - "mean": 0.7368, - "effect": -0.1638, + "mean": 0.7895, + "effect": -0.13, "n": 57 }, "opus": { "mean": 1.0, - "effect": 0.0994, + "effect": 0.0805, "n": 51 }, "sonnet": { "mean": 0.9843, - "effect": 0.0837, + "effect": 0.0648, "n": 51 } }, - "spread": 0.2632 + "spread": 0.2105 }, "claude_version": { "values": { "2.1.91 (Claude Code)": { - "mean": 0.7255, - "effect": -0.1751, + "mean": 0.7843, + "effect": -0.1352, "n": 51 }, "2.1.92 (Claude Code)": { "mean": 0.9833, - "effect": 0.0827, + "effect": 0.0638, "n": 108 } }, - "spread": 0.2578 - }, - "tool_read": { - "values": { - "off": { - "mean": 0.6875, - "effect": -0.2131, - "n": 12 - }, - "on": { - "mean": 0.918, - "effect": 0.0174, - "n": 147 - } - }, - "spread": 0.2305 + "spread": 0.199 }, "language": { "values": { "javascript": { "mean": 1.0, - "effect": 0.0994, + "effect": 0.0805, "n": 9 }, "typescript": { - "mean": 0.8905, - "effect": -0.0101, + "mean": 0.9116, + "effect": -0.0079, "n": 142 }, "unspecified": { "mean": 0.9688, - "effect": 0.0681, + "effect": 0.0493, "n": 8 } }, - "spread": 0.1095 + "spread": 0.0884 }, "playwright": { "values": { "off": { "mean": 0.8643, - "effect": -0.0363, + "effect": -0.0552, "n": 14 }, "on": { - "mean": 0.9041, - "effect": 0.0035, + "mean": 0.9248, + "effect": 0.0053, "n": 145 } }, - "spread": 0.0398 + "spread": 0.0605 }, "linter": { "values": { "off": { "mean": 0.8679, - "effect": -0.0328, + 
"effect": -0.0516, "n": 14 }, "on": { - "mean": 0.9038, - "effect": 0.0032, + "mean": 0.9245, + "effect": 0.005, "n": 145 } }, - "spread": 0.0359 + "spread": 0.0566 + }, + "tool_read": { + "values": { + "off": { + "mean": 0.875, + "effect": -0.0445, + "n": 12 + }, + "on": { + "mean": 0.9231, + "effect": 0.0036, + "n": 147 + } + }, + "spread": 0.0481 }, "max_budget": { "values": { "high": { "mean": 0.8792, - "effect": -0.0215, + "effect": -0.0403, "n": 12 }, "low": { - "mean": 0.9024, - "effect": 0.0018, + "mean": 0.9228, + "effect": 0.0033, "n": 147 } }, - "spread": 0.0232 + "spread": 0.0436 }, "tool_edit": { "values": { "off": { "mean": 0.8792, - "effect": -0.0215, + "effect": -0.0403, "n": 12 }, "on": { - "mean": 0.9024, - "effect": 0.0018, + "mean": 0.9228, + "effect": 0.0033, "n": 147 } }, - "spread": 0.0232 + "spread": 0.0436 }, "tool_write": { "values": { "off": { "mean": 0.8792, - "effect": -0.0215, + "effect": -0.0403, "n": 12 }, "on": { - "mean": 0.9024, - "effect": 0.0018, + "mean": 0.9228, + "effect": 0.0033, "n": 147 } }, - "spread": 0.0232 - }, - "human_language": { - "values": { - "en": { - "mean": 0.8993, - "effect": -0.0013, - "n": 150 - }, - "es": { - "mean": 0.9222, - "effect": 0.0216, - "n": 9 - } - }, - "spread": 0.0229 + "spread": 0.0436 }, "sub_agents": { "values": { "off": { "mean": 0.8821, - "effect": -0.0185, + "effect": -0.0374, "n": 14 }, "on": { - "mean": 0.9024, - "effect": 0.0018, + "mean": 0.9231, + "effect": 0.0036, "n": 145 } }, - "spread": 0.0203 + "spread": 0.041 }, "tool_glob": { "values": { "off": { "mean": 0.8833, - "effect": -0.0173, + "effect": -0.0362, "n": 12 }, "on": { - "mean": 0.902, - "effect": 0.0014, + "mean": 0.9224, + "effect": 0.003, "n": 147 } }, - "spread": 0.0187 - }, - "effort": { - "values": { - "high": { - "mean": 0.8997, - "effect": -0.001, - "n": 150 - }, - "max": { - "mean": 0.9167, - "effect": 0.016, - "n": 9 - } - }, - "spread": 0.017 + "spread": 0.0391 }, "tool_grep": { "values": { "off": { "mean": 
0.8875, - "effect": -0.0131, + "effect": -0.032, "n": 12 }, "on": { - "mean": 0.9017, - "effect": 0.0011, + "mean": 0.9221, + "effect": 0.0026, "n": 147 } }, - "spread": 0.0142 + "spread": 0.0346 }, "context_file": { "values": { "none": { - "mean": 0.9014, - "effect": 0.0007, + "mean": 0.9218, + "effect": 0.0023, "n": 147 }, "provided": { "mean": 0.8917, - "effect": -0.009, + "effect": -0.0278, "n": 12 } }, - "spread": 0.0097 + "spread": 0.0301 }, "web_search": { "values": { "off": { "mean": 0.8923, - "effect": -0.0083, + "effect": -0.0272, "n": 13 }, "on": { - "mean": 0.9014, - "effect": 0.0007, + "mean": 0.9219, + "effect": 0.0024, "n": 146 } }, - "spread": 0.0091 + "spread": 0.0296 }, "prompt_style": { "values": { "detailed": { "mean": 0.895, - "effect": -0.0056, + "effect": -0.0245, "n": 10 }, "simple": { - "mean": 0.901, - "effect": 0.0004, + "mean": 0.9211, + "effect": 0.0016, "n": 149 } }, - "spread": 0.006 + "spread": 0.0261 + }, + "effort": { + "values": { + "high": { + "mean": 0.9197, + "effect": 0.0002, + "n": 150 + }, + "max": { + "mean": 0.9167, + "effect": -0.0028, + "n": 9 + } + }, + "spread": 0.003 + }, + "human_language": { + "values": { + "en": { + "mean": 0.9193, + "effect": -0.0002, + "n": 150 + }, + "es": { + "mean": 0.9222, + "effect": 0.0027, + "n": 9 + } + }, + "spread": 0.0029 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -117,13 +117,163 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - 
"error": "Gameplay bot timed out after 180 seconds" + "score": 0.88, + "total": 16, + "passed": 14, + "failed": 2, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "4 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": "score_changes", + 
"pass": false, + "detail": "score stayed at 196" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 39 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 14, + "failed": 2, + "score": 0.88 + }, + "gameplay": { + "pieces_placed": 39, + "lines_cleared": 0, + "max_score_observed": 196, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 304, + "events_count": 10, + "pieces_spawned": 4, + "pieces_locked": 11, + "lines_cleared": 0, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 26 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.825, + "score": 0.825, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 1, + "cognitive_complexity": 90, + "lines_of_code": 591, + "duplication_pct": 0.0, + "tech_debt_minutes": 9, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.77 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -73,7 +73,7 @@ { "name": "new_piece_spawns", "pass": true, - "detail": "2 new piece(s) 
detected at top of grid" + "detail": "4 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", @@ -82,13 +82,13 @@ }, { "name": "line_clear", - "pass": true, - "detail": "1 line(s) cleared (grid-verified)" + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "score stayed at 250" + "detail": "score stayed at 196" }, { "name": "game_over", @@ -103,30 +103,30 @@ ], "summary": { "total": 16, - "passed": 15, - "failed": 1, - "score": 0.94 + "passed": 14, + "failed": 2, + "score": 0.88 }, "gameplay": { "pieces_placed": 39, - "lines_cleared": 1, - "max_score_observed": 250, + "lines_cleared": 0, + "max_score_observed": 196, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 305, + "frames": 304, "events_count": 10, - "pieces_spawned": 2, + "pieces_spawned": 4, "pieces_locked": 11, - "lines_cleared": 1, + "lines_cleared": 0, "piece_types_seen": [ "unknown" ], "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 30 + "load_time_ms": 26 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -117,13 +117,153 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.19, + "total": 16, + "passed": 3, + "failed": 13, + "report": { + "implementation": { + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, + 
"controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify auto-drop" + }, + { + "name": "move_left", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_right", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_down", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "rotate", + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify hard drop" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": "score_changes", + "pass": false, + "detail": "no score element found" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": false, + "detail": "only played for 7s" + } + ], + "summary": { + 
"total": 16, + "passed": 3, + "failed": 13, + "score": 0.19 + }, + "gameplay": { + "pieces_placed": 26, + "lines_cleared": 0, + "max_score_observed": 0, + "play_duration_seconds": 7, + "errors_during_play": 0 + }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, + "performance": { + "load_time_ms": 20 + }, + "accessibility": { + "issues": [], + "issue_count": 0, + "pass": true + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.565, + "score": 0.565, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 2, + "cognitive_complexity": 0, + "lines_of_code": 604, + "duplication_pct": 0.0, + "tech_debt_minutes": 23, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.94 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -119,7 +119,7 @@ "grid_read_success_rate": 0 }, "performance": { - "load_time_ms": 47 + "load_time_ms": 20 }, "accessibility": { "issues": [], diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -242,7 +242,7 @@ "grid_read_success_rate": 0 }, "performance": { - "load_time_ms": 42 + "load_time_ms": 25 }, "accessibility": { "issues": [], @@ -251,10 +251,19 @@ } } }, - "outcome_score": 0.19, - "score": 0.19, + "outcome_score": 0.565, + "score": 0.565, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 2, + "cognitive_complexity": 0, + "lines_of_code": 604, + "duplication_pct": 0.0, + "tech_debt_minutes": 23, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.94 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -119,7 +119,7 @@ "grid_read_success_rate": 0 }, "performance": { - "load_time_ms": 42 + "load_time_ms": 25 }, "accessibility": { "issues": [], diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -117,13 +117,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 50, + "height": 100 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + 
"detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 168" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 168, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 347, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 262 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.48, + "score": 0.48, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 7, + "cognitive_complexity": 137, + "lines_of_code": 867, + "duplication_pct": 0.0, + "tech_debt_minutes": 56, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.65 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -87,8 +87,8 @@ }, { "name": "score_changes", - "pass": true, - "detail": "score changed from 194 to 222" + "pass": false, + "detail": "score stayed at 168" }, { "name": "game_over", @@ -103,19 +103,19 @@ ], "summary": { "total": 16, - "passed": 6, - "failed": 10, - "score": 0.38 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { "pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 222, + "max_score_observed": 168, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 349, + "frames": 347, "events_count": 2, "pieces_spawned": 0, "pieces_locked": 10, @@ -124,7 +124,7 @@ "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 50 + "load_time_ms": 262 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -120,10 +120,19 @@ "score": 0, "error": "Gameplay bot timed out after 180 seconds" }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.47, + "score": 0.47, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 2, + "cognitive_complexity": 0, + "lines_of_code": 651, + "duplication_pct": 0.0, + 
"tech_debt_minutes": 10, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.94 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -120,10 +120,19 @@ "score": 0, "error": "Gameplay bot timed out after 180 seconds" }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.365, + "score": 0.365, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 4, + "cognitive_complexity": 101, + "lines_of_code": 655, + "duplication_pct": 3.4, + "tech_debt_minutes": 34, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.73 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -120,10 +120,19 @@ "score": 0, "error": "Gameplay bot timed out after 180 seconds" }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.47, + "score": 0.47, "sonarqube": { - "error": 
"SonarQube scan timed out", - "score": 0 + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 2, + "cognitive_complexity": 0, + "lines_of_code": 486, + "duplication_pct": 0.0, + "tech_debt_minutes": 16, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.94 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -117,13 +117,153 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.19, + "total": 16, + "passed": 3, + "failed": 13, + "report": { + "implementation": { + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify auto-drop" + }, + { + "name": "move_left", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_right", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_down", + "pass": false, + "detail": "grid 
reader unreliable, cannot verify movement" + }, + { + "name": "rotate", + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify hard drop" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": "score_changes", + "pass": false, + "detail": "no score element found" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": false, + "detail": "only played for 7s" + } + ], + "summary": { + "total": 16, + "passed": 3, + "failed": 13, + "score": 0.19 + }, + "gameplay": { + "pieces_placed": 26, + "lines_cleared": 0, + "max_score_observed": 0, + "play_duration_seconds": 7, + "errors_during_play": 0 + }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, + "performance": { + "load_time_ms": 61 + }, + "accessibility": { + "issues": [], + "issue_count": 0, + "pass": true + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.495, + "score": 0.495, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 9, + "cognitive_complexity": 0, + "lines_of_code": 787, + "duplication_pct": 0.0, 
+ "tech_debt_minutes": 232, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.8 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -119,7 +119,7 @@ "grid_read_success_rate": 0 }, "performance": { - "load_time_ms": 20 + "load_time_ms": 61 }, "accessibility": { "issues": [], diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=javascript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -120,10 +120,19 @@ "score": 0, "error": "Gameplay bot timed out after 180 seconds" }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.47, + "score": 0.47, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 2, + "cognitive_complexity": 0, + "lines_of_code": 486, + "duplication_pct": 0.0, + "tech_debt_minutes": 16, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.94 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=detailed_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=detailed_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/eval_results.json @@ -123,10 +123,19 @@ "score": 0, "error": "Gameplay bot timed out after 180 seconds" }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.47, + "score": 0.47, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 2, + "cognitive_complexity": 0, + "lines_of_code": 840, + "duplication_pct": 0.0, + "tech_debt_minutes": 13, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.94 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=detailed_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=detailed_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/gameplay-bot-report.json @@ -119,7 +119,7 @@ "grid_read_success_rate": 0 }, "performance": { - "load_time_ms": 59 + "load_time_ms": 72 }, "accessibility": { "issues": [], diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=off_tool_glob=off_tool_grep=off_tool_read=off_tool_write=off_web_search=off_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=off_tool_glob=off_tool_grep=off_tool_read=off_tool_write=off_web_search=off_run1/eval_results.json @@ -124,13 +124,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.69, + "total": 16, + "passed": 11, + "failed": 5, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": 
"hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "2 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 152" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 35 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 11, + "failed": 5, + "score": 0.69 + }, + "gameplay": { + "pieces_placed": 35, + "lines_cleared": 1, + "max_score_observed": 152, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 313, + "events_count": 7, + "pieces_spawned": 2, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 72 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role" + ], + "issue_count": 1, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.62, + "score": 0.62, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 15, + "cognitive_complexity": 168, + "lines_of_code": 785, + "duplication_pct": 43.1, + "tech_debt_minutes": 73, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.55 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=off_tool_glob=off_tool_grep=off_tool_read=off_tool_write=off_web_search=off_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=off_tool_glob=off_tool_grep=off_tool_read=off_tool_write=off_web_search=off_run1/gameplay-bot-report.json @@ -12,7 +12,7 @@ "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "ArrowUp", + "rotate": "x", "drop": "Space" }, "start_mechanism": "auto", @@ -73,7 +73,7 @@ { "name": "new_piece_spawns", "pass": true, - "detail": "1 new piece(s) detected at top of grid" + "detail": "2 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", @@ -88,45 +88,45 @@ { "name": "score_changes", "pass": false, - "detail": "score stayed at 155" + "detail": "score stayed at 152" }, { "name": "game_over", - "pass": false, - "detail": "could not trigger or detect game over" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 33 pieces, no crashes" + "detail": "played for 30s, placed 35 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 10, - "failed": 6, - "score": 0.63 + "passed": 11, + "failed": 5, + "score": 0.69 }, "gameplay": { - "pieces_placed": 33, + "pieces_placed": 35, "lines_cleared": 1, - "max_score_observed": 155, + "max_score_observed": 152, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 319, + "frames": 313, "events_count": 7, - "pieces_spawned": 1, + "pieces_spawned": 2, "pieces_locked": 11, "lines_cleared": 1, "piece_types_seen": [ - "I" + "unknown" ], "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 204 + 
"load_time_ms": 72 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=off_tool_glob=off_tool_grep=off_tool_read=off_tool_write=off_web_search=off_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=off_tool_glob=off_tool_grep=off_tool_read=off_tool_write=off_web_search=off_run2/eval_results.json @@ -124,13 +124,153 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.19, + "total": 16, + "passed": 3, + "failed": 13, + "report": { + "implementation": { + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify auto-drop" + }, + { + "name": "move_left", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_right", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_down", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "rotate", + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece 
rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify hard drop" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": "score_changes", + "pass": false, + "detail": "no score element found" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": false, + "detail": "only played for 7s" + } + ], + "summary": { + "total": 16, + "passed": 3, + "failed": 13, + "score": 0.19 + }, + "gameplay": { + "pieces_placed": 26, + "lines_cleared": 0, + "max_score_observed": 0, + "play_duration_seconds": 7, + "errors_during_play": 0 + }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, + "performance": { + "load_time_ms": 46 + }, + "accessibility": { + "issues": [], + "issue_count": 0, + "pass": true + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.37, + "score": 0.37, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 9, + "cognitive_complexity": 160, + "lines_of_code": 1029, + "duplication_pct": 39.4, + "tech_debt_minutes": 55, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.55 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=off_tool_glob=off_tool_grep=off_tool_read=off_tool_write=off_web_search=off_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=off_tool_glob=off_tool_grep=off_tool_read=off_tool_write=off_web_search=off_run2/gameplay-bot-report.json @@ -119,7 +119,7 @@ "grid_read_success_rate": 0 }, "performance": { - "load_time_ms": 20 + "load_time_ms": 46 }, "accessibility": { "issues": [], diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=off_tool_glob=off_tool_grep=off_tool_read=off_tool_write=off_web_search=off_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=off_tool_glob=off_tool_grep=off_tool_read=off_tool_write=off_web_search=off_run3/eval_results.json @@ -124,13 +124,153 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.19, + "total": 16, + "passed": 3, + "failed": 13, + "report": { + "implementation": { + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": 
"started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify auto-drop" + }, + { + "name": "move_left", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_right", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_down", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "rotate", + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify hard drop" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": "score_changes", + "pass": false, + "detail": "no score element found" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": false, + "detail": "only played for 7s" + } + ], + "summary": { + "total": 16, + "passed": 3, + "failed": 13, + "score": 0.19 + }, + "gameplay": { + "pieces_placed": 26, + "lines_cleared": 0, + "max_score_observed": 0, + "play_duration_seconds": 7, + "errors_during_play": 0 + }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + 
"grid_read_success_rate": 0 + }, + "performance": { + "load_time_ms": 40 + }, + "accessibility": { + "issues": [], + "issue_count": 0, + "pass": true + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.37, + "score": 0.37, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 9, + "cognitive_complexity": 160, + "lines_of_code": 1029, + "duplication_pct": 39.4, + "tech_debt_minutes": 55, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.55 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=off_tool_glob=off_tool_grep=off_tool_read=off_tool_write=off_web_search=off_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=off_tool_glob=off_tool_grep=off_tool_read=off_tool_write=off_web_search=off_run3/gameplay-bot-report.json @@ -119,7 +119,7 @@ "grid_read_success_rate": 0 }, "performance": { - "load_time_ms": 21 + "load_time_ms": 40 }, "accessibility": { "issues": [], diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/eval_results.json @@ -123,10 +123,10 @@ }, "gameplay_bot": { "pass": false, - "score": 0.25, + "score": 0.31, 
"total": 16, - "passed": 4, - "failed": 12, + "passed": 5, + "failed": 11, "report": { "implementation": { "renderer": "canvas", @@ -145,7 +145,8 @@ "drop": "Space" }, "start_mechanism": "click_canvas", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -161,101 +162,122 @@ { "name": "auto_drop", "pass": false, - "detail": "piece did not move in 5 seconds" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "exception: page.screenshot: Target page, context or browser has been closed" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been 
closed" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "could not read score element" + "detail": "score stayed at 0" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 4, - "failed": 12, - "score": 0.25 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 60, - "lines_cleared": 0, + "pieces_placed": 20, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 345, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 750 + "load_time_ms": 31 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false } } }, - "outcome_score": 0.25, - "score": 0.25, + "outcome_score": 0.48, + "score": 0.48, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 5, + "cognitive_complexity": 90, + "lines_of_code": 708, + "duplication_pct": 0.0, + "tech_debt_minutes": 20, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.65 } } \ No newline at end 
of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "click_canvas", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,93 +33,105 @@ { "name": "auto_drop", "pass": false, - "detail": "piece did not move in 5 seconds" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "exception: page.screenshot: Target page, context or 
browser has been closed" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "could not read score element" + "detail": "score stayed at 0" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 4, - "failed": 12, - "score": 0.25 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 60, - "lines_cleared": 0, + "pieces_placed": 20, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 345, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 750 + "load_time_ms": 31 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + 
], + "issue_count": 2, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -123,13 +123,161 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 60, + "height": 120 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece 
rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 345, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 27 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.28, + "score": 0.28, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 2, + "vulnerabilities": 0, + "code_smells": 16, + "cognitive_complexity": 190, + "lines_of_code": 992, + "duplication_pct": 35.2, + "tech_debt_minutes": 73, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.25 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -9,14 +9,15 @@ "height": 120 }, "controls": { - "left": "a", - "right": "d", - "down": "s", - "rotate": "z", + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,94 +32,106 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "grid state changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after 
hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "exception: keyboard.press: Test timeout of 180000ms exceeded." + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "could not read score element" + "detail": "score stayed at 0" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 6, - "failed": 10, - "score": 0.38 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 60, - "lines_cleared": 0, + "pieces_placed": 20, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 345, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 1515 + 
"load_time_ms": 27 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -125,13 +125,153 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.19, + "total": 16, + "passed": 3, + "failed": 13, + "report": { + "implementation": { + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify auto-drop" + }, + { + "name": "move_left", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_right", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_down", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": 
"rotate", + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify hard drop" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": "score_changes", + "pass": false, + "detail": "no score element found" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": false, + "detail": "only played for 7s" + } + ], + "summary": { + "total": 16, + "passed": 3, + "failed": 13, + "score": 0.19 + }, + "gameplay": { + "pieces_placed": 26, + "lines_cleared": 0, + "max_score_observed": 0, + "play_duration_seconds": 7, + "errors_during_play": 0 + }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, + "performance": { + "load_time_ms": 62 + }, + "accessibility": { + "issues": [], + "issue_count": 0, + "pass": true + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.37, + "score": 0.37, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 8, + "cognitive_complexity": 190, + "lines_of_code": 985, + "duplication_pct": 31.8, + "tech_debt_minutes": 76, + "maintainability": "A", + "reliability": 
"A", + "security": "A", + "score": 0.55 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -10,106 +10,116 @@ "rotate": "ArrowUp", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "exception: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:38155/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "started via click_canvas" }, { "name": "auto_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify auto-drop" }, { "name": "move_left", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_right", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_down", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify 
rotation" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify hard drop" }, { "name": "piece_locks", "pass": false, - "detail": "skipped: page did not load" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "skipped: page did not load" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "skipped: page did not load" + "detail": "no score element found" }, { "name": "game_over", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", "pass": false, - "detail": "skipped: page did not load" + "detail": "only played for 7s" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 3, + "failed": 13, + "score": 0.19 }, "gameplay": { - "pieces_placed": 0, + "pieces_placed": 26, "lines_cleared": 0, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 7, "errors_during_play": 0 }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, "performance": { - "load_time_ms": 4052 + "load_time_ms": 62 }, "accessibility": { "issues": [], diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -123,10 +123,10 @@ }, "gameplay_bot": { "pass": false, - "score": 0.88, + "score": 0.94, "total": 16, - "passed": 14, - "failed": 2, + "passed": 15, + "failed": 1, "report": { "implementation": { "renderer": "canvas", @@ -145,7 +145,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -161,62 +162,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", - "pass": false, - "detail": "rotated: [] failed: [unknown] (tested 1 piece types in 60 attempts)" + "pass": true, + "detail": "rotation observed, piece types seen: [I, J, unknown]" }, { "name": 
"hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "visual change suggests new piece spawned" + "detail": "6 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 8 -> 12" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", @@ -226,24 +227,37 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 78 pieces, no crashes" + "detail": "played for 30s, placed 37 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 14, - "failed": 2, - "score": 0.88 + "passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { - "pieces_placed": 137, + "pieces_placed": 37, "lines_cleared": 1, "max_score_observed": 0, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 300, + "events_count": 11, + "pieces_spawned": 6, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "I", + "J", + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 68 + "load_time_ms": 40 }, "accessibility": { "issues": [ @@ -254,10 +268,19 @@ } } }, - "outcome_score": 0.44, - "score": 0.44, + "outcome_score": 0.745, + "score": 0.745, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 8, + "cognitive_complexity": 190, + "lines_of_code": 985, 
+ "duplication_pct": 31.8, + "tech_debt_minutes": 76, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.55 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", - "pass": false, - "detail": "rotated: [] failed: [unknown] (tested 1 piece types in 60 attempts)" + "pass": true, + "detail": "rotation observed, piece types seen: [I, J, unknown]" }, { "name": "hard_drop", "pass": true, - 
"detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "visual change suggests new piece spawned" + "detail": "6 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 8 -> 12" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", @@ -97,24 +98,37 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 78 pieces, no crashes" + "detail": "played for 30s, placed 37 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 14, - "failed": 2, - "score": 0.88 + "passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { - "pieces_placed": 137, + "pieces_placed": 37, "lines_cleared": 1, "max_score_observed": 0, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 300, + "events_count": 11, + "pieces_spawned": 6, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "I", + "J", + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 68 + "load_time_ms": 40 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -124,10 +124,19 @@ "score": 0, "error": "Gameplay bot timed out after 180 seconds" }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.46, + "score": 0.46, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 1, + "cognitive_complexity": 100, + "lines_of_code": 608, + "duplication_pct": 0.0, + "tech_debt_minutes": 5, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.92 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -42,8 +42,60 @@ "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-pf5imcma/loop-bench-y7z0amb9', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 8, + "code": 3, + "docs": 0, + "unnecessary": 0, + "unnecessary_list": [] + }, + "lines_of_code": 943, + "dependencies": { + "production": 0, + "dev": 7, + "total": 7 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 26, + "excessive": true + }, + "function_length": { + "count": 55, + "average": 6.8, + "max": 26, + "long_functions": 0 + 
}, + "max_nesting_depth": 8, + "global_declarations": 62, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 297, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 39, + "source_lines": 681, + "ratio_pct": 5.7 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 2, + "files_with_both": 2 + }, + "html_validation": { + "valid": false, + "errors": 2 + }, + "duplication_percentage": 0.0, + "score": 0.9 }, "transcript_analysis": { "total_events": 46, @@ -68,14 +120,164 @@ "score": 1.0 }, "gameplay_bot": { - "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "pass": true, + "score": 1, + "total": 16, + "passed": 16, + "failed": 0, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 320, + "height": 640 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { 
+ "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "3 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "2 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 376 to 458" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 33 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 16, + "failed": 0, + "score": 1 + }, + "gameplay": { + "pieces_placed": 33, + "lines_cleared": 2, + "max_score_observed": 458, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 287, + "events_count": 11, + "pieces_spawned": 3, + "pieces_locked": 11, + "lines_cleared": 2, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 23 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.945, + "score": 0.945, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 2, + "cognitive_complexity": 87, + "lines_of_code": 487, + "duplication_pct": 0.0, + "tech_debt_minutes": 22, + "maintainability": "A", + "reliability": "A", + 
"security": "A", + "score": 0.89 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,94 +32,108 @@ }, { "name": "auto_drop", - "pass": false, - "detail": "exception: page.screenshot: Target page, context or browser has been closed" + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", - "pass": false, - "detail": "exception: page.screenshot: Target page, context or browser has been closed" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", - "pass": false, - "detail": "exception: page.screenshot: Target page, context or browser has been closed" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", - "pass": false, - "detail": "exception: page.screenshot: Target page, context or browser has been closed" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", - "pass": false, - "detail": "exception: page.screenshot: Target page, context or browser has been closed" + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", - "pass": false, - "detail": "exception: 
page.reload: Target page, context or browser has been closed" + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", - "pass": false, - "detail": "exception: page.screenshot: Target page, context or browser has been closed" + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "3 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "11 pieces placed during play session" }, { "name": "line_clear", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "2 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "could not read score element" + "pass": true, + "detail": "score changed from 376 to 458" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 33 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 2, - "failed": 14, - "score": 0.13 + "passed": 16, + "failed": 0, + "score": 1 }, "gameplay": { - "pieces_placed": 0, - "lines_cleared": 0, - "max_score_observed": 0, - 
"play_duration_seconds": 0, + "pieces_placed": 33, + "lines_cleared": 2, + "max_score_observed": 458, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 287, + "events_count": 11, + "pieces_spawned": 3, + "pieces_locked": 11, + "lines_cleared": 2, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": -1 + "load_time_ms": 23 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -42,8 +42,60 @@ "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-traf_u58/loop-bench-drho0eka', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 8, + "code": 3, + "docs": 0, + "unnecessary": 0, + "unnecessary_list": [] + }, + "lines_of_code": 1526, + "dependencies": { + "production": 0, + "dev": 7, + "total": 7 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 36, + "excessive": true + }, + "function_length": { + "count": 67, + "average": 7.4, + "max": 48, + "long_functions": 0 + }, + "max_nesting_depth": 14, + "global_declarations": 38, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 
100.0, + "camel_case": 526, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 86, + "source_lines": 1180, + "ratio_pct": 7.3 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 2, + "files_with_both": 2 + }, + "html_validation": { + "valid": false, + "errors": 8 + }, + "duplication_percentage": 0.0, + "score": 0.85 }, "transcript_analysis": { "total_events": 47, @@ -69,13 +121,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.38, + "total": 16, + "passed": 6, + "failed": 10, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 60, + "height": 120 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells 
detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 172 to 194" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 6, + "failed": 10, + "score": 0.38 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 194, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 342, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 26 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.44, + "score": 0.44, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 15, + "cognitive_complexity": 164, + "lines_of_code": 746, + "duplication_pct": 0.0, + "tech_debt_minutes": 82, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,93 +33,106 @@ { "name": "auto_drop", "pass": false, - "detail": "exception: page.screenshot: Target page, context or browser has been closed" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "exception: page.screenshot: Target page, context or browser has been closed" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "exception: page.screenshot: Target page, context or browser has been closed" + "detail": "no grid change detected after key press" }, { "name": "move_down", "pass": false, - "detail": "exception: page.screenshot: Target page, context or browser has been closed" + "detail": "no grid change detected after key press" }, { "name": "rotate", "pass": false, - "detail": "exception: page.screenshot: Target page, context or browser has been closed" + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "exception: page.reload: Target page, context or browser has been closed" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "exception: page.screenshot: Target page, context or 
browser has been closed" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "could not read score element" + "pass": true, + "detail": "score changed from 172 to 194" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 2, - "failed": 14, - "score": 0.13 + "passed": 6, + "failed": 10, + "score": 0.38 }, "gameplay": { - "pieces_placed": 0, - "lines_cleared": 0, - "max_score_observed": 0, - "play_duration_seconds": 0, + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 194, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 342, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + 
"piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 75 + "load_time_ms": 26 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -121,10 +121,10 @@ }, "gameplay_bot": { "pass": false, - "score": 0.75, + "score": 0.38, "total": 16, - "passed": 12, - "failed": 4, + "passed": 6, + "failed": 10, "report": { "implementation": { "renderer": "canvas", @@ -143,7 +143,8 @@ "drop": "Space" }, "start_mechanism": "enter", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -158,63 +159,63 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": 
"no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", - "pass": true, - "detail": "visual change detected after hard drop" + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", - "pass": true, - "detail": "game still responding after 10 piece drops" + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "score did not increase: [188] -> no change after polling" + "pass": true, + "detail": "score changed from 164 to 168" }, { "name": "game_over", @@ -224,24 +225,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 76 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 12, - "failed": 4, - "score": 0.75 + "passed": 6, + "failed": 10, + "score": 0.38 }, "gameplay": { - "pieces_placed": 186, + "pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 180, + "max_score_observed": 168, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 349, + "events_count": 2, 
+ "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 29 + "load_time_ms": 43 }, "accessibility": { "issues": [ @@ -255,10 +265,19 @@ } } }, - "outcome_score": 0.375, - "score": 0.375, + "outcome_score": 0.53, + "score": 0.53, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 4, + "cognitive_complexity": 90, + "lines_of_code": 667, + "duplication_pct": 0.0, + "tech_debt_minutes": 19, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.68 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "enter", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,63 +32,63 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - 
"detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", - "pass": true, - "detail": "visual change detected after hard drop" + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", - "pass": true, - "detail": "game still responding after 10 piece drops" + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "score did not increase: [188] -> no change after polling" + "pass": true, + "detail": "score changed from 164 to 168" }, { "name": "game_over", @@ -97,24 +98,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 76 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 12, - "failed": 4, - "score": 0.75 + "passed": 6, + "failed": 10, + "score": 0.38 }, "gameplay": { - "pieces_placed": 186, + "pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 180, + "max_score_observed": 168, "play_duration_seconds": 30, 
"errors_during_play": 0 }, + "session": { + "frames": 349, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 29 + "load_time_ms": 43 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -42,8 +42,60 @@ "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-ivkll6nh/loop-bench-e35s4mjy', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 7, + "code": 3, + "docs": 0, + "unnecessary": 0, + "unnecessary_list": [] + }, + "lines_of_code": 1723, + "dependencies": { + "production": 0, + "dev": 7, + "total": 7 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 36, + "excessive": true + }, + "function_length": { + "count": 59, + "average": 9.5, + "max": 45, + "long_functions": 0 + }, + "max_nesting_depth": 12, + "global_declarations": 32, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 556, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 132, + "source_lines": 1194, + "ratio_pct": 11.1 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 2, + "files_with_both": 
2 + }, + "html_validation": { + "valid": false, + "errors": 0 + }, + "duplication_percentage": 0.0, + "score": 0.5 }, "transcript_analysis": { "total_events": 57, @@ -69,13 +121,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.38, + "total": 16, + "passed": 6, + "failed": 10, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 50, + "height": 100 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": 
false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 202 to 226" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 6, + "failed": 10, + "score": 0.38 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 226, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 348, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 106 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.44, + "score": 0.44, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 15, + "cognitive_complexity": 165, + "lines_of_code": 835, + "duplication_pct": 0.0, + "tech_debt_minutes": 92, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,63 +32,63 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, 
{ "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "score did not increase: [268] -> no change after polling" + "pass": true, + "detail": "score changed from 202 to 226" }, { "name": "game_over", @@ -97,24 +98,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 78 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 10, - "failed": 6, - "score": 0.63 + "passed": 6, + "failed": 10, + "score": 0.38 }, "gameplay": { - "pieces_placed": 188, + "pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 200, + "max_score_observed": 226, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 348, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 44 + "load_time_ms": 106 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -42,8 +42,60 @@ "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', 
'/tmp/reeval-1mwrka44/loop-bench-294r5yrr', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 7, + "code": 3, + "docs": 0, + "unnecessary": 0, + "unnecessary_list": [] + }, + "lines_of_code": 1376, + "dependencies": { + "production": 0, + "dev": 7, + "total": 7 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 58, + "excessive": true + }, + "function_length": { + "count": 69, + "average": 5.9, + "max": 17, + "long_functions": 0 + }, + "max_nesting_depth": 12, + "global_declarations": 84, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 538, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 71, + "source_lines": 995, + "ratio_pct": 7.1 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 2, + "files_with_both": 2 + }, + "html_validation": { + "valid": false, + "errors": 8 + }, + "duplication_percentage": 0.0, + "score": 0.85 }, "transcript_analysis": { "total_events": 41, @@ -69,13 +121,165 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 320, + "height": 640 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, 
+ { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation observed, piece types seen: [unknown, I, J]" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "5 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 47 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 47, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 278, + "events_count": 11, + "pieces_spawned": 5, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "I", + "J" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 56 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or 
role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.835, + "score": 0.835, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 4, + "cognitive_complexity": 126, + "lines_of_code": 688, + "duplication_pct": 0.0, + "tech_debt_minutes": 38, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.73 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=off_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed 
after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation observed, piece types seen: [unknown, I, J]" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "5 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "game still responding after 10 piece drops" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "11 line(s) cleared during AI play" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", @@ -97,7 +98,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 86 pieces, no crashes" + "detail": "played for 30s, placed 47 pieces, no crashes" } ], "summary": { @@ -107,14 +108,27 @@ "score": 0.94 }, "gameplay": { - "pieces_placed": 127, - "lines_cleared": 41, + "pieces_placed": 47, + "lines_cleared": 1, "max_score_observed": 0, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 278, + "events_count": 11, + "pieces_spawned": 5, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "I", + "J" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 79 + "load_time_ms": 56 }, "accessibility": { "issues": [ diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -125,10 +125,10 @@ }, "gameplay_bot": { "pass": false, - "score": 0.56, + "score": 0.38, "total": 16, - "passed": 9, - "failed": 7, + "passed": 6, + "failed": 10, "report": { "implementation": { "renderer": "canvas", @@ -147,7 +147,8 @@ "drop": "Space" }, "start_mechanism": "click_canvas", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -163,62 +164,62 @@ { "name": "auto_drop", "pass": false, - "detail": "piece did not move in 5 seconds" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no 
change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", - "pass": false, - "detail": "could not verify piece locking at bottom" + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 1 lock event(s))" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "2 line(s) cleared during AI play" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", @@ -228,24 +229,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 26s, placed 100 pieces, no crashes" + "detail": "played for 30s, placed 22 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 9, - "failed": 7, - "score": 0.56 + "passed": 6, + "failed": 10, + "score": 0.38 }, "gameplay": { - "pieces_placed": 200, - "lines_cleared": 6, + "pieces_placed": 22, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 26, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 339, + "events_count": 3, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 168 + "load_time_ms": 92 }, "accessibility": { "issues": [ @@ -257,10 +267,19 @@ } } }, - "outcome_score": 0.28, - "score": 0.28, + "outcome_score": 0.315, + "score": 0.315, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 2, + "vulnerabilities": 0, + "code_smells": 20, + 
"cognitive_complexity": 186, + "lines_of_code": 1056, + "duplication_pct": 35.3, + "tech_debt_minutes": 72, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.25 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "click_canvas", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": false, - "detail": "piece did not move in 5 seconds" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": 
false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", - "pass": false, - "detail": "could not verify piece locking at bottom" + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 1 lock event(s))" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "2 line(s) cleared during AI play" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", @@ -97,24 +98,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 26s, placed 100 pieces, no crashes" + "detail": "played for 30s, placed 22 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 9, - "failed": 7, - "score": 0.56 + "passed": 6, + "failed": 10, + "score": 0.38 }, "gameplay": { - "pieces_placed": 200, - "lines_cleared": 6, + "pieces_placed": 22, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 26, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 339, + "events_count": 3, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 168 + "load_time_ms": 92 }, "accessibility": { "issues": [ diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -125,15 +125,20 @@ }, "gameplay_bot": { "pass": false, - "score": 0, + "score": 0.69, "total": 16, - "passed": 0, - "failed": 16, + "passed": 11, + "failed": 5, "report": { "implementation": { - "renderer": "unknown", - "grid_detected": false, - "grid_bounds": null, + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, "controls": { "left": "ArrowLeft", "right": "ArrowRight", @@ -141,128 +146,140 @@ "rotate": "ArrowUp", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false, - "grid_confidence": 0 + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "started via auto" }, { "name": "auto_drop", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until 
\"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - 
navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 1 lock event(s))" }, { "name": "new_piece_spawns", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "1 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "11 pieces placed during play session" }, { "name": "line_clear", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "score stayed at 0" }, { "name": "game_over", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "played for 30s, placed 30 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 
0 + "passed": 11, + "failed": 5, + "score": 0.69 }, "gameplay": { - "pieces_placed": 0, - "lines_cleared": 0, + "pieces_placed": 30, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 0, - "events_count": 0, - "pieces_spawned": 0, - "pieces_locked": 0, - "lines_cleared": 0, + "frames": 324, + "events_count": 6, + "pieces_spawned": 1, + "pieces_locked": 11, + "lines_cleared": 1, "piece_types_seen": [], - "grid_read_success_rate": 0 + "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": -1 + "load_time_ms": 80 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false } } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.62, + "score": 0.62, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 9, + "cognitive_complexity": 170, + "lines_of_code": 881, + "duplication_pct": 21.8, + "tech_debt_minutes": 41, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.55 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -1,8 +1,13 @@ { "implementation": { - "renderer": "unknown", - "grid_detected": false, - "grid_bounds": null, + "renderer": "canvas", + 
"grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, "controls": { "left": "ArrowLeft", "right": "ArrowRight", @@ -10,120 +15,123 @@ "rotate": "ArrowUp", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false, - "grid_confidence": 0 + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "started via auto" }, { "name": "auto_drop", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", 
waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 1 lock event(s))" }, { "name": "new_piece_spawns", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "1 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "11 pieces placed during play session" }, { "name": "line_clear", - "pass": false, - "detail": "page load 
failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "score stayed at 0" }, { "name": "game_over", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40141/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "played for 30s, placed 30 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 11, + "failed": 5, + "score": 0.69 }, "gameplay": { - "pieces_placed": 0, - "lines_cleared": 0, + "pieces_placed": 30, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 0, - "events_count": 0, - "pieces_spawned": 0, - "pieces_locked": 0, - "lines_cleared": 0, + "frames": 324, + "events_count": 6, + "pieces_spawned": 1, + "pieces_locked": 11, + "lines_cleared": 1, "piece_types_seen": [], - "grid_read_success_rate": 0 + "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": -1 + "load_time_ms": 80 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false } } \ No 
newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -126,13 +126,153 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.19, + "total": 16, + "passed": 3, + "failed": 13, + "report": { + "implementation": { + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify auto-drop" + }, + { + "name": "move_left", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_right", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_down", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "rotate", + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": 
"hard_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify hard drop" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": "score_changes", + "pass": false, + "detail": "no score element found" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": false, + "detail": "only played for 7s" + } + ], + "summary": { + "total": 16, + "passed": 3, + "failed": 13, + "score": 0.19 + }, + "gameplay": { + "pieces_placed": 26, + "lines_cleared": 0, + "max_score_observed": 0, + "play_duration_seconds": 7, + "errors_during_play": 0 + }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, + "performance": { + "load_time_ms": 38 + }, + "accessibility": { + "issues": [], + "issue_count": 0, + "pass": true + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.22, + "score": 0.22, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 2, + "vulnerabilities": 0, + "code_smells": 20, + "cognitive_complexity": 186, + "lines_of_code": 1056, + "duplication_pct": 35.3, + "tech_debt_minutes": 72, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.25 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -10,113 +10,120 @@ "rotate": "ArrowUp", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "1 console error(s): Unexpected token '<'" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "started via click_canvas" }, { "name": "auto_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify auto-drop" }, { "name": "move_left", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_right", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_down", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify rotation" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, 
- "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify hard drop" }, { "name": "piece_locks", "pass": false, - "detail": "skipped: page did not load" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "skipped: page did not load" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "skipped: page did not load" + "detail": "no score element found" }, { "name": "game_over", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", "pass": false, - "detail": "skipped: page did not load" + "detail": "only played for 7s" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 3, + "failed": 13, + "score": 0.19 }, "gameplay": { - "pieces_placed": 0, + "pieces_placed": 26, "lines_cleared": 0, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 7, "errors_during_play": 0 }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, "performance": { - "load_time_ms": 165 + "load_time_ms": 38 }, "accessibility": { - "issues": [ - "canvas without aria-label or role", - "canvas without aria-label or role" - ], - "issue_count": 2, - "pass": false + "issues": [], + "issue_count": 0, + "pass": true } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -42,8 +42,60 @@ "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-trbyqbmh/loop-bench-dwwdc2bs', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 8, + "code": 3, + "docs": 0, + "unnecessary": 0, + "unnecessary_list": [] + }, + "lines_of_code": 1043, + "dependencies": { + "production": 0, + "dev": 7, + "total": 7 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 28, + "excessive": true + }, + "function_length": { + "count": 53, + "average": 7.2, + "max": 26, + "long_functions": 0 + }, + "max_nesting_depth": 10, + "global_declarations": 54, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 367, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 67, + "source_lines": 756, + "ratio_pct": 8.9 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 2, + "files_with_both": 2 + }, + "html_validation": { + "valid": false, + "errors": 6 + }, + "duplication_percentage": 0.0, + "score": 0.85 }, "transcript_analysis": { "total_events": 42, @@ -69,13 +121,163 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.81, + "total": 
16, + "passed": 13, + "failed": 3, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 320, + "height": 640 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "2 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 292" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after 
stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 34 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 13, + "failed": 3, + "score": 0.81 + }, + "gameplay": { + "pieces_placed": 34, + "lines_cleared": 1, + "max_score_observed": 292, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 312, + "events_count": 10, + "pieces_spawned": 2, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 35 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.835, + "score": 0.835, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 3, + "cognitive_complexity": 86, + "lines_of_code": 520, + "duplication_pct": 0.0, + "tech_debt_minutes": 16, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.86 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -12,11 +12,12 @@ "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "x", + "rotate": "ArrowUp", "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + 
"score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", - "pass": false, - "detail": "no change detected after key press" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", - "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "pass": false, + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "2 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 20 -> 49" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "1 line(s) cleared during AI play" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [158] -> no change after polling" + 
"detail": "score stayed at 292" }, { "name": "game_over", @@ -97,24 +98,35 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 80 pieces, no crashes" + "detail": "played for 30s, placed 34 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 14, - "failed": 2, - "score": 0.88 + "passed": 13, + "failed": 3, + "score": 0.81 }, "gameplay": { - "pieces_placed": 121, - "lines_cleared": 2, - "max_score_observed": 164, + "pieces_placed": 34, + "lines_cleared": 1, + "max_score_observed": 292, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 312, + "events_count": 10, + "pieces_spawned": 2, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 32 + "load_time_ms": 35 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -121,13 +121,163 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.88, + "total": 16, + "passed": 14, + "failed": 2, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 320, + "height": 640 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + 
"grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "3 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 310" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 43 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 14, + "failed": 2, + "score": 0.88 + }, + "gameplay": { + "pieces_placed": 43, + "lines_cleared": 0, + "max_score_observed": 310, + "play_duration_seconds": 30, 
+ "errors_during_play": 0 + }, + "session": { + "frames": 286, + "events_count": 10, + "pieces_spawned": 3, + "pieces_locked": 11, + "lines_cleared": 0, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 45 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.9, + "score": 0.9, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 1, + "cognitive_complexity": 77, + "lines_of_code": 486, + "duplication_pct": 0.0, + "tech_debt_minutes": 5, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.92 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state 
changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "3 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 16 -> 40" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", - "pass": true, - "detail": "line cleared via strategic placement" + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [160] -> no change after polling" + "detail": "score stayed at 310" }, { "name": "game_over", @@ -97,24 +98,35 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 79 pieces, no crashes" + "detail": "played for 30s, placed 43 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 15, - "failed": 1, - "score": 0.94 + "passed": 14, + "failed": 2, + "score": 0.88 }, "gameplay": { - "pieces_placed": 130, - "lines_cleared": 1, - "max_score_observed": 222, 
+ "pieces_placed": 43, + "lines_cleared": 0, + "max_score_observed": 310, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 286, + "events_count": 10, + "pieces_spawned": 3, + "pieces_locked": 11, + "lines_cleared": 0, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 31 + "load_time_ms": 45 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -42,8 +42,60 @@ "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-8enzhjux/loop-bench-qyww_eoe', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 11, + "code": 5, + "docs": 0, + "unnecessary": 0, + "unnecessary_list": [] + }, + "lines_of_code": 1339, + "dependencies": { + "production": 0, + "dev": 7, + "total": 7 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 40, + "excessive": true + }, + "function_length": { + "count": 53, + "average": 8.5, + "max": 48, + "long_functions": 0 + }, + "max_nesting_depth": 12, + "global_declarations": 16, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 532, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 99, + "source_lines": 1000, + "ratio_pct": 9.9 + }, + 
"separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 3, + "files_with_logic": 3, + "files_with_both": 3 + }, + "html_validation": { + "valid": false, + "errors": 2 + }, + "duplication_percentage": 0.0, + "score": 0.85 }, "transcript_analysis": { "total_events": 49, @@ -69,13 +121,163 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.81, + "total": 16, + "passed": 13, + "failed": 3, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 320, + "height": 640 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + 
"pass": true, + "detail": "5 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 252" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 37 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 13, + "failed": 3, + "score": 0.81 + }, + "gameplay": { + "pieces_placed": 37, + "lines_cleared": 1, + "max_score_observed": 252, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 295, + "events_count": 10, + "pieces_spawned": 5, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 27 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.74, + "score": 0.74, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 1, + "cognitive_complexity": 112, + "lines_of_code": 688, + "duplication_pct": 0.0, + "tech_debt_minutes": 5, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.67 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -12,11 +12,12 @@ "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "x", + "rotate": "ArrowUp", "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", - "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "pass": false, + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at 
top of grid" + "detail": "5 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 20 -> 43" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "1 line(s) cleared during AI play" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [214] -> no change after polling" + "detail": "score stayed at 252" }, { "name": "game_over", @@ -97,24 +98,35 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 79 pieces, no crashes" + "detail": "played for 30s, placed 37 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 15, - "failed": 1, - "score": 0.94 + "passed": 13, + "failed": 3, + "score": 0.81 }, "gameplay": { - "pieces_placed": 120, + "pieces_placed": 37, "lines_cleared": 1, - "max_score_observed": 182, + "max_score_observed": 252, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 295, + "events_count": 10, + "pieces_spawned": 5, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 35 + "load_time_ms": 27 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -42,8 +42,60 @@ "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', 
'/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-82_506qo/loop-bench-5vcsw5m5', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 7, + "code": 3, + "docs": 0, + "unnecessary": 0, + "unnecessary_list": [] + }, + "lines_of_code": 1441, + "dependencies": { + "production": 0, + "dev": 7, + "total": 7 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 34, + "excessive": true + }, + "function_length": { + "count": 77, + "average": 6.8, + "max": 50, + "long_functions": 0 + }, + "max_nesting_depth": 12, + "global_declarations": 18, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 576, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 115, + "source_lines": 1022, + "ratio_pct": 11.3 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 2, + "files_with_both": 2 + }, + "html_validation": { + "valid": false, + "errors": 2 + }, + "duplication_percentage": 0.0, + "score": 0.85 }, "transcript_analysis": { "total_events": 98, @@ -69,13 +121,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 60, + "height": 120 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "enter", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via enter" + }, + { + "name": "auto_drop", + "pass": false, + 
"detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 26" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 26, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 347, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 32 + }, + "accessibility": { + 
"issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.405, + "score": 0.405, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 20, + "cognitive_complexity": 123, + "lines_of_code": 698, + "duplication_pct": 0.0, + "tech_debt_minutes": 52, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "enter", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,63 +32,63 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + 
"detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [210] -> no change after polling" + "detail": "score stayed at 26" }, { "name": "game_over", @@ -97,24 +98,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 78 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 10, - "failed": 6, - "score": 0.63 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 188, + "pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 176, + "max_score_observed": 26, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 347, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 
1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 60 + "load_time_ms": 32 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -42,8 +42,60 @@ "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-9ibz17n0/loop-bench-fffti9wy', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 9, + "code": 4, + "docs": 0, + "unnecessary": 0, + "unnecessary_list": [] + }, + "lines_of_code": 1471, + "dependencies": { + "production": 0, + "dev": 7, + "total": 7 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 72, + "excessive": true + }, + "function_length": { + "count": 60, + "average": 7.9, + "max": 44, + "long_functions": 0 + }, + "max_nesting_depth": 12, + "global_declarations": 18, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 464, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 136, + "source_lines": 983, + "ratio_pct": 13.8 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 2, + "files_with_both": 2 + }, + "html_validation": { + "valid": false, + "errors": 5 + }, + "duplication_percentage": 0.0, + "score": 0.85 }, "transcript_analysis": { 
"total_events": 46, @@ -69,13 +121,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.38, + "total": 16, + "passed": 6, + "failed": 10, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 52, + "height": 104 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "enter", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via enter" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared 
(grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 186 to 208" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 6, + "failed": 10, + "score": 0.38 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 208, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 350, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 31 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.44, + "score": 0.44, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 16, + "cognitive_complexity": 111, + "lines_of_code": 696, + "duplication_pct": 0.0, + "tech_debt_minutes": 56, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "enter", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,63 +32,63 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", "pass": false, - "detail": "no change detected after rotate key" + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": 
"multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "score did not increase: [208] -> no change after polling" + "pass": true, + "detail": "score changed from 186 to 208" }, { "name": "game_over", @@ -97,24 +98,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 78 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 9, - "failed": 7, - "score": 0.56 + "passed": 6, + "failed": 10, + "score": 0.38 }, "gameplay": { - "pieces_placed": 188, + "pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 190, + "max_score_observed": 208, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 350, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 82 + "load_time_ms": 31 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -121,10 +121,10 @@ }, "gameplay_bot": { "pass": false, - "score": 0.69, + "score": 0.94, "total": 16, - "passed": 11, - "failed": 5, + "passed": 15, + "failed": 1, 
"report": { "implementation": { "renderer": "canvas", @@ -143,7 +143,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": false + "score_element_found": false, + "grid_confidence": 1 }, "tests": [ { @@ -159,62 +160,62 @@ { "name": "auto_drop", "pass": true, - "detail": "pixels changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", - "pass": false, - "detail": "no change detected after key press" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", - "pass": false, - "detail": "could not detect any piece rotations" + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", - "pass": false, - "detail": "no change detected after hard drop key" + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "2 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", - "pass": false, - "detail": "grid did not accumulate filled cells" + "pass": true, + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "1 line(s) cleared during AI play" 
+ "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "no score element found and no number changed" + "detail": "no score element found" }, { "name": "game_over", @@ -224,24 +225,35 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 26s, placed 100 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 11, - "failed": 5, - "score": 0.69 + "passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { - "pieces_placed": 200, - "lines_cleared": 2, + "pieces_placed": 20, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 26, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 350, + "events_count": 9, + "pieces_spawned": 2, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 43 + "load_time_ms": 44 }, "accessibility": { "issues": [ @@ -252,10 +264,19 @@ } } }, - "outcome_score": 0.345, - "score": 0.345, + "outcome_score": 0.595, + "score": 0.595, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 2, + "vulnerabilities": 0, + "code_smells": 14, + "cognitive_complexity": 140, + "lines_of_code": 961, + "duplication_pct": 33.6, + "tech_debt_minutes": 107, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.25 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": false + "score_element_found": false, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "pixels changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", - "pass": false, - "detail": "no change detected after key press" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", - "pass": false, - "detail": "could not detect any piece rotations" + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", - "pass": false, - "detail": "no change detected after hard drop key" + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "2 new piece(s) detected at top of grid" }, { "name": 
"multiple_pieces", - "pass": false, - "detail": "grid did not accumulate filled cells" + "pass": true, + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "1 line(s) cleared during AI play" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "no score element found and no number changed" + "detail": "no score element found" }, { "name": "game_over", @@ -97,24 +98,35 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 26s, placed 100 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 11, - "failed": 5, - "score": 0.69 + "passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { - "pieces_placed": 200, - "lines_cleared": 2, + "pieces_placed": 20, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 26, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 350, + "events_count": 9, + "pieces_spawned": 2, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 43 + "load_time_ms": 44 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -124,13 +124,164 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.81, + "total": 
16, + "passed": 13, + "failed": 3, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 1 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "10 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking 
to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 35 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 13, + "failed": 3, + "score": 0.81 + }, + "gameplay": { + "pieces_placed": 35, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 315, + "events_count": 8, + "pieces_spawned": 10, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "I" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 141 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.82, + "score": 0.82, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 4, + "cognitive_complexity": 79, + "lines_of_code": 564, + "duplication_pct": 0.0, + "tech_debt_minutes": 18, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.83 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - 
"detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", - "pass": true, - "detail": "rotated: [I, other] failed: [] (tested 2 piece types in 60 attempts)" + "pass": false, + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 1 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "visual change suggests new piece spawned" + "detail": "10 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "game still responding after 10 piece drops" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", @@ -97,24 +98,36 @@ { "name": "playable_30s", "pass": true, - "detail": 
"played for 30s, placed 80 pieces, no crashes" + "detail": "played for 30s, placed 35 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 15, - "failed": 1, - "score": 0.94 + "passed": 13, + "failed": 3, + "score": 0.81 }, "gameplay": { - "pieces_placed": 167, + "pieces_placed": 35, "lines_cleared": 1, "max_score_observed": 0, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 315, + "events_count": 8, + "pieces_spawned": 10, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "I" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 29 + "load_time_ms": 141 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -124,13 +124,163 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via 
auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "2 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 40 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 40, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 297, + "events_count": 10, + "pieces_spawned": 2, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + 
"grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 120 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.745, + "score": 0.745, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 18, + "cognitive_complexity": 164, + "lines_of_code": 996, + "duplication_pct": 17.0, + "tech_debt_minutes": 48, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.55 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -12,11 +12,12 @@ "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "z", + "rotate": "ArrowUp", "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press 
(grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "2 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 16 -> 38" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", @@ -97,7 +98,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 26s, placed 100 pieces, no crashes" + "detail": "played for 30s, placed 40 pieces, no crashes" } ], "summary": { @@ -107,14 +108,25 @@ "score": 0.94 }, "gameplay": { - "pieces_placed": 151, + "pieces_placed": 40, "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 26, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 297, + "events_count": 10, + "pieces_spawned": 2, + 
"pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 49 + "load_time_ms": 120 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -124,13 +124,161 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 75, + "height": 150 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + 
"pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 345, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 42 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.54, + "score": 0.54, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 6, + "cognitive_complexity": 85, + "lines_of_code": 721, + 
"duplication_pct": 0.0, + "tech_debt_minutes": 18, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.77 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "click_canvas", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": false, - "detail": "piece did not move in 5 seconds" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with 
bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", @@ -97,24 +98,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 76 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 7, - "failed": 9, - "score": 0.44 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 186, + "pieces_placed": 20, "lines_cleared": 1, "max_score_observed": 0, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 345, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 21 + "load_time_ms": 42 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -117,13 +117,153 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.19, + "total": 16, + "passed": 3, + "failed": 13, + "report": { + "implementation": { + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify auto-drop" + }, + { + "name": "move_left", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_right", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_down", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "rotate", + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify hard drop" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not 
detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": "score_changes", + "pass": false, + "detail": "no score element found" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": false, + "detail": "only played for 7s" + } + ], + "summary": { + "total": 16, + "passed": 3, + "failed": 13, + "score": 0.19 + }, + "gameplay": { + "pieces_placed": 26, + "lines_cleared": 0, + "max_score_observed": 0, + "play_duration_seconds": 7, + "errors_during_play": 0 + }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, + "performance": { + "load_time_ms": 26 + }, + "accessibility": { + "issues": [], + "issue_count": 0, + "pass": true + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.58, + "score": 0.58, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 1, + "cognitive_complexity": 0, + "lines_of_code": 657, + "duplication_pct": 0.0, + "tech_debt_minutes": 12, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.97 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -1,22 +1,18 @@ { "implementation": { - "renderer": "dom", - "grid_detected": true, - "grid_bounds": { - "x": 360.5, - "y": 50, - "width": 329, - "height": 639 - }, + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, "controls": { "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "z", + "rotate": "ArrowUp", "drop": "Space" }, - "start_mechanism": "auto", - "score_element_found": true + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 }, "tests": [ { @@ -27,67 +23,67 @@ { "name": "game_starts", "pass": true, - "detail": "started via auto" + "detail": "started via click_canvas" }, { "name": "auto_drop", - "pass": true, - "detail": "grid state changed after 5s with no input" + "pass": false, + "detail": "grid reader unreliable, cannot verify auto-drop" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": 
"hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "grid reader unreliable, cannot verify hard drop" }, { "name": "piece_locks", - "pass": true, - "detail": "filled cells persist at bottom" + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", - "pass": true, - "detail": "new piece detected at top of grid" + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": true, - "detail": "line cleared via strategic placement" + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "no score element found" }, { "name": "game_over", @@ -97,30 +93,37 @@ { "name": "playable_30s", "pass": false, - "detail": "390 console errors, 0 play errors during 30s" + "detail": "only played for 7s" } ], "summary": { "total": 16, - "passed": 11, - "failed": 5, - "score": 0.69 + "passed": 3, + "failed": 13, + "score": 0.19 }, "gameplay": { - "pieces_placed": 188, - "lines_cleared": 1, + "pieces_placed": 26, + "lines_cleared": 0, "max_score_observed": 0, - "play_duration_seconds": 30, + "play_duration_seconds": 7, "errors_during_play": 0 }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, "performance": { - "load_time_ms": 27 + "load_time_ms": 26 }, "accessibility": { - "issues": [ - "no headings found" - ], - "issue_count": 1, - "pass": false + "issues": [], + "issue_count": 0, + "pass": true } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -95,10 +95,10 @@ }, "html_validation": { "valid": false, - "errors": 1 + "errors": 0 }, "duplication_percentage": 0.0, - "score": 0.8 + "score": 0.45 }, "transcript_analysis": { "total_events": 91, @@ -146,7 +146,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -162,62 +163,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": false, - "detail": "no change detected after key press" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [I] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": 
"hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "2 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 16 -> 40" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", @@ -227,7 +228,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 94 pieces, no crashes" + "detail": "played for 30s, placed 33 pieces, no crashes" } ], "summary": { @@ -237,14 +238,25 @@ "score": 0.88 }, "gameplay": { - "pieces_placed": 145, + "pieces_placed": 33, "lines_cleared": 1, "max_score_observed": 0, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 314, + "events_count": 10, + "pieces_spawned": 2, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 21 + "load_time_ms": 32 }, "accessibility": { "issues": [ @@ -256,10 +268,19 @@ } } }, - "outcome_score": 0.88, - "score": 0.88, + "outcome_score": 0.925, + "score": 0.925, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 1, + "cognitive_complexity": 0, + "lines_of_code": 657, + "duplication_pct": 0.0, + "tech_debt_minutes": 12, + "maintainability": "A", + "reliability": "A", + 
"security": "A", + "score": 0.97 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": false, - "detail": "no change detected after key press" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [I] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately 
dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "2 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 16 -> 40" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", @@ -97,7 +98,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 94 pieces, no crashes" + "detail": "played for 30s, placed 33 pieces, no crashes" } ], "summary": { @@ -107,14 +108,25 @@ "score": 0.88 }, "gameplay": { - "pieces_placed": 145, + "pieces_placed": 33, "lines_cleared": 1, "max_score_observed": 0, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 314, + "events_count": 10, + "pieces_spawned": 2, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 21 + "load_time_ms": 32 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -125,144 +125,165 @@ }, "gameplay_bot": { "pass": false, - "score": 0, + "score": 0.88, "total": 16, - "passed": 0, - "failed": 16, + "passed": 14, + "failed": 2, "report": { "implementation": { - "renderer": "unknown", - "grid_detected": false, - "grid_bounds": null, + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, "controls": { "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "ArrowUp", + "rotate": "z", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false, - "grid_confidence": 0 + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "started via auto" }, { "name": "auto_drop", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - 
navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "rotation observed, piece types seen: [unknown, L]" }, { "name": "hard_drop", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "filled cells 
persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "3 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "11 pieces placed during play session" }, { "name": "line_clear", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "score stayed at 0" }, { "name": "game_over", "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "could not trigger or detect game over" }, { "name": "playable_30s", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "played for 30s, placed 45 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 14, + "failed": 2, + "score": 0.88 }, "gameplay": { - "pieces_placed": 0, - "lines_cleared": 0, + "pieces_placed": 45, + 
"lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 0, - "events_count": 0, - "pieces_spawned": 0, - "pieces_locked": 0, - "lines_cleared": 0, - "piece_types_seen": [], - "grid_read_success_rate": 0 + "frames": 278, + "events_count": 10, + "pieces_spawned": 3, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "L" + ], + "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": -1 + "load_time_ms": 36 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "no headings found", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false } } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.925, + "score": 0.925, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 1, + "cognitive_complexity": 0, + "lines_of_code": 657, + "duplication_pct": 0.0, + "tech_debt_minutes": 12, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.97 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -1,129 +1,141 @@ { "implementation": { - "renderer": "unknown", - "grid_detected": false, - "grid_bounds": null, + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + 
"y": 0, + "width": 300, + "height": 600 + }, "controls": { "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "ArrowUp", + "rotate": "z", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false, - "grid_confidence": 0 + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "started via auto" }, { "name": "auto_drop", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to 
\"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "rotation observed, piece types seen: [unknown, L]" }, { "name": "hard_drop", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "3 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "11 pieces placed 
during play session" }, { "name": "line_clear", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "score stayed at 0" }, { "name": "game_over", "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "could not trigger or detect game over" }, { "name": "playable_30s", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:34125/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "played for 30s, placed 45 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 14, + "failed": 2, + "score": 0.88 }, "gameplay": { - "pieces_placed": 0, - "lines_cleared": 0, + "pieces_placed": 45, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 0, - "events_count": 0, - "pieces_spawned": 0, - "pieces_locked": 0, - "lines_cleared": 0, - "piece_types_seen": [], - "grid_read_success_rate": 0 + "frames": 278, + "events_count": 10, + "pieces_spawned": 3, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "L" + ], + "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": -1 + "load_time_ms": 36 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + 
"no headings found", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -124,13 +124,161 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change 
detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 345, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 37 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.28, + "score": 0.28, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 3, + "vulnerabilities": 0, + "code_smells": 13, + "cognitive_complexity": 162, + "lines_of_code": 1064, + "duplication_pct": 16.2, + "tech_debt_minutes": 48, + 
"maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.25 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "click_canvas", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": false, - "detail": "piece did not move in 5 seconds" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { 
"name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", @@ -96,29 +97,41 @@ }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 6, - "failed": 10, - "score": 0.38 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 110, + "pieces_placed": 20, "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 345, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 39 + "load_time_ms": 37 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -124,13 +124,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.69, + "total": 16, + "passed": 11, + "failed": 5, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "a", + "right": "d", + "down": "s", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": true, + "detail": 
"piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "3 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 41 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 11, + "failed": 5, + "score": 0.69 + }, + "gameplay": { + "pieces_placed": 41, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 293, + "events_count": 7, + "pieces_spawned": 3, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 30 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role" + ], + "issue_count": 1, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.68, + "score": 0.68, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 6, + "cognitive_complexity": 103, + "lines_of_code": 612, + "duplication_pct": 0.0, + "tech_debt_minutes": 32, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.67 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -9,14 +9,15 @@ "height": 600 }, "controls": { - "left": "ArrowLeft", - "right": "ArrowRight", - "down": "ArrowDown", - "rotate": "ArrowUp", + "left": "a", + "right": "d", + "down": "s", + "rotate": "x", "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": false, - "detail": "no change detected after rotate key" + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", - "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "pass": false, + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately 
dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "3 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 16 -> 51" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", @@ -97,24 +98,35 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 78 pieces, no crashes" + "detail": "played for 30s, placed 41 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 13, - "failed": 3, - "score": 0.81 + "passed": 11, + "failed": 5, + "score": 0.69 }, "gameplay": { - "pieces_placed": 129, + "pieces_placed": 41, "lines_cleared": 1, "max_score_observed": 0, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 293, + "events_count": 7, + "pieces_spawned": 3, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 531 + "load_time_ms": 30 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -97,10 +97,10 @@ }, "html_validation": { "valid": false, - "errors": 4 + "errors": 0 }, "duplication_percentage": 0.0, - "score": 0.58 + "score": 0.23 }, "transcript_analysis": { "total_events": 117, @@ -126,13 +126,161 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "space", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via space" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop 
key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 349, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 44 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.28, + "score": 0.28, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 2, + "vulnerabilities": 0, + "code_smells": 11, + "cognitive_complexity": 164, + "lines_of_code": 1102, + "duplication_pct": 31.6, + "tech_debt_minutes": 40, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.25 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "space", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,72 +33,72 @@ { "name": "auto_drop", "pass": false, - "detail": "piece did not move in 5 seconds" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns 
detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Test timeout of 180000ms exceeded." + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { @@ -107,18 +108,30 @@ "score": 0.31 }, "gameplay": { - "pieces_placed": 110, + "pieces_placed": 20, "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 349, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 310 + "load_time_ms": 44 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -95,10 +95,10 @@ }, "html_validation": { "valid": false, - "errors": 1 + "errors": 0 }, "duplication_percentage": 0.0, - "score": 0.8 + "score": 0.45 }, "transcript_analysis": { "total_events": 115, @@ -124,144 +124,163 @@ }, "gameplay_bot": { "pass": false, - "score": 0, + "score": 0.94, "total": 16, - "passed": 0, - "failed": 16, + "passed": 15, + "failed": 1, "report": { "implementation": { - "renderer": "unknown", - "grid_detected": false, - "grid_bounds": null, + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, "controls": { "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "ArrowUp", + "rotate": "x", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false, - "grid_confidence": 0 + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + 
"detail": "started via auto" }, { "name": "auto_drop", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", - "pass": false, - "detail": "page load failed: page.goto: 
Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "2 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "11 pieces placed during play session" }, { "name": "line_clear", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "score stayed at 0" }, { "name": "game_over", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": 
"playable_30s", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "played for 31s, placed 43 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { - "pieces_placed": 0, - "lines_cleared": 0, + "pieces_placed": 43, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 31, "errors_during_play": 0 }, "session": { - "frames": 0, - "events_count": 0, - "pieces_spawned": 0, - "pieces_locked": 0, - "lines_cleared": 0, - "piece_types_seen": [], - "grid_read_success_rate": 0 + "frames": 295, + "events_count": 11, + "pieces_spawned": 2, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": -1 + "load_time_ms": 40 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false } } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.595, + "score": 0.595, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 2, + "vulnerabilities": 0, + "code_smells": 14, + "cognitive_complexity": 182, + "lines_of_code": 936, + "duplication_pct": 26.8, + "tech_debt_minutes": 40, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.25 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -1,129 +1,139 @@ { "implementation": { - "renderer": "unknown", - "grid_detected": false, - "grid_bounds": null, + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, "controls": { "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "ArrowUp", + "rotate": "x", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false, - "grid_confidence": 0 + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "started via auto" }, { "name": "auto_drop", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after key press 
(grid-verified)" }, { "name": "move_right", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", - "pass": false, - "detail": "page load failed: page.goto: 
Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "2 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "11 pieces placed during play session" }, { "name": "line_clear", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "score stayed at 0" }, { "name": "game_over", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "page load failed: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:37589/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "played for 31s, placed 43 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { - "pieces_placed": 0, - "lines_cleared": 0, + "pieces_placed": 43, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 31, "errors_during_play": 0 }, 
"session": { - "frames": 0, - "events_count": 0, - "pieces_spawned": 0, - "pieces_locked": 0, - "lines_cleared": 0, - "piece_types_seen": [], - "grid_read_success_rate": 0 + "frames": 295, + "events_count": 11, + "pieces_spawned": 2, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": -1 + "load_time_ms": 40 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -125,13 +125,160 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.81, + "total": 16, + "passed": 13, + "failed": 3, + "report": { + "implementation": { + "renderer": "dom", + "grid_detected": true, + "grid_bounds": { + "x": 375, + "y": 30, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": 
true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "3 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "2 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 38 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 13, + "failed": 3, + "score": 0.81 + }, + "gameplay": { + "pieces_placed": 38, + "lines_cleared": 2, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 301, + "events_count": 8, + "pieces_spawned": 3, + "pieces_locked": 11, + "lines_cleared": 2, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 25 + }, + "accessibility": { + 
"issues": [], + "issue_count": 0, + "pass": true + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.53, + "score": 0.53, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 2, + "vulnerabilities": 0, + "code_smells": 14, + "cognitive_complexity": 182, + "lines_of_code": 936, + "duplication_pct": 26.8, + "tech_debt_minutes": 40, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.25 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no 
shape change detected after rotate key" }, { "name": "all_pieces_rotate", - "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "pass": false, + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected after drop" + "detail": "3 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 20 -> 41" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "2 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", @@ -97,24 +98,35 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 43s, placed 39 pieces, no crashes" + "detail": "played for 30s, placed 38 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 15, - "failed": 1, - "score": 0.94 + "passed": 13, + "failed": 3, + "score": 0.81 }, "gameplay": { - "pieces_placed": 80, - "lines_cleared": 1, + "pieces_placed": 38, + "lines_cleared": 2, "max_score_observed": 0, - "play_duration_seconds": 43, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 301, + "events_count": 8, + "pieces_spawned": 3, + "pieces_locked": 11, + "lines_cleared": 2, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 745 + "load_time_ms": 25 }, "accessibility": { 
"issues": [], diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -127,13 +127,163 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": 
"rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "2 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 280 to 288" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 22 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 22, + "lines_cleared": 1, + "max_score_observed": 288, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 343, + "events_count": 9, + "pieces_spawned": 2, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 70 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.595, + "score": 0.595, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 2, + "vulnerabilities": 0, + "code_smells": 15, + "cognitive_complexity": 184, + "lines_of_code": 1132, + "duplication_pct": 33.1, + "tech_debt_minutes": 24, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.25 } } \ No newline at end of file diff 
--git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "click_canvas", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": false, - "detail": "piece did not move in 5 seconds" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells 
persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "2 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 20 -> 38" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "1 line(s) cleared during AI play" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "score did not increase: [190] -> no change after polling" + "pass": true, + "detail": "score changed from 280 to 288" }, { "name": "game_over", @@ -96,29 +97,43 @@ }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 22 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 13, - "failed": 3, - "score": 0.81 + "passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { - "pieces_placed": 41, + "pieces_placed": 22, "lines_cleared": 1, - "max_score_observed": 0, - "play_duration_seconds": 0, + "max_score_observed": 288, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 343, + "events_count": 9, + "pieces_spawned": 2, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 5790 + "load_time_ms": 70 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -124,10 +124,10 @@ }, "gameplay_bot": { "pass": false, - "score": 0.25, + "score": 0.31, "total": 16, - "passed": 4, - "failed": 12, + "passed": 5, + "failed": 11, "report": { "implementation": { "renderer": "dom", @@ -146,7 +146,8 @@ "drop": "Space" }, "start_mechanism": "click_canvas", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -162,89 +163,98 @@ { "name": "auto_drop", "pass": false, - "detail": "piece did not move in 5 seconds" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + 
"detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "exception: page.waitForTimeout: Test timeout of 180000ms exceeded." + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "could not read score element" + "detail": "score stayed at 0" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 4, - "failed": 12, - "score": 0.25 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 60, - "lines_cleared": 0, + "pieces_placed": 20, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 349, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 9051 + "load_time_ms": 24 }, "accessibility": { "issues": [], @@ -253,10 +263,19 @@ } } }, - 
"outcome_score": 0.125, - "score": 0.125, + "outcome_score": 0.405, + "score": 0.405, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 8, + "cognitive_complexity": 105, + "lines_of_code": 764, + "duplication_pct": 0.0, + "tech_debt_minutes": 18, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "click_canvas", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,89 +33,98 @@ { "name": "auto_drop", "pass": false, - "detail": "piece did not move in 5 seconds" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": 
"all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "exception: page.waitForTimeout: Test timeout of 180000ms exceeded." + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "could not read score element" + "detail": "score stayed at 0" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 4, - "failed": 12, - "score": 0.25 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 60, - "lines_cleared": 0, + "pieces_placed": 20, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 349, + "events_count": 2, + 
"pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 9051 + "load_time_ms": 24 }, "accessibility": { "issues": [], diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -123,10 +123,10 @@ }, "gameplay_bot": { "pass": false, - "score": 0.5, + "score": 0.88, "total": 16, - "passed": 8, - "failed": 8, + "passed": 14, + "failed": 2, "report": { "implementation": { "renderer": "canvas", @@ -145,7 +145,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": false + "score_element_found": false, + "grid_confidence": 1 }, "tests": [ { @@ -161,101 +162,124 @@ { "name": "auto_drop", "pass": true, - "detail": "pixels changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" 
}, { "name": "all_pieces_rotate", - "pass": false, - "detail": "could not detect any piece rotations" + "pass": true, + "detail": "rotation observed, piece types seen: [J, unknown]" }, { "name": "hard_drop", - "pass": true, - "detail": "visual change detected after hard drop" + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", - "pass": false, - "detail": "exception: page.screenshot: Timeout 10000ms exceeded.\nCall log:\n\u001b[2m - taking page screenshot\u001b[22m\n\u001b[2m - waiting for fonts to load...\u001b[22m\n\u001b[2m - fonts loaded\u001b[22m\n" + "pass": true, + "detail": "10 piece(s) locked during play" }, { "name": "new_piece_spawns", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "5 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "10 pieces placed during play session" }, { "name": "line_clear", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "exception: page.evaluate: Target page, context or browser has been closed" + "detail": "no score element found" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 33 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 8, - "failed": 8, - "score": 0.5 + "passed": 14, + "failed": 2, + "score": 
0.88 }, "gameplay": { - "pieces_placed": 60, - "lines_cleared": 0, + "pieces_placed": 33, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 319, + "events_count": 7, + "pieces_spawned": 5, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [ + "J", + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 2815 + "load_time_ms": 34 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role" + ], + "issue_count": 1, + "pass": false } } }, - "outcome_score": 0.25, - "score": 0.25, + "outcome_score": 0.715, + "score": 0.715, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 7, + "cognitive_complexity": 166, + "lines_of_code": 1039, + "duplication_pct": 28.5, + "tech_debt_minutes": 32, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.55 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": false + "score_element_found": false, + "grid_confidence": 1 }, "tests": [ { @@ -32,93 +33,107 @@ { "name": "auto_drop", "pass": true, - "detail": "pixels changed after 5s with no input" + "detail": "grid state changed after 5s 
with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", - "pass": false, - "detail": "could not detect any piece rotations" + "pass": true, + "detail": "rotation observed, piece types seen: [J, unknown]" }, { "name": "hard_drop", - "pass": true, - "detail": "visual change detected after hard drop" + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", - "pass": false, - "detail": "exception: page.screenshot: Timeout 10000ms exceeded.\nCall log:\n\u001b[2m - taking page screenshot\u001b[22m\n\u001b[2m - waiting for fonts to load...\u001b[22m\n\u001b[2m - fonts loaded\u001b[22m\n" + "pass": true, + "detail": "10 piece(s) locked during play" }, { "name": "new_piece_spawns", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "5 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "10 pieces placed during play session" }, { "name": "line_clear", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", 
"pass": false, - "detail": "exception: page.evaluate: Target page, context or browser has been closed" + "detail": "no score element found" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 33 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 8, - "failed": 8, - "score": 0.5 + "passed": 14, + "failed": 2, + "score": 0.88 }, "gameplay": { - "pieces_placed": 60, - "lines_cleared": 0, + "pieces_placed": 33, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 319, + "events_count": 7, + "pieces_spawned": 5, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [ + "J", + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 2815 + "load_time_ms": 34 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role" + ], + "issue_count": 1, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -124,13 +124,161 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - 
"error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 240, + "height": 400 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + 
"detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 344, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 36 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.405, + "score": 0.405, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 8, + "cognitive_complexity": 105, + "lines_of_code": 764, + "duplication_pct": 0.0, + "tech_debt_minutes": 18, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "click_canvas", - "score_element_found": true + "score_element_found": 
true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": false, - "detail": "piece did not move in 5 seconds" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", @@ 
-97,24 +98,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 78 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 7, - "failed": 9, - "score": 0.44 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 188, + "pieces_placed": 20, "lines_cleared": 1, "max_score_observed": 0, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 344, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 39 + "load_time_ms": 36 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -123,13 +123,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.81, + "total": 16, + "passed": 13, + "failed": 3, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + 
"pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "1 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": false, + "detail": "could not trigger or detect game over" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 37 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 13, + "failed": 3, + "score": 0.81 + }, + "gameplay": { + "pieces_placed": 37, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 307, + "events_count": 8, + "pieces_spawned": 1, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + 
"unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 56 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role" + ], + "issue_count": 1, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.755, + "score": 0.755, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 9, + "cognitive_complexity": 93, + "lines_of_code": 641, + "duplication_pct": 6.0, + "tech_debt_minutes": 39, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.7 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", - "pass": false, - "detail": "exception: page.screenshot: Timeout 10000ms exceeded.\nCall log:\n\u001b[2m - taking page screenshot\u001b[22m\n\u001b[2m - waiting for fonts to load...\u001b[22m\n" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no 
grid change detected after key press" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "1 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 16 -> 42" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "20 line(s) cleared during AI play" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", @@ -96,29 +97,42 @@ }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 37 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 12, - "failed": 4, - "score": 0.75 + "passed": 13, + "failed": 3, + "score": 0.81 }, "gameplay": { - "pieces_placed": 30, - "lines_cleared": 20, + "pieces_placed": 37, + "lines_cleared": 1, "max_score_observed": 0, - 
"play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 307, + "events_count": 8, + "pieces_spawned": 1, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 1053 + "load_time_ms": 56 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role" + ], + "issue_count": 1, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -26,12 +26,78 @@ "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-4v0qdx8e/loop-bench-ktrmyvlb', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 131616, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-4v0qdx8e/loop-bench-ktrmyvlb', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 14, + "code": 3, + "docs": 7, + "unnecessary": 1, + "unnecessary_list": [ + "README.md" + ] + }, + "lines_of_code": 1308, + "dependencies": { + "production": 0, + "dev": 5, + "total": 5 + }, 
+ "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 22, + "excessive": true + }, + "function_length": { + "count": 52, + "average": 7.1, + "max": 30, + "long_functions": 0 + }, + "max_nesting_depth": 12, + "global_declarations": 0, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 629, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 32, + "source_lines": 1083, + "ratio_pct": 3.0 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 2, + "files_with_both": 2 + }, + "html_validation": { + "valid": false, + "errors": 2 + }, + "duplication_percentage": 0.0, + "score": 0.75 }, "transcript_analysis": { "total_events": 120, @@ -57,13 +123,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": false, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid 
state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "3 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "no score element found" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 39 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 39, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 296, + "events_count": 11, + "pieces_spawned": 3, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 38 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role" + ], + "issue_count": 1, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.755, + "score": 0.755, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 6, + "cognitive_complexity": 130, + 
"lines_of_code": 1187, + "duplication_pct": 35.2, + "tech_debt_minutes": 20, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.57 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -12,11 +12,12 @@ "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "x", + "rotate": "ArrowUp", "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": false + "score_element_found": false, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other, unknown] failed: [] (tested 2 piece types in 60 
attempts)" + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "3 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 20 -> 22" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "9 line(s) cleared during AI play" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "no score element found and no number changed" + "detail": "no score element found" }, { "name": "game_over", @@ -96,29 +97,42 @@ }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 39 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 14, - "failed": 2, - "score": 0.88 + "passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { - "pieces_placed": 42, - "lines_cleared": 9, + "pieces_placed": 39, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 296, + "events_count": 11, + "pieces_spawned": 3, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 219 + "load_time_ms": 38 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role" + ], + "issue_count": 
1, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -26,12 +26,80 @@ "score": 0.75 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-g_6vs4uj/loop-bench-v_scnslz', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": false, + "error": "no tsconfig.json" + }, + "performance": { + "pass": true, + "bundle_size_bytes": 94776, + "size_under_512kb": true + }, + "score": 0.67 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-g_6vs4uj/loop-bench-v_scnslz', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 12, + "code": 4, + "docs": 6, + "unnecessary": 2, + "unnecessary_list": [ + "server.js", + "README.md" + ] + }, + "lines_of_code": 984, + "dependencies": { + "production": 0, + "dev": 5, + "total": 5 + }, + "complexity": "moderate", + "console_logs": 2, + "magic_numbers": { + "count": 17, + "excessive": false + }, + "function_length": { + "count": 54, + "average": 5.1, + "max": 12, + "long_functions": 0 + }, + "max_nesting_depth": 10, + "global_declarations": 20, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 311, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + 
"has_error_handling": false + }, + "comments": { + "comment_lines": 32, + "source_lines": 711, + "ratio_pct": 4.5 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 2, + "files_with_both": 2 + }, + "html_validation": { + "valid": false, + "errors": 2 + }, + "duplication_percentage": 0.0, + "score": 0.66 }, "transcript_analysis": { "total_events": 95, @@ -57,13 +125,160 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.5, + "total": 16, + "passed": 8, + "failed": 8, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": true, + 
"detail": "10 piece(s) locked during play" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "1 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "10 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 8, + "failed": 8, + "score": 0.5 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 341, + "events_count": 2, + "pieces_spawned": 1, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 28 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role" + ], + "issue_count": 1, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.525, + "score": 0.525, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 11, + "cognitive_complexity": 141, + "lines_of_code": 873, + "duplication_pct": 27.8, + "tech_debt_minutes": 51, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.55 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "click_canvas", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,93 +33,104 @@ { "name": "auto_drop", "pass": false, - "detail": "piece did not move in 5 seconds" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "exception: page.waitForTimeout: Target page, context or browser has been closed" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "exception: page.screenshot: Target page, context or browser has been closed" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "10 piece(s) locked during play" }, { "name": "new_piece_spawns", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + 
"pass": true, + "detail": "1 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "10 pieces placed during play session" }, { "name": "line_clear", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "could not read score element" + "detail": "score stayed at 0" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 4, - "failed": 12, - "score": 0.25 + "passed": 8, + "failed": 8, + "score": 0.5 }, "gameplay": { - "pieces_placed": 50, - "lines_cleared": 0, + "pieces_placed": 20, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 341, + "events_count": 2, + "pieces_spawned": 1, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 1117 + "load_time_ms": 28 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role" + ], + "issue_count": 1, + "pass": false } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run1/eval_results.json @@ -43,22 +43,242 @@ "score": 0.67 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-u6jryf6_/loop-bench-fx9t030l', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 11, + "code": 3, + "docs": 5, + "unnecessary": 1, + "unnecessary_list": [ + "README.md" + ] + }, + "lines_of_code": 1145, + "dependencies": { + "production": 0, + "dev": 5, + "total": 5 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 12, + "excessive": false + }, + "function_length": { + "count": 50, + "average": 7.5, + "max": 30, + "long_functions": 0 + }, + "max_nesting_depth": 12, + "global_declarations": 12, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 748, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 26, + "source_lines": 823, + "ratio_pct": 3.2 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 2, + "files_with_both": 2 + }, + "html_validation": { + "valid": false, + "errors": 2 + }, + "duplication_percentage": 0.0, + "score": 0.8 }, "transcript_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/transcript-analysis.py', 
'/root/loop-benchmarking/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run1']' timed out after 30 seconds", - "score": 0 + "total_events": 110, + "tool_calls": { + "total": 28, + "bash": 21, + "write": 0, + "edit": 3, + "read": 4 + }, + "wasted_turns": { + "total": 18, + "docs": 3, + "ascii_art": 5, + "server_starts": 10 + }, + "errors_encountered": 0, + "thinking_blocks": 29, + "text_blocks": 21, + "productivity_ratio": 0.36, + "self_tested": false, + "score": 0.75 }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.5, + "total": 16, + "passed": 8, + "failed": 8, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + 
"detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "10 piece(s) locked during play" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "1 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "10 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 22 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 8, + "failed": 8, + "score": 0.5 + }, + "gameplay": { + "pieces_placed": 22, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 336, + "events_count": 2, + "pieces_spawned": 1, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 26 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.375, + "score": 0.375, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 2, + "vulnerabilities": 0, + "code_smells": 13, + "cognitive_complexity": 162, + "lines_of_code": 1022, + "duplication_pct": 37.4, + "tech_debt_minutes": 40, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.25 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -1,8 +1,13 @@ { "implementation": { - "renderer": "unknown", - "grid_detected": false, - "grid_bounds": null, + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, "controls": { "left": "ArrowLeft", "right": "ArrowRight", @@ -10,8 +15,9 @@ "rotate": "ArrowUp", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false + "start_mechanism": "click_canvas", + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -21,99 +27,111 @@ }, { "name": "game_starts", - "pass": false, - "detail": "exception: page.screenshot: Timeout 10000ms exceeded.\nCall log:\n\u001b[2m - taking page screenshot\u001b[22m\n\u001b[2m - waiting for fonts to load...\u001b[22m\n" + "pass": true, + "detail": "started via click_canvas" }, { "name": "auto_drop", "pass": false, - "detail": "piece did not move in 5 seconds" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_down", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": 
"rotate", "pass": false, - "detail": "no change detected after rotate key" + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", - "pass": false, - "detail": "could not verify piece locking at bottom" + "pass": true, + "detail": "10 piece(s) locked during play" }, { "name": "new_piece_spawns", - "pass": false, - "detail": "could not detect new piece at top" + "pass": true, + "detail": "1 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", - "pass": false, - "detail": "exception: page.screenshot: Target page, context or browser has been closed" + "pass": true, + "detail": "10 pieces placed during play session" }, { "name": "line_clear", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "exception: page.evaluate: Target page, context or browser has been closed" + "detail": "score stayed at 0" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 22 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 1, - "failed": 15, - "score": 0.06 + "passed": 8, + "failed": 8, + "score": 0.5 }, "gameplay": { - "pieces_placed": 70, - "lines_cleared": 0, + "pieces_placed": 22, + "lines_cleared": 
1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 336, + "events_count": 2, + "pieces_spawned": 1, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 1237 + "load_time_ms": 26 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run2/eval_results.json @@ -43,22 +43,235 @@ "score": 0.67 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-23oypf3d/loop-bench-v114tywh', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 15, + "code": 3, + "docs": 6, + "unnecessary": 2, + "unnecessary_list": [ + "QUICK_START.txt", + "README.md" + ] + }, + "lines_of_code": 1138, + "dependencies": { + "production": 0, + "dev": 5, + "total": 5 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 14, + "excessive": false + }, + "function_length": { + "count": 47, + "average": 7.6, + "max": 28, + "long_functions": 0 + }, + "max_nesting_depth": 7, + "global_declarations": 8, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + 
"camel_case": 564, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 60, + "source_lines": 766, + "ratio_pct": 7.8 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 2, + "files_with_both": 2 + }, + "html_validation": { + "valid": false, + "errors": 5 + }, + "duplication_percentage": 0.0, + "score": 0.75 }, "transcript_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/transcript-analysis.py', '/root/loop-benchmarking/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run2']' timed out after 30 seconds", - "score": 0 + "total_events": 124, + "tool_calls": { + "total": 35, + "bash": 32, + "write": 0, + "edit": 2, + "read": 1 + }, + "wasted_turns": { + "total": 14, + "docs": 6, + "ascii_art": 3, + "server_starts": 5 + }, + "errors_encountered": 0, + "thinking_blocks": 36, + "text_blocks": 14, + "productivity_ratio": 0.6, + "self_tested": false, + "score": 0.75 }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.19, + "total": 16, + "passed": 3, + "failed": 13, + "report": { + "implementation": { + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + 
"detail": "grid reader unreliable, cannot verify auto-drop" + }, + { + "name": "move_left", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_right", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_down", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "rotate", + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify hard drop" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": "score_changes", + "pass": false, + "detail": "no score element found" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": false, + "detail": "only played for 7s" + } + ], + "summary": { + "total": 16, + "passed": 3, + "failed": 13, + "score": 0.19 + }, + "gameplay": { + "pieces_placed": 26, + "lines_cleared": 0, + "max_score_observed": 0, + "play_duration_seconds": 7, + "errors_during_play": 0 + }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, + "performance": { + "load_time_ms": 23 + }, + "accessibility": { + 
"issues": [], + "issue_count": 0, + "pass": true + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.22, + "score": 0.22, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 4, + "vulnerabilities": 0, + "code_smells": 14, + "cognitive_complexity": 174, + "lines_of_code": 931, + "duplication_pct": 57.3, + "tech_debt_minutes": 60, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.25 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -1,13 +1,8 @@ { "implementation": { - "renderer": "canvas", - "grid_detected": true, - "grid_bounds": { - "x": 0, - "y": 0, - "width": 75, - "height": 150 - }, + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, "controls": { "left": "ArrowLeft", "right": "ArrowRight", @@ -16,7 +11,8 @@ "drop": "Space" }, "start_mechanism": "click_canvas", - "score_element_found": true + "score_element_found": false, + "grid_confidence": 0 }, "tests": [ { @@ -32,89 +28,98 @@ { "name": "auto_drop", "pass": false, - "detail": "piece did not move in 5 seconds" + "detail": "grid reader unreliable, cannot verify auto-drop" }, { "name": "move_left", "pass": false, - "detail": "no change detected after key press" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_right", "pass": false, - "detail": "no change detected after key press" + "detail": "grid reader unreliable, cannot verify movement" 
}, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "exception: page.screenshot: Target page, context or browser has been closed" + "detail": "grid reader unreliable, cannot verify hard drop" }, { "name": "piece_locks", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "could not read score element" + "detail": "no score element found" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", "pass": false, - "detail": "exception: keyboard.press: Target page, context or 
browser has been closed" + "detail": "only played for 7s" } ], "summary": { "total": 16, - "passed": 4, - "failed": 12, - "score": 0.25 + "passed": 3, + "failed": 13, + "score": 0.19 }, "gameplay": { - "pieces_placed": 46, + "pieces_placed": 26, "lines_cleared": 0, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 7, "errors_during_play": 0 }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, "performance": { - "load_time_ms": 39 + "load_time_ms": 23 }, "accessibility": { "issues": [], diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run3/eval_results.json @@ -43,22 +43,235 @@ "score": 0.67 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-oyxhf1b6/loop-bench-n9n73btm', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 20, + "code": 4, + "docs": 8, + "unnecessary": 2, + "unnecessary_list": [ + "QUICK_START.txt", + "README.md" + ] + }, + "lines_of_code": 1157, + "dependencies": { + "production": 0, + "dev": 5, + "total": 5 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 10, + "excessive": false + }, + "function_length": { + "count": 55, + "average": 4.9, + "max": 17, + "long_functions": 0 + }, + "max_nesting_depth": 14, + "global_declarations": 2, + "naming": { + 
"dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 520, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 20, + "source_lines": 813, + "ratio_pct": 2.5 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 3, + "files_with_logic": 3, + "files_with_both": 3 + }, + "html_validation": { + "valid": true, + "errors": 0 + }, + "duplication_percentage": 0.0, + "score": 0.75 }, "transcript_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/transcript-analysis.py', '/root/loop-benchmarking/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run3']' timed out after 30 seconds", - "score": 0 + "total_events": 134, + "tool_calls": { + "total": 36, + "bash": 32, + "write": 0, + "edit": 2, + "read": 2 + }, + "wasted_turns": { + "total": 13, + "docs": 3, + "ascii_art": 5, + "server_starts": 5 + }, + "errors_encountered": 0, + "thinking_blocks": 37, + "text_blocks": 21, + "productivity_ratio": 0.64, + "self_tested": false, + "score": 0.75 }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.19, + "total": 16, + "passed": 3, + "failed": 13, + "report": { + "implementation": { + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via 
click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify auto-drop" + }, + { + "name": "move_left", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_right", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_down", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "rotate", + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify hard drop" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": "score_changes", + "pass": false, + "detail": "no score element found" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": false, + "detail": "only played for 7s" + } + ], + "summary": { + "total": 16, + "passed": 3, + "failed": 13, + "score": 0.19 + }, + "gameplay": { + "pieces_placed": 26, + "lines_cleared": 0, + "max_score_observed": 0, + "play_duration_seconds": 7, + "errors_during_play": 0 + }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, + 
"performance": { + "load_time_ms": 31 + }, + "accessibility": { + "issues": [], + "issue_count": 0, + "pass": true + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.47, + "score": 0.47, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 8, + "cognitive_complexity": 94, + "lines_of_code": 593, + "duplication_pct": 0.0, + "tech_debt_minutes": 29, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.75 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -10,112 +10,120 @@ "rotate": "ArrowUp", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "1 console error(s): Unexpected token '<'" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "started via click_canvas" }, { "name": "auto_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify auto-drop" }, { "name": "move_left", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_right", "pass": false, - "detail": "skipped: page 
did not load" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_down", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify rotation" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify hard drop" }, { "name": "piece_locks", "pass": false, - "detail": "skipped: page did not load" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "skipped: page did not load" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "skipped: page did not load" + "detail": "no score element found" }, { "name": "game_over", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", "pass": false, - "detail": "skipped: page did not load" + "detail": "only played for 7s" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 3, + "failed": 13, + "score": 0.19 }, "gameplay": { - "pieces_placed": 0, + "pieces_placed": 26, "lines_cleared": 0, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 7, "errors_during_play": 0 
}, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, "performance": { - "load_time_ms": 1915 + "load_time_ms": 31 }, "accessibility": { - "issues": [ - "canvas without aria-label or role" - ], - "issue_count": 1, - "pass": false + "issues": [], + "issue_count": 0, + "pass": true } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run1/eval_results.json @@ -1,7 +1,29 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-2crbjsnw/loop-bench-npum8frs', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { "lint": { @@ -20,8 +42,63 @@ "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-2crbjsnw/loop-bench-npum8frs', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 42, + "code": 20, + "docs": 7, + 
"unnecessary": 2, + "unnecessary_list": [ + "FEATURES.md", + "README.md" + ] + }, + "lines_of_code": 2098, + "dependencies": { + "production": 0, + "dev": 6, + "total": 6 + }, + "complexity": "over-engineered", + "console_logs": 0, + "magic_numbers": { + "count": 36, + "excessive": true + }, + "function_length": { + "count": 125, + "average": 6.1, + "max": 34, + "long_functions": 0 + }, + "max_nesting_depth": 10, + "global_declarations": 14, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 98.8, + "camel_case": 1165, + "snake_case": 14 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 50, + "source_lines": 1736, + "ratio_pct": 2.9 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 6, + "files_with_logic": 10, + "files_with_both": 5 + }, + "html_validation": { + "valid": true, + "errors": 0 + }, + "duplication_percentage": 0.0, + "score": 0.6 }, "transcript_analysis": { "total_events": 152, @@ -46,14 +123,165 @@ "score": 0.75 }, "gameplay_bot": { - "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "pass": true, + "score": 1, + "total": 16, + "passed": 16, + "failed": 0, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key 
press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 1 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "5 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 49 to 53" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 25 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 16, + "failed": 0, + "score": 1 + }, + "gameplay": { + "pieces_placed": 25, + "lines_cleared": 1, + "max_score_observed": 53, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 329, + "events_count": 9, + "pieces_spawned": 5, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "O" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 135 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + 
"pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.9, + "score": 0.9, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 21, + "cognitive_complexity": 49, + "lines_of_code": 741, + "duplication_pct": 0.0, + "tech_debt_minutes": 75, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.8 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run1/gameplay-bot-report.json @@ -1,17 +1,23 @@ { "implementation": { - "renderer": "unknown", - "grid_detected": false, - "grid_bounds": null, + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, "controls": { "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "ArrowUp", + "rotate": "z", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -21,99 +27,114 @@ }, { "name": "game_starts", - "pass": false, - "detail": "exception: page.screenshot: Timeout 10000ms exceeded.\nCall log:\n\u001b[2m - taking page screenshot\u001b[22m\n\u001b[2m - waiting for fonts to load...\u001b[22m\n" + "pass": true, + "detail": "started via auto" }, { "name": "auto_drop", "pass": true, - "detail": "pixels changed after 5s with no input" + "detail": "grid state changed after 5s with no input 
(grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", - "pass": false, - "detail": "exception: page.waitForTimeout: Test timeout of 180000ms exceeded." + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", - "pass": false, - "detail": "exception: page.screenshot: Target page, context or browser has been closed" + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 1 lock event(s))" }, { "name": "new_piece_spawns", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "5 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "11 pieces placed during play session" }, { "name": "line_clear", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": 
"exception: page.evaluate: Target page, context or browser has been closed" + "pass": true, + "detail": "score changed from 49 to 53" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 25 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 6, - "failed": 10, - "score": 0.38 + "passed": 16, + "failed": 0, + "score": 1 }, "gameplay": { - "pieces_placed": 28, - "lines_cleared": 0, - "max_score_observed": 0, - "play_duration_seconds": 0, + "pieces_placed": 25, + "lines_cleared": 1, + "max_score_observed": 53, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 329, + "events_count": 9, + "pieces_spawned": 5, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "O" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 979 + "load_time_ms": 135 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run2/eval_results.json @@ -1,7 +1,29 
@@ { "structural": { "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-h0p8kpvx/loop-bench-5dl5qz_l', 'typescript']' timed out after 120 seconds" + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": false, + "detail": "TypeScript files found but no tsconfig.json" + } + ], + "score": 0.75 }, "quality": { "lint": { @@ -21,8 +43,64 @@ "score": 0.67 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-h0p8kpvx/loop-bench-5dl5qz_l', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 11, + "code": 4, + "docs": 5, + "unnecessary": 3, + "unnecessary_list": [ + "FEATURES.md", + "server.js", + "README.md" + ] + }, + "lines_of_code": 1092, + "dependencies": { + "production": 0, + "dev": 5, + "total": 5 + }, + "complexity": "moderate", + "console_logs": 1, + "magic_numbers": { + "count": 18, + "excessive": false + }, + "function_length": { + "count": 52, + "average": 6.6, + "max": 28, + "long_functions": 0 + }, + "max_nesting_depth": 12, + "global_declarations": 17, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 457, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 37, + "source_lines": 749, + "ratio_pct": 4.9 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 3, + "files_with_both": 2 + }, + "html_validation": { + "valid": false, + "errors": 3 + }, + "duplication_percentage": 0.0, + "score": 0.58 }, "transcript_analysis": { 
"total_events": 94, @@ -48,13 +126,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "1 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 
line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 38 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 38, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 297, + "events_count": 11, + "pieces_spawned": 1, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 98 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role" + ], + "issue_count": 1, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.745, + "score": 0.745, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 7, + "cognitive_complexity": 168, + "lines_of_code": 963, + "duplication_pct": 34.9, + "tech_debt_minutes": 38, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.55 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run2/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": 
"auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,93 +33,106 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "1 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 16 -> 46" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change 
after polling" + "detail": "score stayed at 0" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 38 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 13, - "failed": 3, - "score": 0.81 + "passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { - "pieces_placed": 51, + "pieces_placed": 38, "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 297, + "events_count": 11, + "pieces_spawned": 1, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 747 + "load_time_ms": 98 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role" + ], + "issue_count": 1, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run3/eval_results.json @@ -1,7 +1,29 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-bdk2kxg3/loop-bench-opdim046', 'typescript']' 
timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { "lint": { @@ -20,8 +42,62 @@ "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-bdk2kxg3/loop-bench-opdim046', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 18, + "code": 5, + "docs": 7, + "unnecessary": 1, + "unnecessary_list": [ + "README.md" + ] + }, + "lines_of_code": 1363, + "dependencies": { + "production": 0, + "dev": 3, + "total": 3 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 10, + "excessive": false + }, + "function_length": { + "count": 63, + "average": 5.3, + "max": 15, + "long_functions": 0 + }, + "max_nesting_depth": 10, + "global_declarations": 10, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 573, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 40, + "source_lines": 968, + "ratio_pct": 4.1 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 3, + "files_with_logic": 3, + "files_with_both": 3 + }, + "html_validation": { + "valid": false, + "errors": 8 + }, + "duplication_percentage": 0.0, + "score": 0.8 }, "transcript_analysis": { "total_events": 120, @@ -47,13 +123,160 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + 
"implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + 
{ + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 346, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 91 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role" + ], + "issue_count": 1, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.405, + "score": 0.405, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 8, + "cognitive_complexity": 105, + "lines_of_code": 691, + "duplication_pct": 0.0, + "tech_debt_minutes": 21, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "click_canvas", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": false, - "detail": "piece did not move in 5 seconds" + "detail": "piece did not 
move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_down", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "rotate", "pass": false, - "detail": "no change detected after rotate key" + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", @@ -97,7 +98,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 79 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { @@ -107,14 +108,23 @@ 
"score": 0.31 }, "gameplay": { - "pieces_placed": 189, + "pieces_placed": 20, "lines_cleared": 1, "max_score_observed": 0, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 346, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 40 + "load_time_ms": 91 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/eval_results.json @@ -1,7 +1,29 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-33yz4i_u/loop-bench-jvbn5i1r', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "dist/index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { "lint": { @@ -20,8 +42,64 @@ "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-33yz4i_u/loop-bench-jvbn5i1r', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 19, + "code": 7, 
+ "docs": 9, + "unnecessary": 3, + "unnecessary_list": [ + "FEATURES.md", + "server.js", + "README.md" + ] + }, + "lines_of_code": 1444, + "dependencies": { + "production": 0, + "dev": 5, + "total": 5 + }, + "complexity": "over-engineered", + "console_logs": 2, + "magic_numbers": { + "count": 13, + "excessive": false + }, + "function_length": { + "count": 52, + "average": 5.2, + "max": 15, + "long_functions": 0 + }, + "max_nesting_depth": 12, + "global_declarations": 15, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 393, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 20, + "source_lines": 784, + "ratio_pct": 2.6 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 2, + "files_with_both": 2 + }, + "html_validation": { + "valid": false, + "errors": 3 + }, + "duplication_percentage": 0.0, + "score": 0.46 }, "transcript_analysis": { "total_events": 107, @@ -47,13 +125,153 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.19, + "total": 16, + "passed": 3, + "failed": 13, + "report": { + "implementation": { + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify auto-drop" + }, + { + "name": "move_left", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + 
"name": "move_right", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_down", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "rotate", + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify hard drop" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": "score_changes", + "pass": false, + "detail": "no score element found" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": false, + "detail": "only played for 7s" + } + ], + "summary": { + "total": 16, + "passed": 3, + "failed": 13, + "score": 0.19 + }, + "gameplay": { + "pieces_placed": 26, + "lines_cleared": 0, + "max_score_observed": 0, + "play_duration_seconds": 7, + "errors_during_play": 0 + }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, + "performance": { + "load_time_ms": 52 + }, + "accessibility": { + "issues": [], + "issue_count": 0, + "pass": true + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.47, + "score": 0.47, "sonarqube": { - "pass": false, - 
"error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 9, + "cognitive_complexity": 57, + "lines_of_code": 629, + "duplication_pct": 0.0, + "tech_debt_minutes": 18, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.75 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/gameplay-bot-report.json @@ -1,13 +1,8 @@ { "implementation": { "renderer": "unknown", - "grid_detected": true, - "grid_bounds": { - "x": 0, - "y": 0, - "width": 75, - "height": 150 - }, + "grid_detected": false, + "grid_bounds": null, "controls": { "left": "ArrowLeft", "right": "ArrowRight", @@ -15,8 +10,9 @@ "rotate": "ArrowUp", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": true + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 }, "tests": [ { @@ -26,95 +22,104 @@ }, { "name": "game_starts", - "pass": false, - "detail": "could not start game with any mechanism" + "pass": true, + "detail": "started via click_canvas" }, { "name": "auto_drop", "pass": false, - "detail": "piece did not move in 5 seconds" + "detail": "grid reader unreliable, cannot verify auto-drop" }, { "name": "move_left", "pass": false, - "detail": "no change detected after key press" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_right", "pass": false, - "detail": "no change detected after key press" + "detail": "grid reader unreliable, cannot verify 
movement" }, { "name": "move_down", "pass": false, - "detail": "no change detected after key press" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "rotate", "pass": false, - "detail": "no change detected after rotate key" + "detail": "grid reader unreliable, cannot verify rotation" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "grid reader unreliable, cannot verify hard drop" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "could not read score element" + "detail": "no score element found" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "only played for 7s" } ], "summary": { "total": 16, - "passed": 1, - "failed": 15, - "score": 0.06 + "passed": 3, + "failed": 13, + "score": 0.19 }, "gameplay": { - "pieces_placed": 
70, + "pieces_placed": 26, "lines_cleared": 0, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 7, "errors_during_play": 0 }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, "performance": { - "load_time_ms": 79 + "load_time_ms": 52 }, "accessibility": { "issues": [], diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run2/eval_results.json @@ -26,12 +26,79 @@ "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-0x3y75v_/loop-bench-tirri_jo', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 12744, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-0x3y75v_/loop-bench-tirri_jo', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 15, + "code": 4, + "docs": 7, + "unnecessary": 2, + "unnecessary_list": [ + "server.js", + "README.md" + ] + }, + "lines_of_code": 1098, + "dependencies": { + "production": 0, + "dev": 5, + "total": 5 + }, + "complexity": "moderate", + "console_logs": 2, + "magic_numbers": { + "count": 16, + 
"excessive": false + }, + "function_length": { + "count": 45, + "average": 8.0, + "max": 35, + "long_functions": 0 + }, + "max_nesting_depth": 11, + "global_declarations": 10, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 396, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 34, + "source_lines": 744, + "ratio_pct": 4.6 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 2, + "files_with_both": 2 + }, + "html_validation": { + "valid": true, + "errors": 0 + }, + "duplication_percentage": 0.0, + "score": 0.71 }, "transcript_analysis": { "total_events": 134, @@ -57,10 +124,10 @@ }, "gameplay_bot": { "pass": false, - "score": 0.94, + "score": 0.88, "total": 16, - "passed": 15, - "failed": 1, + "passed": 14, + "failed": 2, "report": { "implementation": { "renderer": "canvas", @@ -79,7 +146,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -95,62 +163,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": 
true, - "detail": "rotated: [unknown] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "2 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 28 -> 47" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", - "pass": true, - "detail": "line cleared via strategic placement" + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", @@ -160,24 +228,35 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 29s, placed 100 pieces, no crashes" + "detail": "played for 30s, placed 39 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 15, - "failed": 1, - "score": 0.94 + "passed": 14, + "failed": 2, + "score": 0.88 }, "gameplay": { - "pieces_placed": 154, - "lines_cleared": 1, + "pieces_placed": 39, + "lines_cleared": 0, "max_score_observed": 0, - "play_duration_seconds": 29, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 297, + "events_count": 10, + "pieces_spawned": 2, + "pieces_locked": 11, + "lines_cleared": 0, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": -1 + "load_time_ms": 35 }, "accessibility": { "issues": [ @@ -189,10 +268,19 @@ } } }, - 
"outcome_score": 0.94, - "score": 0.94, + "outcome_score": 0.815, + "score": 0.815, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 9, + "cognitive_complexity": 57, + "lines_of_code": 629, + "duplication_pct": 0.0, + "tech_debt_minutes": 18, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.75 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run2/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": 
"all_pieces_rotate", "pass": true, - "detail": "rotated: [unknown] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "2 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 28 -> 47" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", - "pass": true, - "detail": "line cleared via strategic placement" + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", @@ -97,24 +98,35 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 29s, placed 100 pieces, no crashes" + "detail": "played for 30s, placed 39 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 15, - "failed": 1, - "score": 0.94 + "passed": 14, + "failed": 2, + "score": 0.88 }, "gameplay": { - "pieces_placed": 154, - "lines_cleared": 1, + "pieces_placed": 39, + "lines_cleared": 0, "max_score_observed": 0, - "play_duration_seconds": 29, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 297, + "events_count": 10, + "pieces_spawned": 2, + "pieces_locked": 11, + "lines_cleared": 0, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": -1 + "load_time_ms": 35 }, "accessibility": { "issues": [ diff 
--git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -1,7 +1,29 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-me5_pdo3/loop-bench-sdqv7jk5', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { "lint": { @@ -20,8 +42,63 @@ "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-me5_pdo3/loop-bench-sdqv7jk5', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 38, + "code": 20, + "docs": 9, + "unnecessary": 2, + "unnecessary_list": [ + "server.js", + "README.md" + ] + }, + "lines_of_code": 1642, + "dependencies": { + "production": 0, + "dev": 5, + "total": 5 + }, + "complexity": "over-engineered", + "console_logs": 2, + "magic_numbers": { + "count": 19, + "excessive": false + }, + "function_length": { + "count": 75, + "average": 6.1, + "max": 35, + "long_functions": 0 + }, + "max_nesting_depth": 10, + "global_declarations": 29, + 
"naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 685, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 42, + "source_lines": 1296, + "ratio_pct": 3.2 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 5, + "files_with_logic": 9, + "files_with_both": 4 + }, + "html_validation": { + "valid": false, + "errors": 1 + }, + "duplication_percentage": 0.0, + "score": 0.56 }, "transcript_analysis": { "total_events": 123, @@ -47,139 +124,22 @@ }, "gameplay_bot": { "pass": false, - "score": 0.13, - "total": 16, - "passed": 2, - "failed": 14, - "report": { - "implementation": { - "renderer": "unknown", - "grid_detected": true, - "grid_bounds": { - "x": 0, - "y": 0, - "width": 75, - "height": 150 - }, - "controls": { - "left": "ArrowLeft", - "right": "ArrowRight", - "down": "ArrowDown", - "rotate": "ArrowUp", - "drop": "Space" - }, - "start_mechanism": "unknown", - "score_element_found": false - }, - "tests": [ - { - "name": "game_loads", - "pass": true, - "detail": "no console errors" - }, - { - "name": "game_starts", - "pass": false, - "detail": "could not start game with any mechanism" - }, - { - "name": "auto_drop", - "pass": false, - "detail": "piece did not move in 5 seconds" - }, - { - "name": "move_left", - "pass": false, - "detail": "no change detected after key press" - }, - { - "name": "move_right", - "pass": false, - "detail": "no change detected after key press" - }, - { - "name": "move_down", - "pass": false, - "detail": "no change detected after key press" - }, - { - "name": "rotate", - "pass": false, - "detail": "no change detected after rotate key" - }, - { - "name": "all_pieces_rotate", - "pass": false, - "detail": "could not detect any piece rotations" - }, - { - "name": "hard_drop", - "pass": false, - "detail": "no change detected after hard drop key" - }, - { - "name": "piece_locks", - "pass": 
false, - "detail": "could not verify piece locking at bottom" - }, - { - "name": "new_piece_spawns", - "pass": false, - "detail": "could not detect new piece at top" - }, - { - "name": "multiple_pieces", - "pass": false, - "detail": "grid did not accumulate filled cells" - }, - { - "name": "line_clear", - "pass": true, - "detail": "line cleared via strategic placement" - }, - { - "name": "score_changes", - "pass": false, - "detail": "score did not increase: [0] -> no change after polling" - }, - { - "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" - }, - { - "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" - } - ], - "summary": { - "total": 16, - "passed": 2, - "failed": 14, - "score": 0.13 - }, - "gameplay": { - "pieces_placed": 110, - "lines_cleared": 1, - "max_score_observed": 0, - "play_duration_seconds": 0, - "errors_during_play": 0 - }, - "performance": { - "load_time_ms": 117 - }, - "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true - } - } + "score": 0, + "error": "Gameplay bot timed out after 180 seconds" }, - "outcome_score": 0.13, - "score": 0.13, + "outcome_score": 0.275, + "score": 0.275, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 12, + "cognitive_complexity": 101, + "lines_of_code": 806, + "duplication_pct": 15.0, + "tech_debt_minutes": 30, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.55 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -43,22 +43,243 @@ "score": 0.67 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-dg9g3bx_/loop-bench-eenai5fl', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 11, + "code": 5, + "docs": 3, + "unnecessary": 2, + "unnecessary_list": [ + "server.js", + "README.md" + ] + }, + "lines_of_code": 1390, + "dependencies": { + "production": 0, + "dev": 5, + "total": 5 + }, + "complexity": "moderate", + "console_logs": 2, + "magic_numbers": { + "count": 15, + "excessive": false + }, + "function_length": { + "count": 50, + "average": 8.1, + "max": 40, + "long_functions": 0 + }, + "max_nesting_depth": 10, + "global_declarations": 16, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 830, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 33, + "source_lines": 921, + "ratio_pct": 3.6 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 2, + "files_with_both": 2 + }, + "html_validation": { + "valid": false, + "errors": 4 + }, + "duplication_percentage": 0.0, + "score": 0.66 }, "transcript_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/transcript-analysis.py', '/root/loop-benchmarking/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2']' timed out after 30 seconds", 
- "score": 0 + "total_events": 69, + "tool_calls": { + "total": 18, + "bash": 13, + "write": 0, + "edit": 0, + "read": 5 + }, + "wasted_turns": { + "total": 6, + "docs": 2, + "ascii_art": 0, + "server_starts": 4 + }, + "errors_encountered": 0, + "thinking_blocks": 19, + "text_blocks": 10, + "productivity_ratio": 0.67, + "self_tested": false, + "score": 0.75 }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 60, + "height": 120 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive 
from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 343, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 101 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.28, + "score": 0.28, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 2, + "vulnerabilities": 0, + "code_smells": 15, + "cognitive_complexity": 128, + "lines_of_code": 1228, + "duplication_pct": 32.5, + "tech_debt_minutes": 35, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.25 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "click_canvas", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": false, - "detail": "piece did not move in 5 seconds" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": 
"grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", @@ -97,28 +98,40 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 77 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 7, - "failed": 9, - "score": 0.44 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 187, + "pieces_placed": 20, "lines_cleared": 1, "max_score_observed": 0, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 343, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 40 + "load_time_ms": 101 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -1,7 +1,29 @@ { "structural": { - "pass": false, - "error": "Command 
'['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-jz1t_adw/loop-bench-ud3ikw5f', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { "lint": { @@ -20,8 +42,62 @@ "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-jz1t_adw/loop-bench-ud3ikw5f', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 12, + "code": 5, + "docs": 2, + "unnecessary": 1, + "unnecessary_list": [ + "README.md" + ] + }, + "lines_of_code": 1207, + "dependencies": { + "production": 0, + "dev": 3, + "total": 3 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 14, + "excessive": false + }, + "function_length": { + "count": 50, + "average": 6.3, + "max": 26, + "long_functions": 0 + }, + "max_nesting_depth": 12, + "global_declarations": 16, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 455, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 31, + "source_lines": 848, + "ratio_pct": 3.7 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 3, + "files_with_logic": 3, + "files_with_both": 3 + }, + "html_validation": { + "valid": false, + "errors": 5 + }, + "duplication_percentage": 0.0, + "score": 0.8 }, "transcript_analysis": { "total_events": 52, @@ -47,13 +123,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": 
"Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "6 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": 
"score stayed at 0" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 37 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 37, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 307, + "events_count": 10, + "pieces_spawned": 6, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 110 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role" + ], + "issue_count": 1, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.77, + "score": 0.77, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 9, + "cognitive_complexity": 86, + "lines_of_code": 614, + "duplication_pct": 0.0, + "tech_debt_minutes": 40, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.6 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ 
-32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [I, other] failed: [] (tested 2 piece types in 60 attempts)" + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "visual change suggests new piece spawned" + "detail": "6 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 20 -> 24" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", @@ -97,7 +98,7 @@ { "name": 
"playable_30s", "pass": true, - "detail": "played for 28s, placed 100 pieces, no crashes" + "detail": "played for 30s, placed 37 pieces, no crashes" } ], "summary": { @@ -107,14 +108,25 @@ "score": 0.94 }, "gameplay": { - "pieces_placed": 210, + "pieces_placed": 37, "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 28, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 307, + "events_count": 10, + "pieces_spawned": 6, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 159 + "load_time_ms": 110 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -1,7 +1,29 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-loagbwpu/loop-bench-xt9_gk40', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { "lint": { @@ -20,8 +42,60 @@ "score": 1.0 }, "code_analysis": { - 
"error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-loagbwpu/loop-bench-xt9_gk40', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 8, + "code": 3, + "docs": 0, + "unnecessary": 0, + "unnecessary_list": [] + }, + "lines_of_code": 895, + "dependencies": { + "production": 0, + "dev": 6, + "total": 6 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 16, + "excessive": false + }, + "function_length": { + "count": 61, + "average": 6.6, + "max": 22, + "long_functions": 0 + }, + "max_nesting_depth": 8, + "global_declarations": 48, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 275, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 37, + "source_lines": 688, + "ratio_pct": 5.4 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 2, + "files_with_both": 2 + }, + "html_validation": { + "valid": false, + "errors": 7 + }, + "duplication_percentage": 0.0, + "score": 0.95 }, "transcript_analysis": { "total_events": 40, @@ -46,14 +120,164 @@ "score": 1.0 }, "gameplay_bot": { - "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "pass": true, + "score": 1, + "total": 16, + "passed": 16, + "failed": 0, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" 
+ }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "3 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "2 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 254 to 364" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 34 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 16, + "failed": 0, + "score": 1 + }, + "gameplay": { + "pieces_placed": 34, + "lines_cleared": 2, + "max_score_observed": 364, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 287, + "events_count": 11, + "pieces_spawned": 3, + "pieces_locked": 11, + "lines_cleared": 2, + "piece_types_seen": [ + "unknown" + ], + 
"grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 35 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.96, + "score": 0.96, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 1, + "cognitive_complexity": 93, + "lines_of_code": 446, + "duplication_pct": 0.0, + "tech_debt_minutes": 5, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.92 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": 
"grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "3 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 20 -> 40" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "2 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "score did not increase: [210] -> no change after polling" + "pass": true, + "detail": "score changed from 254 to 364" }, { "name": "game_over", @@ -97,24 +98,35 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 76 pieces, no crashes" + "detail": "played for 30s, placed 34 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 15, - "failed": 1, - "score": 0.94 + "passed": 16, + "failed": 0, + "score": 1 }, "gameplay": { - "pieces_placed": 127, - "lines_cleared": 1, - "max_score_observed": 158, + "pieces_placed": 34, + "lines_cleared": 2, + "max_score_observed": 364, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 287, + "events_count": 11, + "pieces_spawned": 3, 
+ "pieces_locked": 11, + "lines_cleared": 2, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 22 + "load_time_ms": 35 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -1,7 +1,29 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-de89ue8t/loop-bench-3ur6qash', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { "lint": { @@ -20,8 +42,60 @@ "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-de89ue8t/loop-bench-3ur6qash', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 8, + "code": 3, + "docs": 0, + "unnecessary": 0, + "unnecessary_list": [] + }, + "lines_of_code": 1292, + "dependencies": { + "production": 0, + "dev": 6, + "total": 6 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 30, + 
"excessive": true + }, + "function_length": { + "count": 59, + "average": 7.3, + "max": 27, + "long_functions": 0 + }, + "max_nesting_depth": 12, + "global_declarations": 34, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 435, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 96, + "source_lines": 991, + "ratio_pct": 9.7 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 2, + "files_with_both": 2 + }, + "html_validation": { + "valid": false, + "errors": 8 + }, + "duplication_percentage": 0.0, + "score": 0.85 }, "transcript_analysis": { "total_events": 41, @@ -47,13 +121,163 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 320, + "height": 640 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + 
"detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "3 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 256" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 44 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 44, + "lines_cleared": 1, + "max_score_observed": 256, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 289, + "events_count": 11, + "pieces_spawned": 3, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 94 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.795, + "score": 0.795, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 7, + "cognitive_complexity": 108, + "lines_of_code": 595, + "duplication_pct": 0.0, + 
"tech_debt_minutes": 39, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.65 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -12,11 +12,12 @@ "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "z", + "rotate": "x", "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation confirmed but could not identify individual 
piece types" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "3 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 20 -> 46" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "1 line(s) cleared during AI play" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [132] -> no change after polling" + "detail": "score stayed at 256" }, { "name": "game_over", @@ -97,7 +98,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 78 pieces, no crashes" + "detail": "played for 30s, placed 44 pieces, no crashes" } ], "summary": { @@ -107,14 +108,25 @@ "score": 0.94 }, "gameplay": { - "pieces_placed": 119, - "lines_cleared": 2, - "max_score_observed": 200, + "pieces_placed": 44, + "lines_cleared": 1, + "max_score_observed": 256, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 289, + "events_count": 11, + "pieces_spawned": 3, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 26 + "load_time_ms": 94 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -26,8 +26,20 @@ "score": 0.75 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-kdaj92n_/loop-bench-afsfv6ek', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 17548, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -80,10 +92,10 @@ }, "html_validation": { "valid": false, - "errors": 0 + "errors": 2 }, "duplication_percentage": 0.0, - "score": 0.5 + "score": 0.85 }, "transcript_analysis": { "total_events": 44, @@ -109,13 +121,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 60, + "height": 120 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change 
detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 160" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 160, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 343, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 54 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.55, + 
"score": 0.55, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 2, + "cognitive_complexity": 123, + "lines_of_code": 644, + "duplication_pct": 0.0, + "tech_debt_minutes": 16, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.79 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,63 +32,63 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - 
"detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [128] -> no change after polling" + "detail": "score stayed at 160" }, { "name": "game_over", @@ -97,24 +98,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 78 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 10, - "failed": 6, - "score": 0.63 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 188, + "pieces_placed": 20, "lines_cleared": 1, "max_score_observed": 160, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 343, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 34 + "load_time_ms": 54 }, "accessibility": { "issues": [ diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -26,8 +26,20 @@ "score": 0.75 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-_3aet_ww/loop-bench-ztizwjzj', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 145626, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -109,13 +121,164 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press 
(grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation observed, piece types seen: [S, unknown]" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "5 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "3 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 34 to 160" + }, + { + "name": "game_over", + "pass": false, + "detail": "could not trigger or detect game over" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 41 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 41, + "lines_cleared": 3, + "max_score_observed": 160, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 291, + "events_count": 9, + "pieces_spawned": 5, + "pieces_locked": 11, + "lines_cleared": 3, + "piece_types_seen": [ + "S", + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 67 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + 
} }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.745, + "score": 0.745, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 19, + "cognitive_complexity": 120, + "lines_of_code": 1411, + "duplication_pct": 53.1, + "tech_debt_minutes": 74, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.55 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { 
"name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation observed, piece types seen: [S, unknown]" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "5 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "game still responding after 10 piece drops" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "10 line(s) cleared during AI play" + "detail": "3 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": true, - "detail": "score changed from 0 to 122" + "detail": "score changed from 34 to 160" }, { "name": "game_over", @@ -97,7 +98,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 28s, placed 100 pieces, no crashes" + "detail": "played for 30s, placed 41 pieces, no crashes" } ], "summary": { @@ -107,14 +108,26 @@ "score": 0.94 }, "gameplay": { - "pieces_placed": 141, - "lines_cleared": 46, - "max_score_observed": 0, - "play_duration_seconds": 28, + "pieces_placed": 41, + "lines_cleared": 3, + "max_score_observed": 160, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 291, + "events_count": 9, + "pieces_spawned": 5, + "pieces_locked": 11, + "lines_cleared": 3, + "piece_types_seen": [ + "S", + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 617 + "load_time_ms": 67 }, "accessibility": { "issues": [ diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -26,8 +26,21 @@ "score": 0.75 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-3a9i8rmm/loop-bench-jymd_531', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": false, + "errors": 1 + }, + "performance": { + "pass": true, + "bundle_size_bytes": 15037, + "size_under_512kb": true + }, + "score": 0.67 }, "code_analysis": { "files": { @@ -112,10 +125,19 @@ "score": 0, "error": "Gameplay bot timed out after 180 seconds" }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.175, + "score": 0.175, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 2, + "vulnerabilities": 0, + "code_smells": 7, + "cognitive_complexity": 91, + "lines_of_code": 937, + "duplication_pct": 12.6, + "tech_debt_minutes": 75, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.35 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -1,22 +1,18 @@ { "implementation": { - "renderer": "canvas", - "grid_detected": true, - "grid_bounds": { - "x": 0, - "y": 0, - "width": 320, - "height": 640 - }, + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, "controls": { "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "z", + "rotate": "ArrowUp", "drop": "Space" }, - "start_mechanism": "auto", - "score_element_found": true + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 }, "tests": [ { @@ -27,67 +23,67 @@ { "name": "game_starts", "pass": true, - "detail": "started via auto" + "detail": "started via click_canvas" }, { "name": "auto_drop", - "pass": true, - "detail": "grid state changed after 5s with no input" + "pass": false, + "detail": "grid reader unreliable, cannot verify auto-drop" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_down", "pass": false, - "detail": "no change detected after key press" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "rotated: [] failed: [I] (tested 1 piece types in 60 attempts)" + "detail": "could not detect any piece rotations via grid reader" }, { "name": 
"hard_drop", - "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "pass": false, + "detail": "grid reader unreliable, cannot verify hard drop" }, { "name": "piece_locks", - "pass": true, - "detail": "filled cells persist at bottom" + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", - "pass": true, - "detail": "new piece detected at top of grid" + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", - "pass": true, - "detail": "grid accumulated cells: 20 -> 44" + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": true, - "detail": "2 line(s) cleared during AI play" + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "no score element found" }, { "name": "game_over", @@ -96,32 +92,38 @@ }, { "name": "playable_30s", - "pass": true, - "detail": "played for 30s, placed 78 pieces, no crashes" + "pass": false, + "detail": "only played for 7s" } ], "summary": { "total": 16, - "passed": 13, - "failed": 3, - "score": 0.81 + "passed": 3, + "failed": 13, + "score": 0.19 }, "gameplay": { - "pieces_placed": 119, - "lines_cleared": 4, + "pieces_placed": 26, + "lines_cleared": 0, "max_score_observed": 0, - "play_duration_seconds": 30, + "play_duration_seconds": 7, "errors_during_play": 0 }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, "performance": { - "load_time_ms": 497 + "load_time_ms": 27 }, "accessibility": { - "issues": [ - "canvas without aria-label or role", - "canvas without aria-label or role" - ], - "issue_count": 2, - "pass": false + 
"issues": [], + "issue_count": 0, + "pass": true } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -26,8 +26,21 @@ "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-mzmf8qdj/loop-bench-amrtiyou', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": false, + "errors": 2 + }, + "performance": { + "pass": true, + "bundle_size_bytes": 151804, + "size_under_512kb": true + }, + "score": 0.67 }, "code_analysis": { "files": { @@ -79,11 +92,11 @@ "files_with_both": 2 }, "html_validation": { - "valid": false, + "valid": true, "errors": 0 }, "duplication_percentage": 0.0, - "score": 0.5 + "score": 0.9 }, "transcript_analysis": { "total_events": 56, @@ -109,13 +122,163 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 320, + "height": 640 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, 
+ "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "3 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 280" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 46 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 46, + "lines_cleared": 1, + "max_score_observed": 280, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 279, + "events_count": 11, + "pieces_spawned": 
3, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 21 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.745, + "score": 0.745, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 16, + "cognitive_complexity": 176, + "lines_of_code": 1794, + "duplication_pct": 44.8, + "tech_debt_minutes": 62, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.55 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, 
{ "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "3 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 16 -> 40" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [130] -> no change after polling" + "detail": "score stayed at 280" }, { "name": "game_over", @@ -97,7 +98,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 78 pieces, no crashes" + "detail": "played for 30s, placed 46 pieces, no crashes" } ], "summary": { @@ -107,14 +108,25 @@ "score": 0.94 }, "gameplay": { - "pieces_placed": 129, + "pieces_placed": 46, "lines_cleared": 1, - "max_score_observed": 162, + "max_score_observed": 280, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 279, + "events_count": 11, + "pieces_spawned": 3, + "pieces_locked": 11, + 
"lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 27 + "load_time_ms": 21 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -1,11 +1,45 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-38mshnpy/loop-bench-dzgab_lb', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-38mshnpy/loop-bench-dzgab_lb', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 49197, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -87,13 +121,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 
0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 25, + "height": 50 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 150" + }, + { + "name": "game_over", + "pass": true, + 
"detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 150, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 347, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 25 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.405, + "score": 0.405, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 12, + "cognitive_complexity": 117, + "lines_of_code": 635, + "duplication_pct": 0.0, + "tech_debt_minutes": 44, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ 
-31,63 +32,63 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", "pass": false, - "detail": "no change detected after rotate key" + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [240] -> no change after polling" + "detail": "score stayed at 150" }, { "name": "game_over", @@ 
-97,24 +98,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 77 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 9, - "failed": 7, - "score": 0.56 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 187, + "pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 186, + "max_score_observed": 150, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 347, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 60 + "load_time_ms": 25 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -1,11 +1,45 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-0n45h12w/loop-bench-p92ynomw', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, 
"quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-0n45h12w/loop-bench-p92ynomw', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 48079, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -86,11 +120,11 @@ "score": 1.0 }, "gameplay_bot": { - "pass": false, - "score": 0.88, + "pass": true, + "score": 1, "total": 16, - "passed": 14, - "failed": 2, + "passed": 16, + "failed": 0, "report": { "implementation": { "renderer": "canvas", @@ -105,11 +139,12 @@ "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "ArrowUp", + "rotate": "x", "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -125,62 +160,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", - "pass": false, - "detail": "no change detected after rotate key" + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation observed, piece types seen: [unknown, 
S]" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "4 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 20 -> 45" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "1 line(s) cleared during AI play" + "detail": "2 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "score did not increase: [202] -> no change after polling" + "pass": true, + "detail": "score changed from 342 to 360" }, { "name": "game_over", @@ -190,24 +225,36 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 26s, placed 100 pieces, no crashes" + "detail": "played for 30s, placed 37 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 14, - "failed": 2, - "score": 0.88 + "passed": 16, + "failed": 0, + "score": 1 }, "gameplay": { - "pieces_placed": 141, + "pieces_placed": 37, "lines_cleared": 2, - "max_score_observed": 238, - "play_duration_seconds": 26, + "max_score_observed": 360, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 305, + "events_count": 8, + "pieces_spawned": 4, + "pieces_locked": 11, + "lines_cleared": 2, + "piece_types_seen": [ + "unknown", + "S" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 82 + "load_time_ms": 34 }, "accessibility": { "issues": [ @@ -219,10 +266,19 @@ } } }, - "outcome_score": 0.44, - "score": 0.44, + "outcome_score": 0.875, + "score": 0.875, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 0, + 
"vulnerabilities": 0, + "code_smells": 11, + "cognitive_complexity": 76, + "lines_of_code": 819, + "duplication_pct": 0.0, + "tech_debt_minutes": 28, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.75 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -12,11 +12,12 @@ "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "ArrowUp", + "rotate": "x", "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", - "pass": false, - "detail": "no change detected after rotate key" + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": 
true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation observed, piece types seen: [unknown, S]" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "4 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 20 -> 45" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "1 line(s) cleared during AI play" + "detail": "2 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "score did not increase: [202] -> no change after polling" + "pass": true, + "detail": "score changed from 342 to 360" }, { "name": "game_over", @@ -97,24 +98,36 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 26s, placed 100 pieces, no crashes" + "detail": "played for 30s, placed 37 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 14, - "failed": 2, - "score": 0.88 + "passed": 16, + "failed": 0, + "score": 1 }, "gameplay": { - "pieces_placed": 141, + "pieces_placed": 37, "lines_cleared": 2, - "max_score_observed": 238, - "play_duration_seconds": 26, + "max_score_observed": 360, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 305, + "events_count": 8, + "pieces_spawned": 4, + "pieces_locked": 11, + "lines_cleared": 2, + "piece_types_seen": [ + "unknown", + "S" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 82 + "load_time_ms": 34 }, "accessibility": { "issues": [ diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -1,11 +1,45 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-ex80o7g1/loop-bench-mjng5_fj', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-ex80o7g1/loop-bench-mjng5_fj', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 28619, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -86,14 +120,166 @@ "score": 1.0 }, "gameplay_bot": { - "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "pass": true, + "score": 1, + "total": 16, + "passed": 16, + "failed": 0, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + 
"y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation observed, piece types seen: [S, L, J]" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "5 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 302 to 310" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 37 pieces, no crashes" + } 
+ ], + "summary": { + "total": 16, + "passed": 16, + "failed": 0, + "score": 1 + }, + "gameplay": { + "pieces_placed": 37, + "lines_cleared": 1, + "max_score_observed": 310, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 302, + "events_count": 10, + "pieces_spawned": 5, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "S", + "L", + "J" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 49 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.895, + "score": 0.895, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 2, + "cognitive_complexity": 107, + "lines_of_code": 672, + "duplication_pct": 0.0, + "tech_debt_minutes": 32, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.79 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" 
}, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other, unknown] failed: [] (tested 2 piece types in 60 attempts)" + "detail": "rotation observed, piece types seen: [S, L, J]" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "5 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 32 -> 44" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "1 line(s) cleared during AI play" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "score did not increase: [200] -> no change after polling" + "pass": true, + "detail": "score changed from 302 to 310" }, { "name": "game_over", @@ -97,24 +98,37 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 77 pieces, no crashes" + "detail": "played for 30s, placed 37 pieces, no crashes" } ], "summary": { 
"total": 16, - "passed": 15, - "failed": 1, - "score": 0.94 + "passed": 16, + "failed": 0, + "score": 1 }, "gameplay": { - "pieces_placed": 122, - "lines_cleared": 2, - "max_score_observed": 182, + "pieces_placed": 37, + "lines_cleared": 1, + "max_score_observed": 310, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 302, + "events_count": 10, + "pieces_spawned": 5, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "S", + "L", + "J" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 348 + "load_time_ms": 49 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -1,11 +1,45 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-9ieawh2c/loop-bench-6s2flshh', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "no build script defined (static project)" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', 
'/tmp/reeval-9ieawh2c/loop-bench-6s2flshh', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 50041, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -87,13 +121,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 40, + "height": 80 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + 
"name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 164" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 164, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 350, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 29 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.405, + "score": 0.405, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 13, + "cognitive_complexity": 153, + "lines_of_code": 704, + "duplication_pct": 0.0, + "tech_debt_minutes": 69, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,63 +32,63 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece 
locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [154] -> no change after polling" + "detail": "score stayed at 164" }, { "name": "game_over", @@ -97,24 +98,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 78 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 10, - "failed": 6, - "score": 0.63 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 188, + "pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 222, + "max_score_observed": 164, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 350, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 34 + "load_time_ms": 29 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -1,11 +1,45 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-f54mliyx/loop-bench-pc2ap4ay', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "no build script defined (static project)" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-f54mliyx/loop-bench-pc2ap4ay', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 44999, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -87,13 +121,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 40, + "height": 80 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "enter", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + 
"pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via enter" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 340, + "events_count": 2, + "pieces_spawned": 0, + 
"pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 78 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.405, + "score": 0.405, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 15, + "cognitive_complexity": 140, + "lines_of_code": 709, + "duplication_pct": 0.0, + "tech_debt_minutes": 58, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "enter", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,63 +32,63 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": 
"no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", @@ -97,24 +98,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 78 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 10, - "failed": 6, - "score": 0.63 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 188, + "pieces_placed": 20, "lines_cleared": 1, "max_score_observed": 0, "play_duration_seconds": 30, "errors_during_play": 0 }, 
+ "session": { + "frames": 340, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 35 + "load_time_ms": 78 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -1,11 +1,45 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-0ywraxje/loop-bench-q46nchnj', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-0ywraxje/loop-bench-q46nchnj', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 47466, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -87,13 +121,164 @@ }, "gameplay_bot": { "pass": 
false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.88, + "total": 16, + "passed": 14, + "failed": 2, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 320, + "height": 640 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation observed, piece types seen: [unknown, S]" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "4 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": 
"score_changes", + "pass": false, + "detail": "score stayed at 260" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 38 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 14, + "failed": 2, + "score": 0.88 + }, + "gameplay": { + "pieces_placed": 38, + "lines_cleared": 0, + "max_score_observed": 260, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 300, + "events_count": 9, + "pieces_spawned": 4, + "pieces_locked": 11, + "lines_cleared": 0, + "piece_types_seen": [ + "unknown", + "S" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 43 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.69, + "score": 0.69, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 7, + "cognitive_complexity": 106, + "lines_of_code": 604, + "duplication_pct": 0.0, + "tech_debt_minutes": 44, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - 
"score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", - "pass": false, - "detail": "no change detected after key press" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation observed, piece types seen: [unknown, S]" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "4 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 20 -> 41" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", - "pass": true, - "detail": "line cleared via strategic placement" + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "score 
did not increase: [272] -> no change after polling" + "detail": "score stayed at 260" }, { "name": "game_over", @@ -97,7 +98,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 80 pieces, no crashes" + "detail": "played for 30s, placed 38 pieces, no crashes" } ], "summary": { @@ -107,14 +108,26 @@ "score": 0.88 }, "gameplay": { - "pieces_placed": 131, - "lines_cleared": 1, - "max_score_observed": 174, + "pieces_placed": 38, + "lines_cleared": 0, + "max_score_observed": 260, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 300, + "events_count": 9, + "pieces_spawned": 4, + "pieces_locked": 11, + "lines_cleared": 0, + "piece_types_seen": [ + "unknown", + "S" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 21 + "load_time_ms": 43 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -1,11 +1,45 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-egdg4quh/loop-bench-amkr12d_', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": 
"typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-egdg4quh/loop-bench-amkr12d_', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 38300, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -87,13 +121,164 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation observed, piece types seen: [unknown, S]" + }, + 
{ + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "4 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 350" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 43 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 43, + "lines_cleared": 1, + "max_score_observed": 350, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 279, + "events_count": 11, + "pieces_spawned": 4, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "S" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 51 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.835, + "score": 0.835, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 4, + "cognitive_complexity": 128, + "lines_of_code": 602, + "duplication_pct": 0.0, + "tech_debt_minutes": 53, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.73 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -12,11 +12,12 @@ "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "z", + "rotate": "x", "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", - "pass": false, - "detail": "no change detected after key press" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation observed, piece types seen: [unknown, S]" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately 
dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "4 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 20 -> 41" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [162] -> no change after polling" + "detail": "score stayed at 350" }, { "name": "game_over", @@ -97,24 +98,36 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 79 pieces, no crashes" + "detail": "played for 30s, placed 43 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 14, - "failed": 2, - "score": 0.88 + "passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { - "pieces_placed": 130, + "pieces_placed": 43, "lines_cleared": 1, - "max_score_observed": 192, + "max_score_observed": 350, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 279, + "events_count": 11, + "pieces_spawned": 4, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "S" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 21 + "load_time_ms": 51 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -1,11 +1,45 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-cy1ouzsu/loop-bench-t_rwtfd6', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-cy1ouzsu/loop-bench-t_rwtfd6', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 38664, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -58,10 +92,10 @@ }, "html_validation": { "valid": false, - "errors": 0 + "errors": 3 }, "duplication_percentage": 0.0, - "score": 0.4 + "score": 0.75 }, "transcript_analysis": { "total_events": 73, @@ -87,13 +121,164 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", 
+ "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation observed, piece types seen: [I, unknown]" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "6 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 302" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 41 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + 
"pieces_placed": 41, + "lines_cleared": 1, + "max_score_observed": 302, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 284, + "events_count": 11, + "pieces_spawned": 6, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "I", + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 66 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.775, + "score": 0.775, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 3, + "cognitive_complexity": 156, + "lines_of_code": 686, + "duplication_pct": 3.5, + "tech_debt_minutes": 33, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.61 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -58,7 +58,7 @@ { "name": "all_pieces_rotate", "pass": true, - "detail": "rotation confirmed but could not identify individual piece types" + "detail": "rotation observed, piece types seen: [I, unknown]" }, { "name": "hard_drop", @@ -73,7 +73,7 @@ { "name": "new_piece_spawns", "pass": true, - "detail": "5 new piece(s) detected at top of grid" + "detail": "6 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", @@ -82,13 +82,13 @@ 
}, { "name": "line_clear", - "pass": false, - "detail": "could not trigger or detect a line clear via grid reader" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score stayed at 308" + "detail": "score stayed at 302" }, { "name": "game_over", @@ -103,30 +103,31 @@ ], "summary": { "total": 16, - "passed": 14, - "failed": 2, - "score": 0.88 + "passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { "pieces_placed": 41, - "lines_cleared": 0, - "max_score_observed": 308, + "lines_cleared": 1, + "max_score_observed": 302, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 179, - "events_count": 10, - "pieces_spawned": 5, + "frames": 284, + "events_count": 11, + "pieces_spawned": 6, "pieces_locked": 11, - "lines_cleared": 0, + "lines_cleared": 1, "piece_types_seen": [ + "I", "unknown" ], "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 1458 + "load_time_ms": 66 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -1,11 +1,45 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-1ryr7fh0/loop-bench-g913qn5g', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + 
"detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-1ryr7fh0/loop-bench-g913qn5g', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 42269, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -87,13 +121,163 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key 
(grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "3 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 188" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 36 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 36, + "lines_cleared": 1, + "max_score_observed": 188, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 303, + "events_count": 9, + "pieces_spawned": 3, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 28 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.775, + "score": 0.775, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 3, + "cognitive_complexity": 156, + "lines_of_code": 686, + "duplication_pct": 3.5, + "tech_debt_minutes": 33, + "maintainability": "A", + 
"reliability": "C", + "security": "A", + "score": 0.61 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", - "pass": false, - "detail": "no change detected after key press" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece 
immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "3 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 20 -> 44" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "1 line(s) cleared during AI play" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [178] -> no change after polling" + "detail": "score stayed at 188" }, { "name": "game_over", @@ -97,24 +98,35 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 79 pieces, no crashes" + "detail": "played for 30s, placed 36 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 14, - "failed": 2, - "score": 0.88 + "passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { - "pieces_placed": 120, - "lines_cleared": 2, + "pieces_placed": 36, + "lines_cleared": 1, "max_score_observed": 188, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 303, + "events_count": 9, + "pieces_spawned": 3, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 26 + "load_time_ms": 28 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -1,11 +1,45 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-dr_6_3of/loop-bench-xwpci4l5', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-dr_6_3of/loop-bench-xwpci4l5', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 42258, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -87,13 +121,165 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": false, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + 
"pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation observed, piece types seen: [unknown, T, L, S]" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "10 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "no score element found" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 37 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 37, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 304, + "events_count": 10, + 
"pieces_spawned": 10, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "T", + "L", + "S" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 48 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role" + ], + "issue_count": 1, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.835, + "score": 0.835, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 4, + "cognitive_complexity": 132, + "lines_of_code": 685, + "duplication_pct": 2.8, + "tech_debt_minutes": 23, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.73 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": false + "score_element_found": false, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { 
"name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation observed, piece types seen: [unknown, T, L, S]" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "10 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 20 -> 43" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "1 line(s) cleared during AI play" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "no score element found and no number changed" + "detail": "no score element found" }, { "name": "game_over", @@ -97,7 +98,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 80 pieces, no crashes" + "detail": "played for 30s, placed 37 pieces, no crashes" } ], "summary": { @@ -107,14 +108,28 @@ "score": 0.94 }, "gameplay": { - "pieces_placed": 121, - "lines_cleared": 2, + "pieces_placed": 37, + "lines_cleared": 1, "max_score_observed": 0, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 304, + "events_count": 10, + "pieces_spawned": 10, + "pieces_locked": 11, + "lines_cleared": 1, + 
"piece_types_seen": [ + "unknown", + "T", + "L", + "S" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 22 + "load_time_ms": 48 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -1,11 +1,45 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-vqyy1j1m/loop-bench-io7ugig5', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-vqyy1j1m/loop-bench-io7ugig5', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 45625, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -87,13 +121,163 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 
0.81, + "total": 16, + "passed": 13, + "failed": 3, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": false, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "9 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "no score element found" + }, + { + "name": "game_over", + "pass": true, + "detail": "game 
stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 37 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 13, + "failed": 3, + "score": 0.81 + }, + "gameplay": { + "pieces_placed": 37, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 302, + "events_count": 9, + "pieces_spawned": 9, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 29 + }, + "accessibility": { + "issues": [ + "no headings found", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.77, + "score": 0.77, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 4, + "cognitive_complexity": 132, + "lines_of_code": 685, + "duplication_pct": 2.8, + "tech_debt_minutes": 23, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.73 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": false + "score_element_found": false, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - 
"detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", - "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "pass": false, + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "9 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 16 -> 41" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "no score element found and no number changed" + "detail": "no score element found" }, { "name": "game_over", @@ -97,24 +98,35 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 79 
pieces, no crashes" + "detail": "played for 30s, placed 37 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 15, - "failed": 1, - "score": 0.94 + "passed": 13, + "failed": 3, + "score": 0.81 }, "gameplay": { - "pieces_placed": 130, + "pieces_placed": 37, "lines_cleared": 1, "max_score_observed": 0, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 302, + "events_count": 9, + "pieces_spawned": 9, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 21 + "load_time_ms": 29 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -26,8 +26,20 @@ "score": 0.75 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-_e4j35hj/loop-bench-wyg6e789', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 36796, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -109,13 +121,167 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + 
"grid_bounds": { + "x": 0, + "y": 0, + "width": 320, + "height": 640 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation observed, piece types seen: [L, S, unknown, T, Z]" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "9 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 278" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 
36 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 36, + "lines_cleared": 1, + "max_score_observed": 278, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 306, + "events_count": 10, + "pieces_spawned": 9, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "L", + "S", + "unknown", + "T", + "Z" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 67 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.835, + "score": 0.835, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 4, + "cognitive_complexity": 132, + "lines_of_code": 685, + "duplication_pct": 2.8, + "tech_debt_minutes": 23, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.73 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid 
state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation observed, piece types seen: [L, S, unknown, T, Z]" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "9 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 20 -> 44" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "1 line(s) cleared during AI play" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [240] -> no change after polling" + "detail": "score stayed at 278" }, { "name": "game_over", @@ -97,7 +98,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 80 pieces, no crashes" + "detail": "played for 30s, placed 36 pieces, no 
crashes" } ], "summary": { @@ -107,14 +108,29 @@ "score": 0.94 }, "gameplay": { - "pieces_placed": 121, - "lines_cleared": 2, - "max_score_observed": 152, + "pieces_placed": 36, + "lines_cleared": 1, + "max_score_observed": 278, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 306, + "events_count": 10, + "pieces_spawned": 9, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "L", + "S", + "unknown", + "T", + "Z" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 35 + "load_time_ms": 67 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run1/eval_results.json @@ -1,11 +1,45 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-60ygqx7w/loop-bench-l6roqn5z', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "no build script defined (static project)" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-60ygqx7w/loop-bench-l6roqn5z', 
'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 36505, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -87,13 +121,163 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, 
+ { + "name": "new_piece_spawns", + "pass": true, + "detail": "4 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 304" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 44 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 44, + "lines_cleared": 1, + "max_score_observed": 304, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 278, + "events_count": 10, + "pieces_spawned": 4, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 34 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.795, + "score": 0.795, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 11, + "cognitive_complexity": 104, + "lines_of_code": 760, + "duplication_pct": 0.0, + "tech_debt_minutes": 42, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.65 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run1/gameplay-bot-report.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "4 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": 
true, - "detail": "grid accumulated cells: 20 -> 44" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "1 line(s) cleared during AI play" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [154] -> no change after polling" + "detail": "score stayed at 304" }, { "name": "game_over", @@ -97,7 +98,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 79 pieces, no crashes" + "detail": "played for 30s, placed 44 pieces, no crashes" } ], "summary": { @@ -107,14 +108,25 @@ "score": 0.94 }, "gameplay": { - "pieces_placed": 120, - "lines_cleared": 2, - "max_score_observed": 182, + "pieces_placed": 44, + "lines_cleared": 1, + "max_score_observed": 304, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 278, + "events_count": 10, + "pieces_spawned": 4, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 28 + "load_time_ms": 34 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run2/eval_results.json @@ -1,11 +1,45 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-3gvujmf9/loop-bench-k3paesnr', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": 
"entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "no build script defined (static project)" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-3gvujmf9/loop-bench-k3paesnr', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 48614, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -87,13 +121,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.38, + "total": 16, + "passed": 6, + "failed": 10, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 64, + "height": 128 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + 
}, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 188 to 212" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 6, + "failed": 10, + "score": 0.38 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 212, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 346, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 71 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.365, + "score": 0.365, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 2, + "vulnerabilities": 0, + "code_smells": 
11, + "cognitive_complexity": 121, + "lines_of_code": 829, + "duplication_pct": 2.8, + "tech_debt_minutes": 52, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.35 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,63 +32,63 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", 
"pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "score did not increase: [138] -> no change after polling" + "pass": true, + "detail": "score changed from 188 to 212" }, { "name": "game_over", @@ -97,24 +98,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 78 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 9, - "failed": 7, - "score": 0.56 + "passed": 6, + "failed": 10, + "score": 0.38 }, "gameplay": { - "pieces_placed": 188, + "pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 168, + "max_score_observed": 212, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 346, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 24 + "load_time_ms": 71 }, "accessibility": { "issues": [ diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run3/eval_results.json @@ -1,11 +1,45 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-9kxwf7dz/loop-bench-j7p5ruqx', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-9kxwf7dz/loop-bench-j7p5ruqx', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 45822, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -87,13 +121,164 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 320, + 
"height": 640 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation observed, piece types seen: [unknown, J]" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "2 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 290" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 43 pieces, no crashes" + } + ], + "summary": { + "total": 16, + 
"passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 43, + "lines_cleared": 1, + "max_score_observed": 290, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 284, + "events_count": 11, + "pieces_spawned": 2, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "J" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 118 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.72, + "score": 0.72, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 11, + "cognitive_complexity": 125, + "lines_of_code": 763, + "duplication_pct": 0.0, + "tech_debt_minutes": 48, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": 
true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", - "pass": false, - "detail": "no change detected after key press" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [I] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation observed, piece types seen: [unknown, J]" }, { "name": "hard_drop", - "pass": false, - "detail": "no change detected after hard drop key" + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "2 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", - "pass": false, - "detail": "grid did not accumulate filled cells" + "pass": true, + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "1 line(s) cleared during AI play" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [154] -> no change after polling" + "detail": "score stayed at 290" }, { "name": "game_over", @@ -97,24 +98,36 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 80 pieces, no crashes" + "detail": "played for 30s, placed 43 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 
12, - "failed": 4, - "score": 0.75 + "passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { - "pieces_placed": 180, - "lines_cleared": 2, - "max_score_observed": 172, + "pieces_placed": 43, + "lines_cleared": 1, + "max_score_observed": 290, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 284, + "events_count": 11, + "pieces_spawned": 2, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "J" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 29 + "load_time_ms": 118 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run1/eval_results.json @@ -1,11 +1,45 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-fw_9cpgu/loop-bench-sx5ep9jz', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-fw_9cpgu/loop-bench-sx5ep9jz', 
'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 11827, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -87,13 +121,164 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 320, + "height": 640 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation observed, piece types seen: [unknown, Z]" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": 
"new_piece_spawns", + "pass": true, + "detail": "4 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 278" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 46 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 46, + "lines_cleared": 1, + "max_score_observed": 278, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 279, + "events_count": 10, + "pieces_spawned": 4, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "Z" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 72 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.835, + "score": 0.835, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 4, + "cognitive_complexity": 101, + "lines_of_code": 534, + "duplication_pct": 4.5, + "tech_debt_minutes": 16, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.73 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run1/gameplay-bot-report.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", - "pass": false, - "detail": "could not detect any piece rotations" + "pass": true, + "detail": "rotation observed, piece types seen: [unknown, Z]" }, { "name": "hard_drop", - "pass": false, - "detail": "no change detected after hard drop key" + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "4 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", - "pass": false, - 
"detail": "grid did not accumulate filled cells" + "pass": true, + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [156] -> no change after polling" + "detail": "score stayed at 278" }, { "name": "game_over", @@ -97,24 +98,36 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 79 pieces, no crashes" + "detail": "played for 30s, placed 46 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 12, - "failed": 4, - "score": 0.75 + "passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { - "pieces_placed": 189, + "pieces_placed": 46, "lines_cleared": 1, - "max_score_observed": 230, + "max_score_observed": 278, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 279, + "events_count": 10, + "pieces_spawned": 4, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "Z" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 28 + "load_time_ms": 72 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run2/eval_results.json @@ -1,11 +1,45 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-5j34pz8r/loop-bench-t34c4nm0', 'typescript']' 
timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-5j34pz8r/loop-bench-t34c4nm0', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 32578, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -86,14 +120,164 @@ "score": 1.0 }, "gameplay_bot": { - "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "pass": true, + "score": 1, + "total": 16, + "passed": 16, + "failed": 0, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 320, + "height": 640 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press 
(grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "4 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 304 to 312" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 37 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 16, + "failed": 0, + "score": 1 + }, + "gameplay": { + "pieces_placed": 37, + "lines_cleared": 1, + "max_score_observed": 312, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 299, + "events_count": 9, + "pieces_spawned": 4, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 50 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.865, + "score": 0.865, "sonarqube": { - 
"pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 4, + "cognitive_complexity": 101, + "lines_of_code": 534, + "duplication_pct": 4.5, + "tech_debt_minutes": 16, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.73 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run2/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 
attempts)" + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "4 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 16 -> 44" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "1 line(s) cleared during AI play" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "score did not increase: [164] -> no change after polling" + "pass": true, + "detail": "score changed from 304 to 312" }, { "name": "game_over", @@ -97,24 +98,35 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 78 pieces, no crashes" + "detail": "played for 30s, placed 37 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 15, - "failed": 1, - "score": 0.94 + "passed": 16, + "failed": 0, + "score": 1 }, "gameplay": { - "pieces_placed": 119, - "lines_cleared": 2, - "max_score_observed": 186, + "pieces_placed": 37, + "lines_cleared": 1, + "max_score_observed": 312, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 299, + "events_count": 9, + "pieces_spawned": 4, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 26 + "load_time_ms": 50 }, "accessibility": { "issues": [ diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run3/eval_results.json @@ -1,11 +1,45 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-0_q053vs/loop-bench-1oqz1n2q', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-0_q053vs/loop-bench-1oqz1n2q', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 47946, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -87,13 +121,163 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 64, + 
"height": 128 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 162" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 
16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 162, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 345, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 28 + }, + "accessibility": { + "issues": [ + "no headings found", + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 4, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.37, + "score": 0.37, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 2, + "vulnerabilities": 0, + "code_smells": 4, + "cognitive_complexity": 161, + "lines_of_code": 840, + "duplication_pct": 3.4, + "tech_debt_minutes": 38, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.43 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run3/gameplay-bot-report.json @@ -12,11 +12,12 @@ "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "z", + "rotate": "ArrowUp", "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,63 +32,63 @@ }, { "name": "auto_drop", - "pass": true, - 
"detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [168] -> no change after polling" + "detail": "score stayed at 162" }, { "name": "game_over", @@ -97,24 +98,33 @@ { "name": "playable_30s", 
"pass": true, - "detail": "played for 30s, placed 78 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 10, - "failed": 6, - "score": 0.63 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 188, + "pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 232, + "max_score_observed": 162, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 345, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 21 + "load_time_ms": 28 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/eval_results.json @@ -1,11 +1,45 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-iuedekw4/loop-bench-ste9dw4l', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { - "pass": false, - "error": 
"Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-iuedekw4/loop-bench-ste9dw4l', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 46717, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -58,10 +92,10 @@ }, "html_validation": { "valid": false, - "errors": 0 + "errors": 2 }, "duplication_percentage": 0.0, - "score": 0.5 + "score": 0.85 }, "transcript_analysis": { "total_events": 42, @@ -87,10 +121,10 @@ }, "gameplay_bot": { "pass": false, - "score": 0.63, + "score": 0.31, "total": 16, - "passed": 10, - "failed": 6, + "passed": 5, + "failed": 11, "report": { "implementation": { "renderer": "canvas", @@ -109,7 +143,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -124,63 +159,63 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via 
grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [166] -> no change after polling" + "detail": "score stayed at 148" }, { "name": "game_over", @@ -190,24 +225,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 77 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 10, - "failed": 6, - "score": 0.63 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 187, + "pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 234, + "max_score_observed": 148, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 346, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 157 + "load_time_ms": 39 }, "accessibility": { "issues": [ @@ -220,10 +264,19 @@ } } }, - "outcome_score": 0.315, - "score": 0.315, + "outcome_score": 0.405, + "score": 0.405, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 1, + 
"vulnerabilities": 0, + "code_smells": 14, + "cognitive_complexity": 126, + "lines_of_code": 721, + "duplication_pct": 0.0, + "tech_debt_minutes": 41, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,63 +32,63 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece 
rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [166] -> no change after polling" + "detail": "score stayed at 148" }, { "name": "game_over", @@ -97,24 +98,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 77 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 10, - "failed": 6, - "score": 0.63 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 187, + "pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 234, + "max_score_observed": 148, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 346, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 157 + "load_time_ms": 39 }, "accessibility": { "issues": [ diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run2/eval_results.json @@ -1,7 +1,29 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-r8vyc4_o/loop-bench-s2o7c9uv', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { "lint": { @@ -20,8 +42,60 @@ "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-r8vyc4_o/loop-bench-s2o7c9uv', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 9, + "code": 4, + "docs": 0, + "unnecessary": 0, + "unnecessary_list": [] + }, + "lines_of_code": 1468, + "dependencies": { + "production": 0, + "dev": 7, + "total": 7 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 26, + "excessive": true + }, + "function_length": { + "count": 59, + "average": 6.4, + "max": 23, + "long_functions": 0 + }, + "max_nesting_depth": 14, + "global_declarations": 18, + "naming": { + "dominant_style": "camelCase", + 
"consistency_pct": 100.0, + "camel_case": 517, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 71, + "source_lines": 1176, + "ratio_pct": 6.0 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 3, + "files_with_logic": 3, + "files_with_both": 3 + }, + "html_validation": { + "valid": false, + "errors": 7 + }, + "duplication_percentage": 0.0, + "score": 0.85 }, "transcript_analysis": { "total_events": 52, @@ -47,13 +121,164 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 320, + "height": 640 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "enter", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via enter" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation observed, piece types seen: [J, Z]" + }, + { + "name": "hard_drop", 
+ "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "3 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 264" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 39 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 39, + "lines_cleared": 1, + "max_score_observed": 264, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 293, + "events_count": 10, + "pieces_spawned": 3, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "J", + "Z" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 71 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.72, + "score": 0.72, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 16, + "cognitive_complexity": 152, + "lines_of_code": 685, + "duplication_pct": 0.0, + "tech_debt_minutes": 93, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run2/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "enter", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation observed, piece types seen: [J, Z]" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + 
"detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "3 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 20 -> 40" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [162] -> no change after polling" + "detail": "score stayed at 264" }, { "name": "game_over", @@ -97,7 +98,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 78 pieces, no crashes" + "detail": "played for 30s, placed 39 pieces, no crashes" } ], "summary": { @@ -107,14 +108,26 @@ "score": 0.94 }, "gameplay": { - "pieces_placed": 129, - "lines_cleared": 2, - "max_score_observed": 180, + "pieces_placed": 39, + "lines_cleared": 1, + "max_score_observed": 264, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 293, + "events_count": 10, + "pieces_spawned": 3, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "J", + "Z" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 125 + "load_time_ms": 71 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run3/eval_results.json @@ 
-26,8 +26,20 @@ "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-dzqt_mrd/loop-bench-exsc6g82', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 36162, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -109,13 +121,163 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.88, + "total": 16, + "passed": 14, + "failed": 2, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 320, + "height": 640 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": 
"piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "8 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 274" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 40 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 14, + "failed": 2, + "score": 0.88 + }, + "gameplay": { + "pieces_placed": 40, + "lines_cleared": 0, + "max_score_observed": 274, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 293, + "events_count": 9, + "pieces_spawned": 8, + "pieces_locked": 11, + "lines_cleared": 0, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 50 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.885, + "score": 0.885, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 2, + "cognitive_complexity": 90, + "lines_of_code": 637, + "duplication_pct": 0.0, + "tech_debt_minutes": 15, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.89 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", - "pass": false, - "detail": "no change detected after key press" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", - "pass": false, - "detail": "no change detected after rotate key" + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", 
"pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "8 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 20 -> 44" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", - "pass": true, - "detail": "1 line(s) cleared during AI play" + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [160] -> no change after polling" + "detail": "score stayed at 274" }, { "name": "game_over", @@ -97,24 +98,35 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 80 pieces, no crashes" + "detail": "played for 30s, placed 40 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 13, - "failed": 3, - "score": 0.81 + "passed": 14, + "failed": 2, + "score": 0.88 }, "gameplay": { - "pieces_placed": 121, - "lines_cleared": 2, - "max_score_observed": 144, + "pieces_placed": 40, + "lines_cleared": 0, + "max_score_observed": 274, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 293, + "events_count": 9, + "pieces_spawned": 8, + "pieces_locked": 11, + "lines_cleared": 0, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 28 + "load_time_ms": 50 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -1,7 +1,29 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-05r17q0m/loop-bench-doqfpofi', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { "lint": { @@ -20,8 +42,60 @@ "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-05r17q0m/loop-bench-doqfpofi', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 11, + "code": 5, + "docs": 0, + "unnecessary": 0, + "unnecessary_list": [] + }, + "lines_of_code": 1624, + "dependencies": { + "production": 0, + "dev": 8, + "total": 8 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 42, + "excessive": true + }, + "function_length": { + "count": 68, + "average": 7.4, + "max": 43, + "long_functions": 0 + }, + "max_nesting_depth": 12, + "global_declarations": 20, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 549, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 84, + "source_lines": 1269, + "ratio_pct": 6.6 + }, + "separation_of_concerns": { + "verdict": 
"mixed", + "files_with_rendering": 3, + "files_with_logic": 3, + "files_with_both": 3 + }, + "html_validation": { + "valid": false, + "errors": 9 + }, + "duplication_percentage": 0.0, + "score": 0.85 }, "transcript_analysis": { "total_events": 48, @@ -47,13 +121,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.38, + "total": 16, + "passed": 6, + "failed": 10, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 40, + "height": 80 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "enter", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via enter" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece 
spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 196 to 234" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 6, + "failed": 10, + "score": 0.38 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 234, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 344, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 38 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.44, + "score": 0.44, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 15, + "cognitive_complexity": 152, + "lines_of_code": 830, + "duplication_pct": 3.1, + "tech_debt_minutes": 65, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "enter", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,63 +32,63 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { 
"name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "score did not increase: [166] -> no change after polling" + "pass": true, + "detail": "score changed from 196 to 234" }, { "name": "game_over", @@ -97,24 +98,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 78 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 10, - "failed": 6, - "score": 0.63 + "passed": 6, + "failed": 10, + "score": 0.38 }, "gameplay": { - "pieces_placed": 188, + "pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 208, + "max_score_observed": 234, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 344, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 85 + "load_time_ms": 38 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -1,7 +1,29 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', 
'/tmp/reeval-idtri8by/loop-bench-9b42xbl4', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { "lint": { @@ -20,8 +42,60 @@ "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-idtri8by/loop-bench-9b42xbl4', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 8, + "code": 3, + "docs": 0, + "unnecessary": 0, + "unnecessary_list": [] + }, + "lines_of_code": 1199, + "dependencies": { + "production": 0, + "dev": 7, + "total": 7 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 54, + "excessive": true + }, + "function_length": { + "count": 69, + "average": 6.0, + "max": 27, + "long_functions": 0 + }, + "max_nesting_depth": 12, + "global_declarations": 24, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 459, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 80, + "source_lines": 892, + "ratio_pct": 9.0 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 2, + "files_with_both": 2 + }, + "html_validation": { + "valid": false, + "errors": 9 + }, + "duplication_percentage": 0.0, + "score": 0.85 }, "transcript_analysis": { "total_events": 53, @@ -47,13 +121,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.38, + "total": 16, + "passed": 6, 
+ "failed": 10, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 36, + "height": 72 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 218 to 256" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after 
stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 6, + "failed": 10, + "score": 0.38 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 256, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 343, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 29 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.545, + "score": 0.545, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 3, + "cognitive_complexity": 71, + "lines_of_code": 575, + "duplication_pct": 0.0, + "tech_debt_minutes": 9, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.71 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,63 +32,63 @@ }, { "name": 
"auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "score did not increase: [210] -> no change after polling" + "pass": true, + "detail": "score changed from 218 to 256" }, { "name": 
"game_over", @@ -97,24 +98,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 78 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 10, - "failed": 6, - "score": 0.63 + "passed": 6, + "failed": 10, + "score": 0.38 }, "gameplay": { - "pieces_placed": 188, + "pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 140, + "max_score_observed": 256, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 343, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 30 + "load_time_ms": 29 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -1,7 +1,29 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-dvwvbo88/loop-bench-amzysv_v', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + 
"score": 1.0 }, "quality": { "lint": { @@ -20,8 +42,60 @@ "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-dvwvbo88/loop-bench-amzysv_v', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 8, + "code": 3, + "docs": 0, + "unnecessary": 0, + "unnecessary_list": [] + }, + "lines_of_code": 1134, + "dependencies": { + "production": 0, + "dev": 7, + "total": 7 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 20, + "excessive": false + }, + "function_length": { + "count": 56, + "average": 7.4, + "max": 43, + "long_functions": 0 + }, + "max_nesting_depth": 12, + "global_declarations": 66, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 385, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 55, + "source_lines": 899, + "ratio_pct": 6.1 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 2, + "files_with_both": 2 + }, + "html_validation": { + "valid": false, + "errors": 8 + }, + "duplication_percentage": 0.0, + "score": 0.9 }, "transcript_analysis": { "total_events": 43, @@ -47,13 +121,167 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 320, + "height": 640 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": 
"game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation observed, piece types seen: [unknown, S, J, T, L]" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "8 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 298" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 37 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 37, + "lines_cleared": 1, + "max_score_observed": 298, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 306, + "events_count": 9, + "pieces_spawned": 8, + "pieces_locked": 11, + "lines_cleared": 1, + 
"piece_types_seen": [ + "unknown", + "S", + "J", + "T", + "L" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 152 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.85, + "score": 0.85, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 3, + "cognitive_complexity": 130, + "lines_of_code": 562, + "duplication_pct": 0.0, + "tech_debt_minutes": 44, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.76 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", - 
"pass": false, - "detail": "no change detected after key press" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [I] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation observed, piece types seen: [unknown, S, J, T, L]" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "8 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 20 -> 42" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "1 line(s) cleared during AI play" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [226] -> no change after polling" + "detail": "score stayed at 298" }, { "name": "game_over", @@ -97,24 +98,39 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 79 pieces, no crashes" + "detail": "played for 30s, placed 37 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 14, - "failed": 2, - "score": 0.88 + "passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { - "pieces_placed": 120, - "lines_cleared": 2, - "max_score_observed": 196, + "pieces_placed": 37, + "lines_cleared": 1, + "max_score_observed": 298, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + 
"frames": 306, + "events_count": 9, + "pieces_spawned": 8, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "S", + "J", + "T", + "L" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 23 + "load_time_ms": 152 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -26,8 +26,20 @@ "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-cfq1fa7e/loop-bench-kncx228p', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 15602, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -112,10 +124,19 @@ "score": 0, "error": "Gameplay bot timed out after 180 seconds" }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.3, + "score": 0.3, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 13, + "cognitive_complexity": 64, + "lines_of_code": 592, + "duplication_pct": 0.0, + "tech_debt_minutes": 29, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.6 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -1,7 +1,29 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-le3rczwj/loop-bench-6c3p0wr6', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { "lint": { @@ -20,8 +42,60 @@ "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-le3rczwj/loop-bench-6c3p0wr6', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 9, + "code": 4, + "docs": 0, + "unnecessary": 0, + "unnecessary_list": [] + }, + "lines_of_code": 1647, + "dependencies": { + "production": 0, + "dev": 6, + "total": 6 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 58, + "excessive": true + }, + "function_length": { + "count": 89, + "average": 7.7, + "max": 45, + "long_functions": 0 + }, + "max_nesting_depth": 15, + "global_declarations": 24, + "naming": { + "dominant_style": "camelCase", + 
"consistency_pct": 100.0, + "camel_case": 567, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 119, + "source_lines": 1195, + "ratio_pct": 10.0 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 2, + "files_with_both": 2 + }, + "html_validation": { + "valid": false, + "errors": 7 + }, + "duplication_percentage": 0.0, + "score": 0.85 }, "transcript_analysis": { "total_events": 64, @@ -47,13 +121,163 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 60, + "height": 120 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "space", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via space" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change 
with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 96 to 176" + }, + { + "name": "game_over", + "pass": false, + "detail": "could not trigger or detect game over" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 96, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 348, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 47 + }, + "accessibility": { + "issues": [ + "no headings found", + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 4, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.405, + "score": 0.405, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 7, + "cognitive_complexity": 152, + "lines_of_code": 767, + "duplication_pct": 0.0, + "tech_debt_minutes": 52, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "space", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,63 +32,63 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", - "pass": true, - "detail": "visual change detected after hard drop" + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify 
piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", - "pass": true, - "detail": "visual change suggests new piece spawned" + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", - "pass": true, - "detail": "game still responding after 10 piece drops" + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "score did not increase: [214] -> no change after polling" + "pass": true, + "detail": "score changed from 96 to 176" }, { "name": "game_over", @@ -97,24 +98,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 78 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 11, - "failed": 5, - "score": 0.69 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 188, + "pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 72, + "max_score_observed": 96, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 348, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 41 + "load_time_ms": 47 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -1,7 +1,29 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-2zfhi3g8/loop-bench-9r05i_lq', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { "lint": { @@ -20,8 +42,62 @@ "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-2zfhi3g8/loop-bench-9r05i_lq', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 8, + "code": 3, + "docs": 1, + "unnecessary": 1, + "unnecessary_list": [ + "README.md" + ] + }, + "lines_of_code": 1535, + "dependencies": { + "production": 0, + "dev": 6, + "total": 6 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 64, + "excessive": true + }, + "function_length": { + "count": 75, + "average": 6.7, + "max": 30, + "long_functions": 0 + }, + "max_nesting_depth": 12, + "global_declarations": 22, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 534, + "snake_case": 0 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 136, + "source_lines": 1078, + "ratio_pct": 12.6 + }, + 
"separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 2, + "files_with_both": 2 + }, + "html_validation": { + "valid": false, + "errors": 0 + }, + "duplication_percentage": 0.0, + "score": 0.4 }, "transcript_analysis": { "total_events": 82, @@ -47,13 +123,164 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 42, + "height": 84 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + 
"detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 184 to 218" + }, + { + "name": "game_over", + "pass": false, + "detail": "could not trigger or detect game over" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 218, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 347, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 44 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 5, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.48, + "score": 0.48, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 15, + "cognitive_complexity": 112, + "lines_of_code": 750, + "duplication_pct": 0.0, + "tech_debt_minutes": 52, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.65 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=off_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,63 +32,63 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", - "pass": true, - "detail": "visual change detected after hard drop" + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": 
"could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", - "pass": true, - "detail": "visual change suggests new piece spawned" + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", - "pass": true, - "detail": "game still responding after 10 piece drops" + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": true, - "detail": "score changed from 184 to 284" + "detail": "score changed from 184 to 218" }, { "name": "game_over", @@ -97,24 +98,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 78 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 13, - "failed": 3, - "score": 0.81 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 188, + "pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 100, + "max_score_observed": 218, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 347, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 25 + "load_time_ms": 44 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -26,8 +26,21 @@ "score": 0.75 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-oqdanj22/loop-bench-zhl3wjhq', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": false, + "errors": 1 + }, + "performance": { + "pass": true, + "bundle_size_bytes": 0, + "size_under_512kb": true + }, + "score": 0.67 }, "code_analysis": { "files": { @@ -109,29 +122,25 @@ }, "gameplay_bot": { "pass": false, - "score": 0.88, + "score": 0.19, "total": 16, - "passed": 14, - "failed": 2, + "passed": 3, + "failed": 13, "report": { "implementation": { - "renderer": "canvas", - "grid_detected": true, - "grid_bounds": { - "x": 0, - "y": 0, - "width": 320, - "height": 640 - }, + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, "controls": { "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "z", + "rotate": "ArrowUp", "drop": "Space" }, - "start_mechanism": "auto", - "score_element_found": true + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 }, "tests": [ { @@ -142,67 +151,67 @@ { "name": "game_starts", "pass": true, - "detail": "started via auto" + "detail": "started via click_canvas" }, { "name": "auto_drop", - "pass": true, - "detail": "grid state changed after 5s with no input" + "pass": false, + "detail": "grid reader unreliable, cannot verify auto-drop" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_right", - "pass": 
true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_down", "pass": false, - "detail": "no change detected after key press" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" }, { "name": "all_pieces_rotate", - "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "pass": false, + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", - "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "pass": false, + "detail": "grid reader unreliable, cannot verify hard drop" }, { "name": "piece_locks", - "pass": true, - "detail": "filled cells persist at bottom" + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", - "pass": true, - "detail": "new piece detected at top of grid" + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", - "pass": true, - "detail": "game still responding after 10 piece drops" + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": true, - "detail": "51 line(s) cleared during AI play" + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "no score element found" }, { "name": "game_over", @@ -211,40 +220,55 @@ }, { "name": "playable_30s", - "pass": true, - "detail": "played for 30s, placed 82 pieces, no crashes" + "pass": false, + "detail": "only played for 7s" } ], "summary": { "total": 16, - "passed": 14, - 
"failed": 2, - "score": 0.88 + "passed": 3, + "failed": 13, + "score": 0.19 }, "gameplay": { - "pieces_placed": 123, - "lines_cleared": 175, + "pieces_placed": 26, + "lines_cleared": 0, "max_score_observed": 0, - "play_duration_seconds": 30, + "play_duration_seconds": 7, "errors_during_play": 0 }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, "performance": { "load_time_ms": 19 }, "accessibility": { - "issues": [ - "canvas without aria-label or role", - "canvas without aria-label or role" - ], - "issue_count": 2, - "pass": false + "issues": [], + "issue_count": 0, + "pass": true } } }, - "outcome_score": 0.44, - "score": 0.44, + "outcome_score": 0.22, + "score": 0.22, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 3, + "vulnerabilities": 0, + "code_smells": 21, + "cognitive_complexity": 184, + "lines_of_code": 1499, + "duplication_pct": 55.8, + "tech_debt_minutes": 147, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.25 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -1,22 +1,18 @@ { "implementation": { - "renderer": "canvas", - "grid_detected": true, - "grid_bounds": { - "x": 0, - "y": 0, - "width": 320, - "height": 640 - }, + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, 
"controls": { "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "z", + "rotate": "ArrowUp", "drop": "Space" }, - "start_mechanism": "auto", - "score_element_found": true + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 }, "tests": [ { @@ -27,67 +23,67 @@ { "name": "game_starts", "pass": true, - "detail": "started via auto" + "detail": "started via click_canvas" }, { "name": "auto_drop", - "pass": true, - "detail": "grid state changed after 5s with no input" + "pass": false, + "detail": "grid reader unreliable, cannot verify auto-drop" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_down", "pass": false, - "detail": "no change detected after key press" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" }, { "name": "all_pieces_rotate", - "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "pass": false, + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", - "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "pass": false, + "detail": "grid reader unreliable, cannot verify hard drop" }, { "name": "piece_locks", - "pass": true, - "detail": "filled cells persist at bottom" + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", - "pass": true, - "detail": "new piece detected at top of grid" + "pass": false, + "detail": "could 
not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", - "pass": true, - "detail": "game still responding after 10 piece drops" + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": true, - "detail": "51 line(s) cleared during AI play" + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "no score element found" }, { "name": "game_over", @@ -96,32 +92,38 @@ }, { "name": "playable_30s", - "pass": true, - "detail": "played for 30s, placed 82 pieces, no crashes" + "pass": false, + "detail": "only played for 7s" } ], "summary": { "total": 16, - "passed": 14, - "failed": 2, - "score": 0.88 + "passed": 3, + "failed": 13, + "score": 0.19 }, "gameplay": { - "pieces_placed": 123, - "lines_cleared": 175, + "pieces_placed": 26, + "lines_cleared": 0, "max_score_observed": 0, - "play_duration_seconds": 30, + "play_duration_seconds": 7, "errors_during_play": 0 }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, "performance": { "load_time_ms": 19 }, "accessibility": { - "issues": [ - "canvas without aria-label or role", - "canvas without aria-label or role" - ], - "issue_count": 2, - "pass": false + "issues": [], + "issue_count": 0, + "pass": true } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -26,8 +26,20 @@ "score": 0.75 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-eb26nye_/loop-bench-_a_0urqo', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 20120, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -109,142 +121,22 @@ }, "gameplay_bot": { "pass": false, - "score": 0.94, - "total": 16, - "passed": 15, - "failed": 1, - "report": { - "implementation": { - "renderer": "canvas", - "grid_detected": true, - "grid_bounds": { - "x": 0, - "y": 0, - "width": 300, - "height": 600 - }, - "controls": { - "left": "ArrowLeft", - "right": "ArrowRight", - "down": "ArrowDown", - "rotate": "z", - "drop": "Space" - }, - "start_mechanism": "auto", - "score_element_found": true - }, - "tests": [ - { - "name": "game_loads", - "pass": true, - "detail": "no console errors" - }, - { - "name": "game_starts", - "pass": true, - "detail": "started via auto" - }, - { - "name": "auto_drop", - "pass": true, - "detail": "grid state changed after 5s with no input" - }, - { - "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" - }, - { - "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" - }, - { - "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" - }, - { - "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" - }, - { - "name": "all_pieces_rotate", - "pass": true, - "detail": "rotated: [other] failed: [] 
(tested 1 piece types in 60 attempts)" - }, - { - "name": "hard_drop", - "pass": true, - "detail": "piece immediately dropped and new piece appeared" - }, - { - "name": "piece_locks", - "pass": true, - "detail": "filled cells persist at bottom" - }, - { - "name": "new_piece_spawns", - "pass": true, - "detail": "new piece detected at top of grid" - }, - { - "name": "multiple_pieces", - "pass": true, - "detail": "game still responding after 10 piece drops" - }, - { - "name": "line_clear", - "pass": true, - "detail": "9 line(s) cleared during AI play" - }, - { - "name": "score_changes", - "pass": true, - "detail": "score changed from 192 to 200" - }, - { - "name": "game_over", - "pass": false, - "detail": "could not trigger or detect game over" - }, - { - "name": "playable_30s", - "pass": true, - "detail": "played for 30s, placed 83 pieces, no crashes" - } - ], - "summary": { - "total": 16, - "passed": 15, - "failed": 1, - "score": 0.94 - }, - "gameplay": { - "pieces_placed": 124, - "lines_cleared": 36, - "max_score_observed": 238, - "play_duration_seconds": 30, - "errors_during_play": 0 - }, - "performance": { - "load_time_ms": 35 - }, - "accessibility": { - "issues": [ - "canvas without aria-label or role", - "canvas without aria-label or role" - ], - "issue_count": 2, - "pass": false - } - } + "score": 0, + "error": "Gameplay bot timed out after 180 seconds" }, - "outcome_score": 0.47, - "score": 0.47, + "outcome_score": 0.15, + "score": 0.15, "sonarqube": { - "error": "SonarQube scan timed out", - "score": 0 + "bugs": 2, + "vulnerabilities": 0, + "code_smells": 14, + "cognitive_complexity": 145, + "lines_of_code": 1250, + "duplication_pct": 5.9, + "tech_debt_minutes": 170, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.3 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -1,22 +1,18 @@ { "implementation": { - "renderer": "canvas", - "grid_detected": true, - "grid_bounds": { - "x": 0, - "y": 0, - "width": 300, - "height": 600 - }, + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, "controls": { "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "z", + "rotate": "ArrowUp", "drop": "Space" }, - "start_mechanism": "auto", - "score_element_found": true + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 }, "tests": [ { @@ -27,101 +23,107 @@ { "name": "game_starts", "pass": true, - "detail": "started via auto" + "detail": "started via click_canvas" }, { "name": "auto_drop", - "pass": true, - "detail": "grid state changed after 5s with no input" + "pass": false, + "detail": "grid reader unreliable, cannot verify auto-drop" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "rotate", - "pass": true, - "detail": "piece shape 
changed after rotate key" + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" }, { "name": "all_pieces_rotate", - "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "pass": false, + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", - "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "pass": false, + "detail": "grid reader unreliable, cannot verify hard drop" }, { "name": "piece_locks", - "pass": true, - "detail": "filled cells persist at bottom" + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", - "pass": true, - "detail": "new piece detected at top of grid" + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", - "pass": true, - "detail": "game still responding after 10 piece drops" + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": true, - "detail": "9 line(s) cleared during AI play" + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", - "pass": true, - "detail": "score changed from 192 to 200" + "pass": false, + "detail": "no score element found" }, { "name": "game_over", - "pass": false, - "detail": "could not trigger or detect game over" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": true, - "detail": "played for 30s, placed 83 pieces, no crashes" + "pass": false, + "detail": "only played for 7s" } ], "summary": { "total": 16, - "passed": 15, - "failed": 1, - "score": 0.94 + "passed": 3, + "failed": 13, + "score": 0.19 }, "gameplay": { - "pieces_placed": 124, - "lines_cleared": 36, - "max_score_observed": 238, - "play_duration_seconds": 30, + "pieces_placed": 26, + "lines_cleared": 0, + 
"max_score_observed": 0, + "play_duration_seconds": 7, "errors_during_play": 0 }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, "performance": { - "load_time_ms": 35 + "load_time_ms": 22 }, "accessibility": { - "issues": [ - "canvas without aria-label or role", - "canvas without aria-label or role" - ], - "issue_count": 2, - "pass": false + "issues": [], + "issue_count": 0, + "pass": true } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -43,8 +43,60 @@ "score": 0.67 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-x_a00wq5/loop-bench-gmilst_1', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 7, + "code": 3, + "docs": 0, + "unnecessary": 0, + "unnecessary_list": [] + }, + "lines_of_code": 1657, + "dependencies": { + "production": 0, + "dev": 7, + "total": 7 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 28, + "excessive": true + }, + "function_length": { + "count": 55, + "average": 6.6, + "max": 26, + "long_functions": 0 + }, + "max_nesting_depth": 10, + "global_declarations": 18, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 100.0, + "camel_case": 328, + "snake_case": 0 + }, + "error_handling": { + 
"try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 101, + "source_lines": 781, + "ratio_pct": 12.9 + }, + "separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 2, + "files_with_both": 2 + }, + "html_validation": { + "valid": false, + "errors": 0 + }, + "duplication_percentage": 0.0, + "score": 0.5 }, "transcript_analysis": { "total_events": 74, @@ -70,13 +122,164 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.88, + "total": 16, + "passed": 14, + "failed": 2, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation observed, piece types seen: [unknown, Z]" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + 
{ + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "5 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": false, + "detail": "could not trigger or detect game over" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 44 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 14, + "failed": 2, + "score": 0.88 + }, + "gameplay": { + "pieces_placed": 44, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 279, + "events_count": 9, + "pieces_spawned": 5, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "Z" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 19 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.715, + "score": 0.715, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 16, + "cognitive_complexity": 170, + "lines_of_code": 1440, + "duplication_pct": 49.3, + "tech_debt_minutes": 48, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.55 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=detailed_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -58,7 +58,7 @@ { "name": "all_pieces_rotate", "pass": true, - "detail": "rotation confirmed but could not identify individual piece types" + "detail": "rotation observed, piece types seen: [unknown, Z]" }, { "name": "hard_drop", @@ -68,7 +68,7 @@ { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom (grid-verified, 1 lock event(s))" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", @@ -98,7 +98,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 31s, placed 37 pieces, no crashes" + "detail": "played for 30s, placed 44 pieces, no crashes" } ], "summary": { @@ -108,25 +108,26 @@ "score": 0.88 }, "gameplay": { - "pieces_placed": 37, + "pieces_placed": 44, "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 31, + "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 145, + "frames": 279, "events_count": 9, "pieces_spawned": 5, "pieces_locked": 11, "lines_cleared": 1, "piece_types_seen": [ - "unknown" + "unknown", + "Z" ], "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 447 + "load_time_ms": 19 }, "accessibility": { "issues": [ diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -26,12 +26,76 @@ "score": 0.75 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-t3fuj7u6/loop-bench-_7ks5zgn', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 16947, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { - "error": "Command '['python3', '/root/loop-benchmarking/tasks/tetris/eval/code-analysis.py', '/tmp/reeval-t3fuj7u6/loop-bench-_7ks5zgn', 'typescript']' timed out after 120 seconds", - "score": 0 + "files": { + "total": 7, + "code": 3, + "docs": 0, + "unnecessary": 0, + "unnecessary_list": [] + }, + "lines_of_code": 1375, + "dependencies": { + "production": 0, + "dev": 7, + "total": 7 + }, + "complexity": "moderate", + "console_logs": 0, + "magic_numbers": { + "count": 66, + "excessive": true + }, + "function_length": { + "count": 58, + "average": 7.6, + "max": 27, + "long_functions": 0 + }, + "max_nesting_depth": 14, + "global_declarations": 30, + "naming": { + "dominant_style": "camelCase", + "consistency_pct": 92.4, + "camel_case": 363, + "snake_case": 30 + }, + "error_handling": { + "try_catch_blocks": 0, + "has_error_handling": false + }, + "comments": { + "comment_lines": 72, + "source_lines": 1035, + "ratio_pct": 7.0 + }, + 
"separation_of_concerns": { + "verdict": "mixed", + "files_with_rendering": 2, + "files_with_logic": 2, + "files_with_both": 2 + }, + "html_validation": { + "valid": false, + "errors": 2 + }, + "duplication_percentage": 0.0, + "score": 0.85 }, "transcript_analysis": { "total_events": 55, @@ -57,13 +121,165 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.88, + "total": 16, + "passed": 14, + "failed": 2, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 320, + "height": 640 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + 
"pass": true, + "detail": "6 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 324 to 342" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 37 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 14, + "failed": 2, + "score": 0.88 + }, + "gameplay": { + "pieces_placed": 37, + "lines_cleared": 1, + "max_score_observed": 342, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 299, + "events_count": 10, + "pieces_spawned": 6, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "I", + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 89 + }, + "accessibility": { + "issues": [ + "no headings found", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.715, + "score": 0.715, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 5, + "cognitive_complexity": 144, + "lines_of_code": 638, + "duplication_pct": 0.0, + "tech_debt_minutes": 45, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.55 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -12,7 +12,7 @@ "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "x", + "rotate": "ArrowUp", "drop": "Space" }, "start_mechanism": "auto", @@ -52,13 +52,13 @@ }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", - "pass": true, - "detail": "rotation confirmed but could not identify individual piece types" + "pass": false, + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", @@ -87,8 +87,8 @@ }, { "name": "score_changes", - "pass": false, - "detail": "score stayed at 296" + "pass": true, + "detail": "score changed from 324 to 342" }, { "name": "game_over", @@ -98,39 +98,44 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 31 pieces, no crashes" + "detail": "played for 30s, placed 37 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 15, - "failed": 1, - "score": 0.94 + "passed": 14, + "failed": 2, + "score": 0.88 }, "gameplay": { - "pieces_placed": 31, + "pieces_placed": 37, "lines_cleared": 1, - "max_score_observed": 296, + "max_score_observed": 342, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 159, - "events_count": 11, + "frames": 299, + "events_count": 10, "pieces_spawned": 6, "pieces_locked": 11, "lines_cleared": 1, "piece_types_seen": [ + "I", "unknown" ], "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 4276 + "load_time_ms": 89 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "no headings found", + 
"canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -26,8 +26,20 @@ "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-i4ymkfiq/loop-bench-hdcmmx_h', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 25381, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -80,10 +92,10 @@ }, "html_validation": { "valid": false, - "errors": 6 + "errors": 0 }, "duplication_percentage": 0.0, - "score": 0.85 + "score": 0.5 }, "transcript_analysis": { "total_events": 57, @@ -109,13 +121,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 40, + "height": 80 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": 
"game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 0 to 144" + }, + { + "name": "game_over", + "pass": false, + "detail": "could not trigger or detect game over" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 144, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 345, + "events_count": 
1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 32 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.405, + "score": 0.405, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 7, + "cognitive_complexity": 144, + "lines_of_code": 780, + "duplication_pct": 0.0, + "tech_debt_minutes": 35, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,63 +32,63 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" 
+ "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", - "pass": true, - "detail": "visual change detected after hard drop" + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", - "pass": true, - "detail": "visual change suggests new piece spawned" + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", - "pass": true, - "detail": "game still responding after 10 piece drops" + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": true, - "detail": "score changed from 138 to 246" + "detail": "score changed from 0 to 144" }, { "name": "game_over", @@ -97,24 +98,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 77 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 13, - "failed": 3, - "score": 0.81 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 187, + "pieces_placed": 20, "lines_cleared": 1, - 
"max_score_observed": 66, + "max_score_observed": 144, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 345, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 28 + "load_time_ms": 32 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -26,8 +26,20 @@ "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-d59z4ylb/loop-bench-udnn4aje', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 17708, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -109,13 +121,163 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "enter", + "score_element_found": true, + 
"grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via enter" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "5 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 0 to 368" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 37 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 37, + "lines_cleared": 0, + "max_score_observed": 0, + 
"play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 303, + "events_count": 10, + "pieces_spawned": 5, + "pieces_locked": 11, + "lines_cleared": 0, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 34 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.865, + "score": 0.865, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 2, + "cognitive_complexity": 136, + "lines_of_code": 653, + "duplication_pct": 0.0, + "tech_debt_minutes": 48, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.79 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=off_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -12,11 +12,12 @@ "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "x", + "rotate": "ArrowUp", "drop": "Space" }, "start_mechanism": "enter", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed 
after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "5 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "game still responding after 10 piece drops" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", - "pass": true, - "detail": "11 line(s) cleared during AI play" + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": true, - "detail": "score changed from 96 to 192" + "detail": "score changed from 0 to 368" }, { "name": "game_over", @@ -97,24 +98,35 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 83 pieces, no crashes" + "detail": "played for 30s, placed 37 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 16, - "failed": 0, - "score": 1 + 
"passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { - "pieces_placed": 124, - "lines_cleared": 42, - "max_score_observed": 216, + "pieces_placed": 37, + "lines_cleared": 0, + "max_score_observed": 0, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 303, + "events_count": 10, + "pieces_spawned": 5, + "pieces_locked": 11, + "lines_cleared": 0, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 40 + "load_time_ms": 34 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -26,8 +26,20 @@ "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-wnxcksi7/loop-bench-ymvhffsl', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 13233, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -112,10 +124,19 @@ "score": 0, "error": "Gameplay bot timed out after 180 seconds" }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.46, + "score": 0.46, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 1, + "cognitive_complexity": 61, + "lines_of_code": 478, + "duplication_pct": 0.0, + "tech_debt_minutes": 
5, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.92 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -26,8 +26,20 @@ "score": 0.75 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-4g5vr4fy/loop-bench-9osp3kim', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 15345, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -80,10 +92,10 @@ }, "html_validation": { "valid": false, - "errors": 0 + "errors": 2 }, "duplication_percentage": 0.0, - "score": 0.5 + "score": 0.85 }, "transcript_analysis": { "total_events": 52, @@ -109,13 +121,164 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": 
true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation observed, piece types seen: [I, unknown]" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "4 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 246" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 37 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 37, + "lines_cleared": 1, + "max_score_observed": 246, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 305, + "events_count": 11, + "pieces_spawned": 4, + 
"pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "I", + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 87 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.915, + "score": 0.915, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 2, + "cognitive_complexity": 73, + "lines_of_code": 555, + "duplication_pct": 0.0, + "tech_debt_minutes": 6, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.89 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -12,11 +12,12 @@ "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "x", + "rotate": "ArrowUp", "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": 
"grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation observed, piece types seen: [I, unknown]" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "4 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 20 -> 44" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "1 line(s) cleared during AI play" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [166] -> no change after polling" + "detail": "score stayed at 246" }, { "name": "game_over", @@ -97,7 +98,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 79 pieces, no crashes" + "detail": "played for 30s, placed 37 pieces, no crashes" } ], "summary": { @@ -107,14 +108,26 @@ "score": 0.94 }, "gameplay": { - "pieces_placed": 120, - "lines_cleared": 2, - "max_score_observed": 142, + "pieces_placed": 37, + "lines_cleared": 1, + "max_score_observed": 246, "play_duration_seconds": 30, 
"errors_during_play": 0 }, + "session": { + "frames": 305, + "events_count": 11, + "pieces_spawned": 4, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "I", + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 76 + "load_time_ms": 87 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -1,11 +1,45 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-0_h4aplp/loop-bench-5xgkq0xm', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-0_h4aplp/loop-bench-5xgkq0xm', 'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 22016, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { 
"files": { @@ -87,13 +121,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 60, + "height": 120 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "enter", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via enter" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared 
(grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 178" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 178, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 346, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 111 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.475, + "score": 0.475, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 2, + "cognitive_complexity": 111, + "lines_of_code": 754, + "duplication_pct": 0.0, + "tech_debt_minutes": 10, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.64 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=off_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 
+16,8 @@ "drop": "Space" }, "start_mechanism": "enter", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,63 +32,63 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, 
{ "name": "score_changes", "pass": false, - "detail": "score did not increase: [356] -> no change after polling" + "detail": "score stayed at 178" }, { "name": "game_over", @@ -97,28 +98,41 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 31s, placed 48 pieces, no crashes" + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 10, - "failed": 6, - "score": 0.63 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 158, + "pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 0, - "play_duration_seconds": 31, + "max_score_observed": 178, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 346, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 1092 + "load_time_ms": 111 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -26,8 +26,20 @@ "score": 0.75 }, "quality": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/quality.sh', '/tmp/reeval-6xzte1vr/loop-bench-05evwwhn', 
'typescript']' timed out after 120 seconds" + "lint": { + "pass": true, + "errors": 0, + "warnings": 0 + }, + "typecheck": { + "pass": true + }, + "performance": { + "pass": true, + "bundle_size_bytes": 16576, + "size_under_512kb": true + }, + "score": 1.0 }, "code_analysis": { "files": { @@ -80,10 +92,10 @@ }, "html_validation": { "valid": false, - "errors": 0 + "errors": 2 }, "duplication_percentage": 0.0, - "score": 0.5 + "score": 0.85 }, "transcript_analysis": { "total_events": 61, @@ -109,13 +121,164 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation observed, piece types seen: [unknown, L]" + }, + { + "name": "hard_drop", + "pass": true, + "detail": 
"piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "3 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 332" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 46 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 46, + "lines_cleared": 1, + "max_score_observed": 332, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 279, + "events_count": 11, + "pieces_spawned": 3, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "L" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 42 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.76, + "score": 0.76, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 4, + "cognitive_complexity": 113, + "lines_of_code": 541, + "duplication_pct": 0.0, + "tech_debt_minutes": 40, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.58 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation observed, piece types seen: [unknown, L]" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at 
bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected after drop" + "detail": "3 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 20 -> 40" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [162] -> no change after polling" + "detail": "score stayed at 332" }, { "name": "game_over", @@ -97,7 +98,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 26s, placed 100 pieces, no crashes" + "detail": "played for 30s, placed 46 pieces, no crashes" } ], "summary": { @@ -107,14 +108,26 @@ "score": 0.94 }, "gameplay": { - "pieces_placed": 151, + "pieces_placed": 46, "lines_cleared": 1, - "max_score_observed": 132, - "play_duration_seconds": 26, + "max_score_observed": 332, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 279, + "events_count": 11, + "pieces_spawned": 3, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "L" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 937 + "load_time_ms": 42 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -1,25 +1,45 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-ldeahnr5/loop-bench-_vpevx35', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { "lint": { - "pass": false, - "errors": -1, - "warnings": 0, - "error": "eslint failed to run" + "pass": true, + "errors": 0, + "warnings": 0 }, "typecheck": { - "pass": false, - "errors": 0 + "pass": true }, "performance": { "pass": true, "bundle_size_bytes": 21434, "size_under_512kb": true }, - "score": 0.33 + "score": 1.0 }, "code_analysis": { "files": { @@ -72,10 +92,10 @@ }, "html_validation": { "valid": false, - "errors": 0 + "errors": 2 }, "duplication_percentage": 0.0, - "score": 0.5 + "score": 0.85 }, "transcript_analysis": { "total_events": 63, @@ -104,10 +124,19 @@ "score": 0, "error": "Gameplay bot timed out after 180 seconds" }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.25, + "score": 0.25, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 16, + "cognitive_complexity": 133, + "lines_of_code": 687, + "duplication_pct": 0.0, + "tech_debt_minutes": 82, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No 
newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -1,7 +1,29 @@ { "structural": { - "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-r14ej_nz/loop-bench-ofmivkls', 'typescript']' timed out after 120 seconds" + "pass": true, + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "npm run build completed successfully" + }, + { + "name": "typescript_compiles", + "pass": true, + "detail": "tsc --noEmit passed" + } + ], + "score": 1.0 }, "quality": { "lint": { @@ -10,15 +32,14 @@ "warnings": 0 }, "typecheck": { - "pass": false, - "errors": 0 + "pass": true }, "performance": { "pass": true, "bundle_size_bytes": 15383, "size_under_512kb": true }, - "score": 0.67 + "score": 1.0 }, "code_analysis": { "files": { @@ -71,10 +92,10 @@ }, "html_validation": { "valid": false, - "errors": 0 + "errors": 3 }, "duplication_percentage": 0.0, - "score": 0.5 + "score": 0.85 }, "transcript_analysis": { "total_events": 49, @@ -100,13 +121,163 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": 
true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "4 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 
30s, placed 37 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 37, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 305, + "events_count": 10, + "pieces_spawned": 4, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 40 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.695, + "score": 0.695, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 2, + "vulnerabilities": 0, + "code_smells": 10, + "cognitive_complexity": 69, + "lines_of_code": 594, + "duplication_pct": 0.0, + "tech_debt_minutes": 34, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.45 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=off_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed 
after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", - "pass": false, - "detail": "no change detected after key press" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [unknown] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "4 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 36 -> 44" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "10 line(s) cleared during AI play" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", @@ -96,29 +97,43 @@ }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + 
"pass": true, + "detail": "played for 30s, placed 37 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 13, - "failed": 3, - "score": 0.81 + "passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { - "pieces_placed": 92, - "lines_cleared": 10, + "pieces_placed": 37, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 305, + "events_count": 10, + "pieces_spawned": 4, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 1356 + "load_time_ms": 40 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -1,7 +1,29 @@ { "structural": { "pass": false, - "error": "Command '['bash', '/root/loop-benchmarking/tasks/tetris/eval/structural.sh', '/tmp/reeval-wuh6htkd/loop-bench-q0okqwr1', 'typescript']' timed out after 120 seconds" + "checks": [ + { + "name": "entry_point_exists", + "pass": true, + "detail": "index.html found" + }, + { + "name": "package_json_exists", + "pass": true, + "detail": "package.json found" + }, + { + "name": "build_succeeds", + "pass": true, + "detail": "no build script defined (static 
project)" + }, + { + "name": "typescript_compiles", + "pass": false, + "detail": "tsc --noEmit failed" + } + ], + "score": 0.75 }, "quality": { "lint": { @@ -99,13 +121,167 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation observed, piece types seen: [unknown, T, Z, J]" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "6 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": 
true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 394" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 42 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 42, + "lines_cleared": 1, + "max_score_observed": 394, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 294, + "events_count": 10, + "pieces_spawned": 6, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "T", + "Z", + "J" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 31 + }, + "accessibility": { + "issues": [ + "no headings found", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.845, + "score": 0.845, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 8, + "cognitive_complexity": 80, + "lines_of_code": 516, + "duplication_pct": 0.0, + "tech_debt_minutes": 33, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.75 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 +33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", - "pass": false, - "detail": "no change detected after rotate key" + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation observed, piece types seen: [unknown, T, Z, J]" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "6 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", 
"pass": true, - "detail": "grid accumulated cells: 16 -> 49" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "2 line(s) cleared during AI play" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [222] -> no change after polling" + "detail": "score stayed at 394" }, { "name": "game_over", @@ -97,24 +98,38 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 79 pieces, no crashes" + "detail": "played for 30s, placed 42 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 14, - "failed": 2, - "score": 0.88 + "passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { - "pieces_placed": 120, - "lines_cleared": 2, - "max_score_observed": 222, + "pieces_placed": 42, + "lines_cleared": 1, + "max_score_observed": 394, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 294, + "events_count": 10, + "pieces_spawned": 6, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "T", + "Z", + "J" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 2944 + "load_time_ms": 31 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -121,13 +121,163 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.88, + "total": 16, + 
"passed": 14, + "failed": 2, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "1 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 302" + }, + { + "name": "game_over", + 
"pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 37 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 14, + "failed": 2, + "score": 0.88 + }, + "gameplay": { + "pieces_placed": 37, + "lines_cleared": 0, + "max_score_observed": 302, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 304, + "events_count": 9, + "pieces_spawned": 1, + "pieces_locked": 11, + "lines_cleared": 0, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 29 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.69, + "score": 0.69, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 12, + "cognitive_complexity": 101, + "lines_of_code": 680, + "duplication_pct": 0.0, + "tech_debt_minutes": 50, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,62 
+33,62 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [I] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "1 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "grid accumulated cells: 16 -> 36" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", - "pass": true, - "detail": "line cleared via strategic placement" + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [294] -> no change after polling" + "detail": "score stayed at 302" }, { "name": "game_over", @@ 
-97,24 +98,35 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 70 pieces, no crashes" + "detail": "played for 30s, placed 37 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 15, - "failed": 1, - "score": 0.94 + "passed": 14, + "failed": 2, + "score": 0.88 }, "gameplay": { - "pieces_placed": 121, - "lines_cleared": 1, - "max_score_observed": 186, + "pieces_placed": 37, + "lines_cleared": 0, + "max_score_observed": 302, "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 304, + "events_count": 9, + "pieces_spawned": 1, + "pieces_locked": 11, + "lines_cleared": 0, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 358 + "load_time_ms": 29 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -121,10 +121,10 @@ }, "gameplay_bot": { "pass": false, - "score": 0.56, + "score": 0.38, "total": 16, - "passed": 9, - "failed": 7, + "passed": 6, + "failed": 10, "report": { "implementation": { "renderer": "canvas", @@ -143,7 +143,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -158,63 +159,63 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - 
"pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "score did not increase: [176] -> no change after polling" + "pass": true, + "detail": "score changed from 168 to 186" }, { "name": "game_over", @@ -223,37 +224,59 @@ }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + 
"detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 9, - "failed": 7, - "score": 0.56 + "passed": 6, + "failed": 10, + "score": 0.38 }, "gameplay": { - "pieces_placed": 104, + "pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 0, - "play_duration_seconds": 0, + "max_score_observed": 186, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 346, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 1405 + "load_time_ms": 48 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false } } }, - "outcome_score": 0.56, - "score": 0.56, + "outcome_score": 0.57, + "score": 0.57, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 3, + "cognitive_complexity": 151, + "lines_of_code": 668, + "duplication_pct": 0.0, + "tech_debt_minutes": 30, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.76 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=off_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": 
true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,63 +32,63 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "score did not increase: 
[176] -> no change after polling" + "pass": true, + "detail": "score changed from 168 to 186" }, { "name": "game_over", @@ -96,29 +97,42 @@ }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 9, - "failed": 7, - "score": 0.56 + "passed": 6, + "failed": 10, + "score": 0.38 }, "gameplay": { - "pieces_placed": 104, + "pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 0, - "play_duration_seconds": 0, + "max_score_observed": 186, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 346, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 1405 + "load_time_ms": 48 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run1/eval_results.json @@ -121,13 +121,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + 
"implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 40, + "height": 80 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "space", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via space" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": 
"playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 349, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 27 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.455, + "score": 0.455, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 12, + "cognitive_complexity": 72, + "lines_of_code": 574, + "duplication_pct": 0.0, + "tech_debt_minutes": 41, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.6 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "space", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,72 +33,72 @@ { "name": "auto_drop", "pass": false, - "detail": 
"piece did not move in 5 seconds" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": 
true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { @@ -107,18 +108,31 @@ "score": 0.31 }, "gameplay": { - "pieces_placed": 101, + "pieces_placed": 20, "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 349, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 702 + "load_time_ms": 27 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run2/eval_results.json @@ -121,13 +121,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 33, + "height": 66 + }, + "controls": { + "left": "ArrowLeft", + 
"right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 0 to 182" + }, + { + "name": "game_over", + "pass": false, + "detail": "could not trigger or detect game over" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, 
+ "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 349, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 37 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.405, + "score": 0.405, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 18, + "cognitive_complexity": 137, + "lines_of_code": 806, + "duplication_pct": 0.0, + "tech_debt_minutes": 51, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "click_canvas", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,93 +33,106 @@ { "name": "auto_drop", "pass": false, - "detail": "piece did not move in 5 seconds" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - 
"detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", - "pass": true, - "detail": "visual change detected after hard drop" + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", - "pass": true, - "detail": "visual change suggests new piece spawned" + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", - "pass": true, - "detail": "game still responding after 10 piece drops" + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": false, - "detail": "exception: keyboard.press: Test timeout of 180000ms exceeded." 
+ "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "could not read score element" + "pass": true, + "detail": "score changed from 0 to 182" }, { "name": "game_over", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "could not trigger or detect game over" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 7, - "failed": 9, - "score": 0.44 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 100, - "lines_cleared": 0, + "pieces_placed": 20, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 349, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 1304 + "load_time_ms": 37 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run3/eval_results.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run3/eval_results.json @@ -121,13 +121,161 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.44, + "total": 16, + "passed": 7, + "failed": 9, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 56, + "height": 112 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 1 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not 
detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 202 to 226" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 7, + "failed": 9, + "score": 0.44 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 226, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 350, + "events_count": 3, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 38 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.65, + "score": 0.65, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 3, + "cognitive_complexity": 72, + "lines_of_code": 647, + "duplication_pct": 0.0, + "tech_debt_minutes": 20, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.86 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run3/gameplay-bot-report.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=off_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,94 +32,106 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "grid state changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", - "pass": false, - "detail": "could not verify piece locking at bottom" + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 1 lock event(s))" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid 
reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": true, - "detail": "151 line(s) cleared during AI play" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "score did not increase: [360] -> no change after polling" + "pass": true, + "detail": "score changed from 202 to 226" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 8, - "failed": 8, - "score": 0.5 + "passed": 7, + "failed": 9, + "score": 0.44 }, "gameplay": { - "pieces_placed": 100, - "lines_cleared": 151, - "max_score_observed": 0, - "play_duration_seconds": 0, + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 226, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 350, + "events_count": 3, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 3913 + "load_time_ms": 38 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run1/eval_results.json @@ -124,10 +124,19 @@ "score": 0, "error": "Gameplay bot timed out after 180 seconds" }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.275, + "score": 0.275, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 14, + "cognitive_complexity": 271, + "lines_of_code": 1230, + "duplication_pct": 14.9, + "tech_debt_minutes": 101, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.55 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run2/eval_results.json @@ -122,13 +122,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.38, + "total": 16, + "passed": 6, + "failed": 10, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 40, + "height": 80 + }, + 
"controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 200 to 232" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 6, 
+ "failed": 10, + "score": 0.38 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 232, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 351, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 91 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.465, + "score": 0.465, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 14, + "cognitive_complexity": 271, + "lines_of_code": 1230, + "duplication_pct": 14.9, + "tech_debt_minutes": 101, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.55 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run2/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,94 +32,107 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { 
"name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": false, - "detail": "exception: keyboard.press: Test timeout of 180000ms exceeded." 
+ "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "could not read score element" + "pass": true, + "detail": "score changed from 200 to 232" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 7, - "failed": 9, - "score": 0.44 + "passed": 6, + "failed": 10, + "score": 0.38 }, "gameplay": { - "pieces_placed": 80, - "lines_cleared": 0, - "max_score_observed": 0, - "play_duration_seconds": 0, + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 232, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 351, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 1061 + "load_time_ms": 91 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run3/eval_results.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run3/eval_results.json @@ -121,15 +121,20 @@ }, "gameplay_bot": { "pass": false, - "score": 0, + "score": 0.94, "total": 16, - "passed": 0, - "failed": 16, + "passed": 15, + "failed": 1, "report": { "implementation": { - "renderer": "unknown", - "grid_detected": false, - "grid_bounds": null, + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, "controls": { "left": "ArrowLeft", "right": "ArrowRight", @@ -137,128 +142,144 @@ "rotate": "ArrowUp", "drop": "Space" }, - "start_mechanism": "unknown", + "start_mechanism": "auto", "score_element_found": false, - "grid_confidence": 0 + "grid_confidence": 1 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "started via auto" }, { "name": "auto_drop", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until 
\"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to 
\"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "rotation observed, piece types seen: [unknown, J, Z]" }, { "name": "hard_drop", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "4 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "11 pieces placed during play session" }, { "name": "line_clear", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - 
navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "no score element found" }, { "name": "game_over", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "played for 30s, placed 42 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { - "pieces_placed": 0, - "lines_cleared": 0, + "pieces_placed": 42, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 0, - "events_count": 0, - "pieces_spawned": 0, - "pieces_locked": 0, - "lines_cleared": 0, - "piece_types_seen": [], - "grid_read_success_rate": 0 + "frames": 289, + "events_count": 10, + "pieces_spawned": 4, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "J", + "Z" + ], + "grid_read_success_rate": 
1 }, "performance": { - "load_time_ms": -1 + "load_time_ms": 62 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false } } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.72, + "score": 0.72, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 12, + "cognitive_complexity": 102, + "lines_of_code": 588, + "duplication_pct": 0.0, + "tech_debt_minutes": 72, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=off_web_search=on_run3/gameplay-bot-report.json @@ -1,8 +1,13 @@ { "implementation": { - "renderer": "unknown", - "grid_detected": false, - "grid_bounds": null, + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, "controls": { "left": "ArrowLeft", "right": "ArrowRight", @@ -10,120 +15,127 @@ "rotate": "ArrowUp", "drop": "Space" }, - "start_mechanism": "unknown", + "start_mechanism": "auto", "score_element_found": false, - "grid_confidence": 0 + "grid_confidence": 1 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall 
log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "started via auto" }, { "name": "auto_drop", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to 
\"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "rotation observed, piece types seen: [unknown, J, Z]" }, { "name": "hard_drop", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to 
\"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "4 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "11 pieces placed during play session" }, { "name": "line_clear", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "no score element found" }, { "name": "game_over", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:33953/\" is interrupted by another navigation to \"http://127.0.0.1:33953/index.html\"\nCall log:\n\u001b[2m - navigating to 
\"http://127.0.0.1:33953/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "played for 30s, placed 42 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { - "pieces_placed": 0, - "lines_cleared": 0, + "pieces_placed": 42, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 0, - "events_count": 0, - "pieces_spawned": 0, - "pieces_locked": 0, - "lines_cleared": 0, - "piece_types_seen": [], - "grid_read_success_rate": 0 + "frames": 289, + "events_count": 10, + "pieces_spawned": 4, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "J", + "Z" + ], + "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": -1 + "load_time_ms": 62 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/eval_results.json @@ -121,13 +121,166 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.13, + "total": 16, + "passed": 2, + "failed": 14, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": 
true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 36, + "height": 72 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": false, + "detail": "1 console error(s): Cannot read properties of undefined (reading '0')" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": false, + "detail": "could not trigger or detect game over" + }, + { + "name": "playable_30s", + 
"pass": false, + "detail": "5 console error(s), 0 play errors" + } + ], + "summary": { + "total": 16, + "passed": 2, + "failed": 14, + "score": 0.13 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 348, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 61 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 7, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.325, + "score": 0.325, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 6, + "cognitive_complexity": 152, + "lines_of_code": 686, + "duplication_pct": 0.0, + "tech_debt_minutes": 62, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.52 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run1/gameplay-bot-report.json @@ -1,17 +1,23 @@ { "implementation": { - "renderer": "unknown", - "grid_detected": false, - 
"grid_bounds": null, + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 36, + "height": 72 + }, "controls": { "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "ArrowUp", + "rotate": "x", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -21,95 +27,104 @@ }, { "name": "game_starts", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "started via auto" }, { "name": "auto_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change detected after key press" }, { "name": "move_down", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change detected after key press" }, { "name": "rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "skipped: page did not load" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - 
"detail": "skipped: page did not load" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "skipped: page did not load" + "detail": "score stayed at 0" }, { "name": "game_over", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not trigger or detect game over" }, { "name": "playable_30s", "pass": false, - "detail": "skipped: page did not load" + "detail": "5 console error(s), 0 play errors" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 2, + "failed": 14, + "score": 0.13 }, "gameplay": { - "pieces_placed": 0, - "lines_cleared": 0, + "pieces_placed": 20, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 348, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 5864 + "load_time_ms": 61 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run2/eval_results.json @@ -121,144 +121,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, + "score": 0.31, "total": 16, - "passed": 0, - "failed": 16, + "passed": 5, + "failed": 11, "report": { 
"implementation": { - "renderer": "unknown", - "grid_detected": false, - "grid_bounds": null, + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 42, + "height": 84 + }, "controls": { "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "ArrowUp", + "rotate": "x", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false, - "grid_confidence": 0 + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "started via auto" }, { "name": "auto_drop", "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "no grid 
change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "no grid change detected after key press" }, { "name": "move_down", "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "no grid change detected after key press" }, { "name": "rotate", "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "page 
load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "score stayed at 0" }, { "name": "game_over", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is 
interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 0, - "lines_cleared": 0, + "pieces_placed": 20, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 0, - "events_count": 0, + "frames": 347, + "events_count": 2, "pieces_spawned": 0, - "pieces_locked": 0, - "lines_cleared": 0, + "pieces_locked": 10, + "lines_cleared": 1, "piece_types_seen": [], - "grid_read_success_rate": 0 + "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": -1 + "load_time_ms": 55 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false } } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.415, + "score": 0.415, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 6, + "cognitive_complexity": 152, + "lines_of_code": 686, + "duplication_pct": 0.0, + "tech_debt_minutes": 62, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.52 } } \ No newline at end 
of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run2/gameplay-bot-report.json @@ -1,129 +1,138 @@ { "implementation": { - "renderer": "unknown", - "grid_detected": false, - "grid_bounds": null, + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 42, + "height": 84 + }, "controls": { "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "ArrowUp", + "rotate": "x", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false, - "grid_confidence": 0 + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "started via auto" }, { "name": "auto_drop", "pass": false, - "detail": "page load failed: page.goto: Navigation to 
\"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "no grid change detected after key press" }, { "name": "move_down", "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "no grid change detected after key press" }, { "name": "rotate", "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall 
log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - 
navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "detail": "score stayed at 0" }, { "name": "game_over", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "page load failed: page.goto: Navigation to \"http://127.0.0.1:41317/\" is interrupted by another navigation to \"http://127.0.0.1:41317/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:41317/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 0, - "lines_cleared": 0, + "pieces_placed": 20, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 0, - "events_count": 0, + "frames": 347, + "events_count": 2, "pieces_spawned": 0, - "pieces_locked": 0, - "lines_cleared": 0, + "pieces_locked": 10, + "lines_cleared": 1, "piece_types_seen": [], - "grid_read_success_rate": 0 + "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": -1 + "load_time_ms": 55 }, "accessibility": { - 
"issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run3/eval_results.json @@ -121,13 +121,163 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.25, + "total": 16, + "passed": 4, + "failed": 12, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 40, + "height": 80 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "enter", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via enter" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + 
"name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": false, + "detail": "could not trigger or detect game over" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 4, + "failed": 12, + "score": 0.25 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 345, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 22 + }, + "accessibility": { + "issues": [ + "no headings found", + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 4, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.375, + "score": 0.375, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + 
"code_smells": 15, + "cognitive_complexity": 117, + "lines_of_code": 741, + "duplication_pct": 0.0, + "tech_debt_minutes": 66, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=off_run3/gameplay-bot-report.json @@ -1,119 +1,139 @@ { "implementation": { - "renderer": "unknown", - "grid_detected": false, - "grid_bounds": null, + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 40, + "height": 80 + }, "controls": { "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "ArrowUp", + "rotate": "x", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false + "start_mechanism": "enter", + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "exception: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:36761/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "started via enter" }, { "name": "auto_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "skipped: page did not load" + 
"detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change detected after key press" }, { "name": "move_down", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change detected after key press" }, { "name": "rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "skipped: page did not load" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "skipped: page did not load" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "skipped: page did not load" + "detail": "score stayed at 0" }, { "name": "game_over", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not trigger or detect game over" }, { "name": "playable_30s", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 4, + "failed": 12, + "score": 0.25 }, "gameplay": { - "pieces_placed": 0, - 
"lines_cleared": 0, + "pieces_placed": 20, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 345, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 4258 + "load_time_ms": 22 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "no headings found", + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 4, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -122,13 +122,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + 
"detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "6 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 226" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 37 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 37, + "lines_cleared": 1, + "max_score_observed": 226, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 299, + "events_count": 11, + "pieces_spawned": 6, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + 
], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 34 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role" + ], + "issue_count": 1, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.595, + "score": 0.595, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 2, + "vulnerabilities": 0, + "code_smells": 8, + "cognitive_complexity": 208, + "lines_of_code": 1260, + "duplication_pct": 21.7, + "tech_debt_minutes": 54, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.25 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,93 +33,106 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after 
key press (grid-verified)" }, { "name": "rotate", "pass": true, - "detail": "piece shape changed after rotate key" + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", "pass": true, - "detail": "rotated: [other] failed: [] (tested 1 piece types in 60 attempts)" + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "6 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", - "pass": false, - "detail": "exception: page.waitForTimeout: Test timeout of 180000ms exceeded." 
+ "pass": true, + "detail": "11 pieces placed during play session" }, { "name": "line_clear", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "could not read score element" + "detail": "score stayed at 226" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 37 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 11, - "failed": 5, - "score": 0.69 + "passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { - "pieces_placed": 1, - "lines_cleared": 0, - "max_score_observed": 0, - "play_duration_seconds": 0, + "pieces_placed": 37, + "lines_cleared": 1, + "max_score_observed": 226, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 299, + "events_count": 11, + "pieces_spawned": 6, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 7547 + "load_time_ms": 34 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role" + ], + "issue_count": 1, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -92,10 +92,10 @@ }, "html_validation": { "valid": false, - "errors": 9 + "errors": 0 }, "duplication_percentage": 0.0, - "score": 0.85 + "score": 0.5 }, "transcript_analysis": { "total_events": 45, @@ -124,10 +124,19 @@ "score": 0, "error": "Gameplay bot timed out after 180 seconds" }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.3, + "score": 0.3, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 11, + "cognitive_complexity": 93, + "lines_of_code": 638, + "duplication_pct": 0.0, + "tech_debt_minutes": 33, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.6 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -92,10 +92,10 @@ }, "html_validation": { "valid": false, - "errors": 0 + "errors": 3 }, "duplication_percentage": 0.0, - "score": 0.5 + "score": 0.85 }, "transcript_analysis": { "total_events": 74, @@ -121,11 +121,152 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": 
"canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 64, + "height": 128 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 1" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": 
"played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 1, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 348, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 77 + }, + "accessibility": { + "issues": [ + "no headings found", + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 4, + "pass": false + } + } }, - "outcome_score": 0.38, - "score": 0.38, + "outcome_score": 0.535, + "score": 0.535, "sonarqube": { "bugs": 0, "vulnerabilities": 0, diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -1,17 +1,23 @@ { "implementation": { - "renderer": "unknown", - "grid_detected": false, - "grid_bounds": null, + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 64, + "height": 128 + }, "controls": { "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "ArrowUp", + "rotate": "x", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ 
-21,99 +27,113 @@ }, { "name": "game_starts", - "pass": false, - "detail": "exception: page.screenshot: Timeout 10000ms exceeded.\nCall log:\n\u001b[2m - taking page screenshot\u001b[22m\n\u001b[2m - waiting for fonts to load...\u001b[22m\n" + "pass": true, + "detail": "started via auto" }, { "name": "auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "exception: page.screenshot: Target page, context or browser has been closed" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", 
"pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "exception: page.evaluate: Target page, context or browser has been closed" + "detail": "score stayed at 1" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 6, - "failed": 10, - "score": 0.38 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 19, - "lines_cleared": 0, - "max_score_observed": 0, - "play_duration_seconds": 0, + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 1, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 348, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 3882 + "load_time_ms": 77 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "no headings found", + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 4, + "pass": false } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -120,13 +120,153 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.19, + "total": 16, + "passed": 3, + "failed": 13, + "report": { + "implementation": { + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify auto-drop" + }, + { + "name": "move_left", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_right", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_down", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "rotate", + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": 
"grid reader unreliable, cannot verify hard drop" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": "score_changes", + "pass": false, + "detail": "no score element found" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": false, + "detail": "only played for 7s" + } + ], + "summary": { + "total": 16, + "passed": 3, + "failed": 13, + "score": 0.19 + }, + "gameplay": { + "pieces_placed": 26, + "lines_cleared": 0, + "max_score_observed": 0, + "play_duration_seconds": 7, + "errors_during_play": 0 + }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, + "performance": { + "load_time_ms": 22 + }, + "accessibility": { + "issues": [], + "issue_count": 0, + "pass": true + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.55, + "score": 0.55, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 3, + "cognitive_complexity": 3, + "lines_of_code": 700, + "duplication_pct": 0.0, + "tech_debt_minutes": 15, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.91 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -10,8 +10,9 @@ "rotate": "ArrowUp", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 }, "tests": [ { @@ -21,78 +22,78 @@ }, { "name": "game_starts", - "pass": false, - "detail": "exception: page.screenshot: Timeout 10000ms exceeded.\nCall log:\n\u001b[2m - taking page screenshot\u001b[22m\n\u001b[2m - waiting for fonts to load...\u001b[22m\n\u001b[2m - fonts loaded\u001b[22m\n" + "pass": true, + "detail": "started via click_canvas" }, { "name": "auto_drop", "pass": false, - "detail": "piece did not move in 5 seconds" + "detail": "grid reader unreliable, cannot verify auto-drop" }, { "name": "move_left", "pass": false, - "detail": "no change detected after key press" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_right", "pass": false, - "detail": "no change detected after key press" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "exception: 
keyboard.press: Test timeout of 180000ms exceeded." + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "exception: page.screenshot: Target page, context or browser has been closed" + "detail": "grid reader unreliable, cannot verify hard drop" }, { "name": "piece_locks", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "exception: page.evaluate: Target page, context or browser has been closed" + "detail": "no score element found" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "only played for 7s" } ], "summary": { @@ -102,14 +103,23 @@ "score": 0.19 }, "gameplay": { - "pieces_placed": 21, + "pieces_placed": 26, "lines_cleared": 0, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 7, "errors_during_play": 0 }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, 
+ "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, "performance": { - "load_time_ms": 3622 + "load_time_ms": 22 }, "accessibility": { "issues": [], diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -117,13 +117,153 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.19, + "total": 16, + "passed": 3, + "failed": 13, + "report": { + "implementation": { + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify auto-drop" + }, + { + "name": "move_left", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_right", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_down", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "rotate", + "pass": false, + "detail": "grid reader 
unreliable, cannot verify rotation" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify hard drop" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": "score_changes", + "pass": false, + "detail": "no score element found" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": false, + "detail": "only played for 7s" + } + ], + "summary": { + "total": 16, + "passed": 3, + "failed": 13, + "score": 0.19 + }, + "gameplay": { + "pieces_placed": 26, + "lines_cleared": 0, + "max_score_observed": 0, + "play_duration_seconds": 7, + "errors_during_play": 0 + }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, + "performance": { + "load_time_ms": 33 + }, + "accessibility": { + "issues": [], + "issue_count": 0, + "pass": true + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.58, + "score": 0.58, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 1, + "cognitive_complexity": 0, + "lines_of_code": 625, + "duplication_pct": 0.0, + "tech_debt_minutes": 5, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.97 } } \ No newline at 
end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -10,106 +10,116 @@ "rotate": "ArrowUp", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "1 console error(s): Cannot read properties of undefined (reading '0')" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "started via click_canvas" }, { "name": "auto_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify auto-drop" }, { "name": "move_left", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_right", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_down", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify rotation" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect any piece rotations via 
grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify hard drop" }, { "name": "piece_locks", "pass": false, - "detail": "skipped: page did not load" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "skipped: page did not load" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "skipped: page did not load" + "detail": "no score element found" }, { "name": "game_over", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", "pass": false, - "detail": "skipped: page did not load" + "detail": "only played for 7s" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 3, + "failed": 13, + "score": 0.19 }, "gameplay": { - "pieces_placed": 0, + "pieces_placed": 26, "lines_cleared": 0, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 7, "errors_during_play": 0 }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, "performance": { - "load_time_ms": 1604 + "load_time_ms": 33 }, "accessibility": { "issues": [], diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -117,13 +117,153 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.19, + "total": 16, + "passed": 3, + "failed": 13, + "report": { + "implementation": { + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify auto-drop" + }, + { + "name": "move_left", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_right", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_down", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "rotate", + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": 
"grid reader unreliable, cannot verify hard drop" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": "score_changes", + "pass": false, + "detail": "no score element found" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": false, + "detail": "only played for 7s" + } + ], + "summary": { + "total": 16, + "passed": 3, + "failed": 13, + "score": 0.19 + }, + "gameplay": { + "pieces_placed": 26, + "lines_cleared": 0, + "max_score_observed": 0, + "play_duration_seconds": 7, + "errors_during_play": 0 + }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, + "performance": { + "load_time_ms": 37 + }, + "accessibility": { + "issues": [], + "issue_count": 0, + "pass": true + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.535, + "score": 0.535, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 4, + "cognitive_complexity": 0, + "lines_of_code": 558, + "duplication_pct": 0.0, + "tech_debt_minutes": 23, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.88 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -10,114 +10,120 @@ "rotate": "ArrowUp", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "1 console error(s): Cannot read properties of undefined (reading '0')" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "started via click_canvas" }, { "name": "auto_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify auto-drop" }, { "name": "move_left", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_right", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_down", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify rotation" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect any piece rotations via grid reader" }, { "name": 
"hard_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify hard drop" }, { "name": "piece_locks", "pass": false, - "detail": "skipped: page did not load" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "skipped: page did not load" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "skipped: page did not load" + "detail": "no score element found" }, { "name": "game_over", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", "pass": false, - "detail": "skipped: page did not load" + "detail": "only played for 7s" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 3, + "failed": 13, + "score": 0.19 }, "gameplay": { - "pieces_placed": 0, + "pieces_placed": 26, "lines_cleared": 0, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 7, "errors_during_play": 0 }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, "performance": { - "load_time_ms": 1604 + "load_time_ms": 37 }, "accessibility": { - "issues": [ - "canvas without aria-label or role", - "canvas without aria-label or role", - "canvas without aria-label or role" - ], - "issue_count": 3, - "pass": false + "issues": [], + "issue_count": 0, + "pass": true } } \ No newline at end of file diff 
--git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -120,10 +120,19 @@ "score": 0, "error": "Gameplay bot timed out after 180 seconds" }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.47, + "score": 0.47, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 2, + "cognitive_complexity": 0, + "lines_of_code": 557, + "duplication_pct": 0.0, + "tech_debt_minutes": 40, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.94 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -120,10 +120,19 @@ "score": 0, "error": "Gameplay bot timed out after 180 seconds" }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.47, + "score": 0.47, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 2, + "cognitive_complexity": 2, + "lines_of_code": 678, + "duplication_pct": 0.0, + 
"tech_debt_minutes": 29, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.94 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -120,10 +120,19 @@ "score": 0, "error": "Gameplay bot timed out after 180 seconds" }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.455, + "score": 0.455, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 3, + "cognitive_complexity": 0, + "lines_of_code": 632, + "duplication_pct": 0.0, + "tech_debt_minutes": 45, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.91 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -117,13 +117,153 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.19, + "total": 16, + "passed": 3, + "failed": 13, + "report": { + 
"implementation": { + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify auto-drop" + }, + { + "name": "move_left", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_right", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_down", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "rotate", + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify hard drop" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": "score_changes", + "pass": false, + "detail": "no score element found" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": 
"playable_30s", + "pass": false, + "detail": "only played for 7s" + } + ], + "summary": { + "total": 16, + "passed": 3, + "failed": 13, + "score": 0.19 + }, + "gameplay": { + "pieces_placed": 26, + "lines_cleared": 0, + "max_score_observed": 0, + "play_duration_seconds": 7, + "errors_during_play": 0 + }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, + "performance": { + "load_time_ms": 21 + }, + "accessibility": { + "issues": [], + "issue_count": 0, + "pass": true + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.565, + "score": 0.565, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 2, + "cognitive_complexity": 0, + "lines_of_code": 678, + "duplication_pct": 0.0, + "tech_debt_minutes": 76, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.94 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -1,22 +1,18 @@ { "implementation": { - "renderer": "canvas", - "grid_detected": true, - "grid_bounds": { - "x": 0, - "y": 0, - "width": 32, - "height": 64 - }, + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, "controls": { "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "x", + "rotate": "ArrowUp", "drop": "Space" }, - 
"start_mechanism": "auto", - "score_element_found": true + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 }, "tests": [ { @@ -27,94 +23,103 @@ { "name": "game_starts", "pass": true, - "detail": "started via auto" + "detail": "started via click_canvas" }, { "name": "auto_drop", "pass": false, - "detail": "exception: page.screenshot: Timeout 10000ms exceeded.\nCall log:\n\u001b[2m - taking page screenshot\u001b[22m\n\u001b[2m - waiting for fonts to load...\u001b[22m\n" + "detail": "grid reader unreliable, cannot verify auto-drop" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "exception: page.waitForTimeout: Target page, context or browser has been closed" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "exception: page.screenshot: Target page, context or browser has been closed" + "detail": "grid reader unreliable, cannot verify hard drop" }, { "name": "piece_locks", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "exception: keyboard.press: Target page, context or 
browser has been closed" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "could not read score element" + "detail": "no score element found" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "only played for 7s" } ], "summary": { "total": 16, - "passed": 6, - "failed": 10, - "score": 0.38 + "passed": 3, + "failed": 13, + "score": 0.19 }, "gameplay": { - "pieces_placed": 11, + "pieces_placed": 26, "lines_cleared": 0, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 7, "errors_during_play": 0 }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, "performance": { - "load_time_ms": 535 + "load_time_ms": 21 }, "accessibility": { "issues": [], diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -117,13 +117,153 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.19, + "total": 16, + "passed": 3, + "failed": 13, + "report": { + "implementation": { + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify auto-drop" + }, + { + "name": "move_left", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_right", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_down", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "rotate", + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify hard drop" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not 
detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": "score_changes", + "pass": false, + "detail": "no score element found" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": false, + "detail": "only played for 7s" + } + ], + "summary": { + "total": 16, + "passed": 3, + "failed": 13, + "score": 0.19 + }, + "gameplay": { + "pieces_placed": 26, + "lines_cleared": 0, + "max_score_observed": 0, + "play_duration_seconds": 7, + "errors_during_play": 0 + }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, + "performance": { + "load_time_ms": 20 + }, + "accessibility": { + "issues": [], + "issue_count": 0, + "pass": true + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.43, + "score": 0.43, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 6, + "cognitive_complexity": 0, + "lines_of_code": 961, + "duplication_pct": 0.0, + "tech_debt_minutes": 208, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.67 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json 
b/results/runs/tetris_context_file=none_effort=high_human_language=en_language=unspecified_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -10,8 +10,9 @@ "rotate": "ArrowUp", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 }, "tests": [ { @@ -21,68 +22,68 @@ }, { "name": "game_starts", - "pass": false, - "detail": "could not start game with any mechanism" + "pass": true, + "detail": "started via click_canvas" }, { "name": "auto_drop", "pass": false, - "detail": "piece did not move in 5 seconds" + "detail": "grid reader unreliable, cannot verify auto-drop" }, { "name": "move_left", "pass": false, - "detail": "no change detected after key press" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_right", "pass": false, - "detail": "no change detected after key press" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_down", "pass": false, - "detail": "no change detected after key press" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "rotate", "pass": false, - "detail": "no change detected after rotate key" + "detail": "grid reader unreliable, cannot verify rotation" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "could not detect any piece rotations" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "no change detected after hard drop key" + "detail": "grid reader unreliable, cannot verify hard drop" }, { "name": "piece_locks", "pass": false, - "detail": "could not verify piece locking at bottom" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": 
false, - "detail": "could not detect new piece at top" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": false, - "detail": "could not trigger or detect a line clear" + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "no score element found and no number changed" + "detail": "no score element found" }, { "name": "game_over", @@ -91,8 +92,8 @@ }, { "name": "playable_30s", - "pass": true, - "detail": "played for 30s, placed 11 pieces, no crashes" + "pass": false, + "detail": "only played for 7s" } ], "summary": { @@ -102,14 +103,23 @@ "score": 0.19 }, "gameplay": { - "pieces_placed": 102, + "pieces_placed": 26, "lines_cleared": 0, "max_score_observed": 0, - "play_duration_seconds": 30, + "play_duration_seconds": 7, "errors_during_play": 0 }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, "performance": { - "load_time_ms": 19 + "load_time_ms": 20 }, "accessibility": { "issues": [], diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -96,10 +96,10 @@ }, "html_validation": { "valid": false, - "errors": 3 + "errors": 0 }, 
"duplication_percentage": 0.0, - "score": 0.68 + "score": 0.33 }, "transcript_analysis": { "total_events": 84, @@ -128,10 +128,19 @@ "score": 0, "error": "Gameplay bot timed out after 180 seconds" }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.275, + "score": 0.275, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 18, + "cognitive_complexity": 185, + "lines_of_code": 1106, + "duplication_pct": 28.6, + "tech_debt_minutes": 59, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.55 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -124,13 +124,153 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.19, + "total": 16, + "passed": 3, + "failed": 13, + "report": { + "implementation": { + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify 
auto-drop" + }, + { + "name": "move_left", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_right", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_down", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "rotate", + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify hard drop" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": "score_changes", + "pass": false, + "detail": "no score element found" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": false, + "detail": "only played for 7s" + } + ], + "summary": { + "total": 16, + "passed": 3, + "failed": 13, + "score": 0.19 + }, + "gameplay": { + "pieces_placed": 26, + "lines_cleared": 0, + "max_score_observed": 0, + "play_duration_seconds": 7, + "errors_during_play": 0 + }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, + "performance": { + "load_time_ms": 39 + }, + "accessibility": { + "issues": [], + "issue_count": 0, + "pass": true 
+ } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.345, + "score": 0.345, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 7, + "cognitive_complexity": 99, + "lines_of_code": 841, + "duplication_pct": 21.4, + "tech_debt_minutes": 20, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -10,106 +10,116 @@ "rotate": "ArrowUp", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "exception: page.goto: Navigation to \"http://127.0.0.1:44091/\" is interrupted by another navigation to \"http://127.0.0.1:44091/public/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:44091/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "started via click_canvas" }, { "name": "auto_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify auto-drop" }, { "name": "move_left", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader 
unreliable, cannot verify movement" }, { "name": "move_right", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_down", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify rotation" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify hard drop" }, { "name": "piece_locks", "pass": false, - "detail": "skipped: page did not load" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "skipped: page did not load" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "skipped: page did not load" + "detail": "no score element found" }, { "name": "game_over", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", "pass": false, - "detail": "skipped: page did not load" + "detail": "only played for 7s" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 3, + "failed": 13, + "score": 0.19 }, "gameplay": { - "pieces_placed": 0, + "pieces_placed": 26, "lines_cleared": 0, 
"max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 7, "errors_during_play": 0 }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, "performance": { - "load_time_ms": -1 + "load_time_ms": 39 }, "accessibility": { "issues": [], diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -126,13 +126,153 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.19, + "total": 16, + "passed": 3, + "failed": 13, + "report": { + "implementation": { + "renderer": "unknown", + "grid_detected": false, + "grid_bounds": null, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify auto-drop" + }, + { + "name": "move_left", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "move_right", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, 
+ { + "name": "move_down", + "pass": false, + "detail": "grid reader unreliable, cannot verify movement" + }, + { + "name": "rotate", + "pass": false, + "detail": "grid reader unreliable, cannot verify rotation" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "grid reader unreliable, cannot verify hard drop" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" + }, + { + "name": "score_changes", + "pass": false, + "detail": "no score element found" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": false, + "detail": "only played for 7s" + } + ], + "summary": { + "total": 16, + "passed": 3, + "failed": 13, + "score": 0.19 + }, + "gameplay": { + "pieces_placed": 26, + "lines_cleared": 0, + "max_score_observed": 0, + "play_duration_seconds": 7, + "errors_during_play": 0 + }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, + "performance": { + "load_time_ms": 33 + }, + "accessibility": { + "issues": [], + "issue_count": 0, + "pass": true + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.22, + "score": 0.22, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 2, + "vulnerabilities": 0, + "code_smells": 14, + "cognitive_complexity": 174, 
+ "lines_of_code": 1014, + "duplication_pct": 30.6, + "tech_debt_minutes": 28, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.25 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -10,106 +10,116 @@ "rotate": "ArrowUp", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false + "start_mechanism": "click_canvas", + "score_element_found": false, + "grid_confidence": 0 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "1 console error(s): Unexpected token '<'" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "started via click_canvas" }, { "name": "auto_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify auto-drop" }, { "name": "move_left", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_right", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "move_down", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify movement" }, { "name": "rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify 
rotation" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "grid reader unreliable, cannot verify hard drop" }, { "name": "piece_locks", "pass": false, - "detail": "skipped: page did not load" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "skipped: page did not load" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", "pass": false, - "detail": "skipped: page did not load" + "detail": "no score element found" }, { "name": "game_over", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", "pass": false, - "detail": "skipped: page did not load" + "detail": "only played for 7s" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 3, + "failed": 13, + "score": 0.19 }, "gameplay": { - "pieces_placed": 0, + "pieces_placed": 26, "lines_cleared": 0, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 7, "errors_during_play": 0 }, + "session": { + "frames": 33, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 0, + "piece_types_seen": [], + "grid_read_success_rate": 0 + }, "performance": { - "load_time_ms": 97 + "load_time_ms": 33 }, "accessibility": { "issues": [], diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -121,13 +121,165 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.75, + "total": 16, + "passed": 12, + "failed": 4, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 320, + "height": 640 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + 
"name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "4 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": false, + "detail": "could not trigger or detect game over" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 43 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 12, + "failed": 4, + "score": 0.75 + }, + "gameplay": { + "pieces_placed": 43, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 290, + "events_count": 9, + "pieces_spawned": 4, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "J", + "Z" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 21 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.675, + "score": 0.675, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 12, + "cognitive_complexity": 82, + "lines_of_code": 612, + "duplication_pct": 0.0, + "tech_debt_minutes": 42, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.6 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,93 +33,109 @@ { "name": "auto_drop", "pass": true, - "detail": "grid state changed after 5s with no input" + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", "pass": true, - "detail": "grid state changed after key press" + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", - "pass": true, - "detail": "rotated: [unknown] failed: [] (tested 1 piece types in 60 attempts)" + "pass": false, + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": true, - "detail": "piece immediately dropped and new piece appeared" + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells 
persist at bottom" + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "new piece detected at top of grid" + "detail": "4 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "game still responding after 10 piece drops" + "detail": "11 pieces placed during play session" }, { "name": "line_clear", "pass": true, - "detail": "line cleared via strategic placement" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score did not increase: [0] -> no change after polling" + "detail": "score stayed at 0" }, { "name": "game_over", "pass": false, - "detail": "exception: keyboard.press: Test timeout of 180000ms exceeded." + "detail": "could not trigger or detect game over" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 43 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 13, - "failed": 3, - "score": 0.81 + "passed": 12, + "failed": 4, + "score": 0.75 }, "gameplay": { - "pieces_placed": 32, + "pieces_placed": 43, "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 290, + "events_count": 9, + "pieces_spawned": 4, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown", + "J", + "Z" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 256 + "load_time_ms": 21 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -124,10 +124,19 @@ "score": 0, "error": "Gameplay bot timed out after 180 seconds" }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.41, + "score": 0.41, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 1, + "cognitive_complexity": 106, + "lines_of_code": 715, + "duplication_pct": 0.0, + "tech_debt_minutes": 5, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.82 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -124,10 +124,19 @@ "score": 0, "error": "Gameplay bot timed out after 180 seconds" }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.395, + "score": 0.395, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 2, + "cognitive_complexity": 143, + "lines_of_code": 929, + "duplication_pct": 3.1, + 
"tech_debt_minutes": 12, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.79 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -121,13 +121,163 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.38, + "total": 16, + "passed": 6, + "failed": 10, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 48, + "height": 96 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": 
"all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 160 to 162" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 6, + "failed": 10, + "score": 0.38 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 162, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 348, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 25 + }, + "accessibility": { + "issues": [ + "no headings found", + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 4, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.54, + "score": 0.54, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 5, + "cognitive_complexity": 117, + "lines_of_code": 867, + "duplication_pct": 0.0, + 
"tech_debt_minutes": 13, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.7 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,94 +32,108 @@ }, { "name": "auto_drop", - "pass": true, - "detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "exception: page.reload: Timeout 10000ms exceeded.\nCall log:\n\u001b[2m - waiting for navigation until \"load\"\u001b[22m\n\u001b[2m - navigated to \"http://127.0.0.1:42027/index.html\"\u001b[22m\n" + 
"detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", - "pass": true, - "detail": "visual change detected after hard drop" + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "exception: page.screenshot: Timeout 10000ms exceeded.\nCall log:\n\u001b[2m - taking page screenshot\u001b[22m\n\u001b[2m - waiting for fonts to load...\u001b[22m\n" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", - "pass": true, - "detail": "visual change suggests new piece spawned" + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "grid did not accumulate filled cells" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "could not read score element" + "pass": true, + "detail": "score changed from 160 to 162" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 9, - "failed": 7, - "score": 0.56 + "passed": 6, + "failed": 10, + "score": 0.38 }, "gameplay": { - "pieces_placed": 10, - "lines_cleared": 0, - "max_score_observed": 0, - "play_duration_seconds": 0, + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 162, + 
"play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 348, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 5657 + "load_time_ms": 25 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "no headings found", + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 4, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -121,11 +121,151 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 
seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 343, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 60 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or 
role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false + } + } }, - "outcome_score": 0.26, - "score": 0.26, + "outcome_score": 0.415, + "score": 0.415, "sonarqube": { "bugs": 1, "vulnerabilities": 0, diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -1,119 +1,138 @@ { "implementation": { - "renderer": "unknown", - "grid_detected": false, - "grid_bounds": null, + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, "controls": { "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "ArrowUp", + "rotate": "x", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "exception: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:38503/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "started via auto" }, { "name": "auto_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": 
"skipped: page did not load" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change detected after key press" }, { "name": "move_down", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change detected after key press" }, { "name": "rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "skipped: page did not load" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "skipped: page did not load" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "skipped: page did not load" + "detail": "score stayed at 0" }, { "name": "game_over", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 5, + "failed": 11, + "score": 0.31 }, 
"gameplay": { - "pieces_placed": 0, - "lines_cleared": 0, + "pieces_placed": 20, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 343, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": -1 + "load_time_ms": 60 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -121,13 +121,163 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 50, + "height": 100 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": 
true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 222 to 246" + }, + { + "name": "game_over", + "pass": false, + "detail": "could not trigger or detect game over" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 246, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 338, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + 
"grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 31 + }, + "accessibility": { + "issues": [ + "no headings found", + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 4, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.405, + "score": 0.405, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 1, + "vulnerabilities": 0, + "code_smells": 11, + "cognitive_complexity": 119, + "lines_of_code": 726, + "duplication_pct": 0.0, + "tech_debt_minutes": 41, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.5 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=high_human_language=es_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -1,119 +1,139 @@ { "implementation": { - "renderer": "unknown", - "grid_detected": false, - "grid_bounds": null, + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 50, + "height": 100 + }, "controls": { "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "ArrowUp", + "rotate": "x", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "exception: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to 
\"http://127.0.0.1:33367/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "started via auto" }, { "name": "auto_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change detected after key press" }, { "name": "move_down", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change detected after key press" }, { "name": "rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "skipped: page did not load" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "skipped: page did not load" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "skipped: page did not load" + "pass": 
true, + "detail": "score changed from 222 to 246" }, { "name": "game_over", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not trigger or detect game over" }, { "name": "playable_30s", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 0, - "lines_cleared": 0, - "max_score_observed": 0, - "play_duration_seconds": 0, + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 246, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 338, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 7033 + "load_time_ms": 31 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "no headings found", + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 4, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -124,13 +124,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.75, + "total": 16, + "passed": 12, 
+ "failed": 4, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, + "controls": { + "left": "a", + "right": "d", + "down": "s", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": false, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 1 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "2 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "no score element found" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": 
true, + "detail": "played for 30s, placed 36 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 12, + "failed": 4, + "score": 0.75 + }, + "gameplay": { + "pieces_placed": 36, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 308, + "events_count": 6, + "pieces_spawned": 2, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 32 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role" + ], + "issue_count": 1, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.775, + "score": 0.775, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 5, + "cognitive_complexity": 88, + "lines_of_code": 528, + "duplication_pct": 0.0, + "tech_debt_minutes": 17, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.8 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -1,119 +1,138 @@ { "implementation": { - "renderer": "unknown", - "grid_detected": false, - "grid_bounds": null, + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, "controls": { - "left": "ArrowLeft", - "right": "ArrowRight", - "down": "ArrowDown", - 
"rotate": "ArrowUp", + "left": "a", + "right": "d", + "down": "s", + "rotate": "z", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false + "start_mechanism": "auto", + "score_element_found": false, + "grid_confidence": 1 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "exception: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:46525/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "started via auto" }, { "name": "auto_drop", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 1 lock event(s))" }, { "name": "new_piece_spawns", - "pass": false, - "detail": 
"skipped: page did not load" + "pass": true, + "detail": "2 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "11 pieces placed during play session" }, { "name": "line_clear", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "skipped: page did not load" + "detail": "no score element found" }, { "name": "game_over", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "played for 30s, placed 36 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 12, + "failed": 4, + "score": 0.75 }, "gameplay": { - "pieces_placed": 0, - "lines_cleared": 0, + "pieces_placed": 36, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 308, + "events_count": 6, + "pieces_spawned": 2, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 7299 + "load_time_ms": 32 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role" + ], + "issue_count": 1, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json 
b/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -127,10 +127,19 @@ "score": 0, "error": "Gameplay bot timed out after 180 seconds" }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.275, + "score": 0.275, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 17, + "cognitive_complexity": 198, + "lines_of_code": 1055, + "duplication_pct": 34.3, + "tech_debt_minutes": 58, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.55 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -1,8 +1,13 @@ { "implementation": { - "renderer": "unknown", - "grid_detected": false, - "grid_bounds": null, + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 300, + "height": 600 + }, "controls": { "left": "ArrowLeft", "right": "ArrowRight", @@ -10,110 +15,123 @@ "rotate": "ArrowUp", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false + "start_mechanism": "space", + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "exception: page.goto: Timeout 5000ms exceeded.\nCall 
log:\n\u001b[2m - navigating to \"http://127.0.0.1:37507/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "started via space" }, { "name": "auto_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change detected after key press" }, { "name": "move_down", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change detected after key press" }, { "name": "rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "skipped: page did not load" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "skipped: page did not load" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "skipped: 
page did not load" + "detail": "score stayed at 0" }, { "name": "game_over", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 0, - "lines_cleared": 0, + "pieces_placed": 20, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 344, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": -1 + "load_time_ms": 51 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -126,13 +126,161 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": "canvas", + 
"grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 50, + "height": 100 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + 
"detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 339, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 73 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.28, + "score": 0.28, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 2, + "vulnerabilities": 0, + "code_smells": 19, + "cognitive_complexity": 198, + "lines_of_code": 1205, + "duplication_pct": 26.5, + "tech_debt_minutes": 55, + "maintainability": "A", + "reliability": "C", + "security": "A", + "score": 0.25 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "click_canvas", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -32,93 +33,105 @@ { "name": "auto_drop", "pass": false, - "detail": "piece did not move in 5 seconds" + "detail": "piece did not move 
down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "move_down", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "exception: page.waitForTimeout: Target page, context or browser has been closed" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "exception: page.screenshot: Target page, context or browser has been closed" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "could not read score element" + "detail": "score 
stayed at 0" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 4, - "failed": 12, - "score": 0.25 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 11, - "lines_cleared": 0, + "pieces_placed": 20, + "lines_cleared": 1, "max_score_observed": 0, - "play_duration_seconds": 0, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 339, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 123 + "load_time_ms": 73 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -121,13 +121,162 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + 
"report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 40, + "height": 80 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 138" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + 
"name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 138, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 343, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 57 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false + } + } }, - "outcome_score": 0.0, - "score": 0.0, + "outcome_score": 0.48, + "score": 0.48, "sonarqube": { - "pass": false, - "error": "no output" + "bugs": 0, + "vulnerabilities": 0, + "code_smells": 44, + "cognitive_complexity": 124, + "lines_of_code": 809, + "duplication_pct": 2.8, + "tech_debt_minutes": 86, + "maintainability": "A", + "reliability": "A", + "security": "A", + "score": 0.65 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -16,7 +16,8 @@ "drop": "Space" }, "start_mechanism": "auto", - "score_element_found": true + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { @@ -31,94 +32,107 @@ }, { "name": "auto_drop", - "pass": true, - 
"detail": "pixels changed after 5s with no input" + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_right", - "pass": true, - "detail": "grid state changed after key press" + "pass": false, + "detail": "no grid change detected after key press" }, { "name": "move_down", "pass": false, - "detail": "no change detected after key press" + "detail": "no grid change detected after key press" }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "exception: page.waitForTimeout: Target page, context or browser has been closed" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "exception: page.screenshot: Target page, context or browser has been closed" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "1 line(s) cleared 
(grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "could not read score element" + "detail": "score stayed at 138" }, { "name": "game_over", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "game stopped after stacking to top" }, { "name": "playable_30s", - "pass": false, - "detail": "exception: keyboard.press: Target page, context or browser has been closed" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 6, - "failed": 10, - "score": 0.38 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 8, - "lines_cleared": 0, - "max_score_observed": 0, - "play_duration_seconds": 0, + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 138, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 343, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 3482 + "load_time_ms": 57 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json 
b/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -92,10 +92,10 @@ }, "html_validation": { "valid": false, - "errors": 0 + "errors": 8 }, "duplication_percentage": 0.0, - "score": 0.4 + "score": 0.75 }, "transcript_analysis": { "total_events": 112, @@ -121,11 +121,151 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.31, + "total": 16, + "passed": 5, + "failed": 11, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 54, + "height": 108 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "enter", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via enter" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + "name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + 
{ + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 38 to 194" + }, + { + "name": "game_over", + "pass": false, + "detail": "could not trigger or detect game over" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 5, + "failed": 11, + "score": 0.31 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 194, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 338, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 63 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false + } + } }, - "outcome_score": 0.325, - "score": 0.325, + "outcome_score": 0.48, + "score": 0.48, "sonarqube": { "bugs": 0, "vulnerabilities": 0, diff --git a/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json 
b/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -1,119 +1,138 @@ { "implementation": { - "renderer": "unknown", - "grid_detected": false, - "grid_bounds": null, + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 54, + "height": 108 + }, "controls": { "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "ArrowUp", + "rotate": "z", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false + "start_mechanism": "enter", + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "exception: page.goto: Timeout 5000ms exceeded.\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:35703/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "started via enter" }, { "name": "auto_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "piece did not move down in 5 seconds (grid-verified)" }, { "name": "move_left", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change detected after key press" }, { "name": "move_right", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change detected after key press" }, { "name": "move_down", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change detected after key press" }, { "name": "rotate", "pass": false, - "detail": "skipped: page did not load" + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", "pass": false, - "detail": "skipped: page did not 
load" + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", "pass": false, - "detail": "skipped: page did not load" + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": false, - "detail": "skipped: page did not load" + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" }, { "name": "new_piece_spawns", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not detect new piece spawning at top via grid reader" }, { "name": "multiple_pieces", "pass": false, - "detail": "skipped: page did not load" + "detail": "only 10 piece(s) detected, need at least 3" }, { "name": "line_clear", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "score changed from 38 to 194" }, { "name": "game_over", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not trigger or detect game over" }, { "name": "playable_30s", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "played for 30s, placed 20 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 5, + "failed": 11, + "score": 0.31 }, "gameplay": { - "pieces_placed": 0, - "lines_cleared": 0, - "max_score_observed": 0, - "play_duration_seconds": 0, + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 194, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 338, + "events_count": 1, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": 5999 + "load_time_ms": 63 }, "accessibility": { - "issues": [], - "issue_count": 0, - "pass": true + 
"issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 3, + "pass": false } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -92,10 +92,10 @@ }, "html_validation": { "valid": false, - "errors": 0 + "errors": 8 }, "duplication_percentage": 0.0, - "score": 0.5 + "score": 0.85 }, "transcript_analysis": { "total_events": 71, @@ -121,11 +121,152 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 320, + "height": 640 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "z", + "drop": "Space" + }, + "start_mechanism": "auto", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via auto" + }, + { + "name": "auto_drop", + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": 
"grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "4 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 354" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 44 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 44, + "lines_cleared": 1, + "max_score_observed": 354, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 289, + "events_count": 11, + "pieces_spawned": 4, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 106 + }, + "accessibility": { + "issues": [ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.25, - "score": 0.25, + "outcome_score": 0.72, + 
"score": 0.72, "sonarqube": { "bugs": 1, "vulnerabilities": 0, diff --git a/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -52,13 +52,13 @@ }, { "name": "rotate", - "pass": false, - "detail": "no shape change detected after rotate key" + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", - "pass": false, - "detail": "could not detect any piece rotations via grid reader" + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", @@ -73,7 +73,7 @@ { "name": "new_piece_spawns", "pass": true, - "detail": "6 new piece(s) detected at top of grid" + "detail": "4 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", @@ -88,7 +88,7 @@ { "name": "score_changes", "pass": false, - "detail": "score stayed at 212" + "detail": "score stayed at 354" }, { "name": "game_over", @@ -98,26 +98,26 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 41 pieces, no crashes" + "detail": "played for 30s, placed 44 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 13, - "failed": 3, - "score": 0.81 + "passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { - "pieces_placed": 41, + "pieces_placed": 44, "lines_cleared": 1, - "max_score_observed": 212, + "max_score_observed": 354, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 204, - "events_count": 10, 
- "pieces_spawned": 6, + "frames": 289, + "events_count": 11, + "pieces_spawned": 4, "pieces_locked": 11, "lines_cleared": 1, "piece_types_seen": [ @@ -126,7 +126,7 @@ "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 280 + "load_time_ms": 106 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -121,10 +121,10 @@ }, "gameplay_bot": { "pass": false, - "score": 0.94, + "score": 0.81, "total": 16, - "passed": 15, - "failed": 1, + "passed": 13, + "failed": 3, "report": { "implementation": { "renderer": "canvas", @@ -179,13 +179,13 @@ }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", - "pass": true, - "detail": "rotation confirmed but could not identify individual piece types" + "pass": false, + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", @@ -200,7 +200,7 @@ { "name": "new_piece_spawns", "pass": true, - "detail": "2 new piece(s) detected at top of grid" + "detail": "1 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", @@ -209,13 +209,13 @@ }, { "name": "line_clear", - "pass": false, - "detail": "could not trigger or detect a line clear via grid reader" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": true, - "detail": 
"score changed from 362 to 372" + "pass": false, + "detail": "score stayed at 276" }, { "name": "game_over", @@ -225,35 +225,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 39 pieces, no crashes" + "detail": "played for 30s, placed 38 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 15, - "failed": 1, - "score": 0.94 + "passed": 13, + "failed": 3, + "score": 0.81 }, "gameplay": { - "pieces_placed": 39, - "lines_cleared": 0, - "max_score_observed": 372, + "pieces_placed": 38, + "lines_cleared": 1, + "max_score_observed": 276, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 298, + "frames": 303, "events_count": 9, - "pieces_spawned": 2, + "pieces_spawned": 1, "pieces_locked": 11, - "lines_cleared": 0, - "piece_types_seen": [ - "unknown" - ], + "lines_cleared": 1, + "piece_types_seen": [], "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 181 + "load_time_ms": 92 }, "accessibility": { "issues": [ @@ -265,8 +263,8 @@ } } }, - "outcome_score": 0.72, - "score": 0.72, + "outcome_score": 0.655, + "score": 0.655, "sonarqube": { "bugs": 2, "vulnerabilities": 0, diff --git a/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -52,13 +52,13 @@ }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", - 
"pass": true, - "detail": "rotation confirmed but could not identify individual piece types" + "pass": false, + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", @@ -73,7 +73,7 @@ { "name": "new_piece_spawns", "pass": true, - "detail": "2 new piece(s) detected at top of grid" + "detail": "1 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", @@ -82,13 +82,13 @@ }, { "name": "line_clear", - "pass": false, - "detail": "could not trigger or detect a line clear via grid reader" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", - "pass": true, - "detail": "score changed from 362 to 372" + "pass": false, + "detail": "score stayed at 276" }, { "name": "game_over", @@ -98,35 +98,33 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 39 pieces, no crashes" + "detail": "played for 30s, placed 38 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 15, - "failed": 1, - "score": 0.94 + "passed": 13, + "failed": 3, + "score": 0.81 }, "gameplay": { - "pieces_placed": 39, - "lines_cleared": 0, - "max_score_observed": 372, + "pieces_placed": 38, + "lines_cleared": 1, + "max_score_observed": 276, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 298, + "frames": 303, "events_count": 9, - "pieces_spawned": 2, + "pieces_spawned": 1, "pieces_locked": 11, - "lines_cleared": 0, - "piece_types_seen": [ - "unknown" - ], + "lines_cleared": 1, + "piece_types_seen": [], "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 181 + "load_time_ms": 92 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json 
b/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -1,115 +1,133 @@ { "implementation": { - "renderer": "unknown", - "grid_detected": false, - "grid_bounds": null, + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 320, + "height": 640 + }, "controls": { "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "ArrowUp", + "rotate": "x", "drop": "Space" }, - "start_mechanism": "unknown", - "score_element_found": false + "start_mechanism": "enter", + "score_element_found": true, + "grid_confidence": 1 }, "tests": [ { "name": "game_loads", - "pass": false, - "detail": "exception: page.goto: Navigation to \"http://127.0.0.1:40031/\" is interrupted by another navigation to \"http://127.0.0.1:40031/public/index.html\"\nCall log:\n\u001b[2m - navigating to \"http://127.0.0.1:40031/\", waiting until \"domcontentloaded\"\u001b[22m\n" + "pass": true, + "detail": "no console errors" }, { "name": "game_starts", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "started via enter" }, { "name": "auto_drop", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "grid state changed after 5s with no input (grid-verified)" }, { "name": "move_left", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_right", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { "name": "move_down", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "grid state changed after key press (grid-verified)" }, { 
"name": "rotate", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "rotation observed, piece types seen: [unknown, I]" }, { "name": "hard_drop", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" }, { "name": "piece_locks", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" }, { "name": "new_piece_spawns", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "4 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "11 pieces placed during play session" }, { "name": "line_clear", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "3 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "skipped: page did not load" + "detail": "score stayed at 414" }, { "name": "game_over", "pass": false, - "detail": "skipped: page did not load" + "detail": "could not trigger or detect game over" }, { "name": "playable_30s", - "pass": false, - "detail": "skipped: page did not load" + "pass": true, + "detail": "played for 30s, placed 31 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 0, - "failed": 16, - "score": 0 + "passed": 14, + "failed": 2, + "score": 0.88 }, "gameplay": { - "pieces_placed": 0, - "lines_cleared": 0, - "max_score_observed": 0, - "play_duration_seconds": 0, + "pieces_placed": 31, + "lines_cleared": 3, + "max_score_observed": 414, + "play_duration_seconds": 30, "errors_during_play": 0 }, + "session": { + "frames": 288, + "events_count": 12, 
+ "pieces_spawned": 4, + "pieces_locked": 11, + "lines_cleared": 3, + "piece_types_seen": [ + "unknown", + "I" + ], + "grid_read_success_rate": 1 + }, "performance": { - "load_time_ms": -1 + "load_time_ms": 101 }, "accessibility": { "issues": [], diff --git a/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -139,7 +139,7 @@ "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "x", + "rotate": "z", "drop": "Space" }, "start_mechanism": "auto", @@ -185,7 +185,7 @@ { "name": "all_pieces_rotate", "pass": true, - "detail": "rotation confirmed but could not identify individual piece types" + "detail": "rotation observed, piece types seen: [I, unknown]" }, { "name": "hard_drop", @@ -200,7 +200,7 @@ { "name": "new_piece_spawns", "pass": true, - "detail": "5 new piece(s) detected at top of grid" + "detail": "7 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", @@ -215,7 +215,7 @@ { "name": "score_changes", "pass": true, - "detail": "score changed from 0 to 144" + "detail": "score changed from 98 to 364" }, { "name": "game_over", @@ -225,7 +225,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 43 pieces, no crashes" + "detail": "played for 30s, placed 42 pieces, no crashes" } ], "summary": { @@ -235,25 +235,26 @@ "score": 0.94 }, "gameplay": { - "pieces_placed": 43, + "pieces_placed": 42, "lines_cleared": 1, - "max_score_observed": 144, + "max_score_observed": 98, "play_duration_seconds": 30, 
"errors_during_play": 0 }, "session": { - "frames": 288, + "frames": 286, "events_count": 10, - "pieces_spawned": 5, + "pieces_spawned": 7, "pieces_locked": 11, "lines_cleared": 1, "piece_types_seen": [ + "I", "unknown" ], "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 231 + "load_time_ms": 27 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=none_effort=max_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -12,7 +12,7 @@ "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "x", + "rotate": "z", "drop": "Space" }, "start_mechanism": "auto", @@ -58,7 +58,7 @@ { "name": "all_pieces_rotate", "pass": true, - "detail": "rotation confirmed but could not identify individual piece types" + "detail": "rotation observed, piece types seen: [I, unknown]" }, { "name": "hard_drop", @@ -73,7 +73,7 @@ { "name": "new_piece_spawns", "pass": true, - "detail": "5 new piece(s) detected at top of grid" + "detail": "7 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", @@ -88,7 +88,7 @@ { "name": "score_changes", "pass": true, - "detail": "score changed from 0 to 144" + "detail": "score changed from 98 to 364" }, { "name": "game_over", @@ -98,7 +98,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 43 pieces, no crashes" + "detail": "played for 30s, placed 42 pieces, no crashes" } ], "summary": { @@ -108,25 +108,26 @@ "score": 0.94 }, "gameplay": { - "pieces_placed": 43, + "pieces_placed": 42, 
"lines_cleared": 1, - "max_score_observed": 144, + "max_score_observed": 98, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 288, + "frames": 286, "events_count": 10, - "pieces_spawned": 5, + "pieces_spawned": 7, "pieces_locked": 11, "lines_cleared": 1, "piece_types_seen": [ + "I", "unknown" ], "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 231 + "load_time_ms": 27 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -123,10 +123,10 @@ }, "gameplay_bot": { "pass": false, - "score": 0.81, + "score": 0.63, "total": 16, - "passed": 13, - "failed": 3, + "passed": 10, + "failed": 6, "report": { "implementation": { "renderer": "canvas", @@ -138,10 +138,10 @@ "height": 400 }, "controls": { - "left": "a", - "right": "d", - "down": "s", - "rotate": "z", + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", "drop": "Space" }, "start_mechanism": "auto", @@ -181,33 +181,33 @@ }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", - "pass": true, - "detail": "rotation observed, piece types seen: [O, L, unknown]" + "pass": false, + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", - "pass": true, - "detail": "piece immediately 
dropped to bottom (grid-verified)" + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + "detail": "filled cells persist at bottom (grid-verified, 1 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "4 new piece(s) detected at top of grid" + "detail": "3 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "11 pieces placed during play session" + "detail": "10 pieces placed during play session" }, { "name": "line_clear", @@ -227,37 +227,36 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 35 pieces, no crashes" + "detail": "played for 30s, placed 34 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 13, - "failed": 3, - "score": 0.81 + "passed": 10, + "failed": 6, + "score": 0.63 }, "gameplay": { - "pieces_placed": 35, + "pieces_placed": 34, "lines_cleared": 1, "max_score_observed": 0, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 304, - "events_count": 9, - "pieces_spawned": 4, - "pieces_locked": 11, + "frames": 309, + "events_count": 4, + "pieces_spawned": 3, + "pieces_locked": 10, "lines_cleared": 1, "piece_types_seen": [ - "O", - "L", + "S", "unknown" ], "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 216 + "load_time_ms": 35 }, "accessibility": { "issues": [ @@ -270,19 +269,19 @@ } } }, - "outcome_score": 0.63, - "score": 0.63, + "outcome_score": 0.69, + "score": 0.69, "sonarqube": { - "bugs": 2, + "bugs": 0, "vulnerabilities": 0, "code_smells": 9, - "cognitive_complexity": 85, - "lines_of_code": 653, - "duplication_pct": 0.0, - "tech_debt_minutes": 18, + "cognitive_complexity": 96, + "lines_of_code": 618, + "duplication_pct": 4.1, + "tech_debt_minutes": 39, "maintainability": "A", - "reliability": "C", + "reliability": "A", "security": "A", - 
"score": 0.45 + "score": 0.75 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -9,10 +9,10 @@ "height": 400 }, "controls": { - "left": "a", - "right": "d", - "down": "s", - "rotate": "z", + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", "drop": "Space" }, "start_mechanism": "auto", @@ -52,33 +52,33 @@ }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", - "pass": true, - "detail": "rotation observed, piece types seen: [O, L, unknown]" + "pass": false, + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", - "pass": true, - "detail": "piece immediately dropped to bottom (grid-verified)" + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" }, { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + "detail": "filled cells persist at bottom (grid-verified, 1 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "4 new piece(s) detected at top of grid" + "detail": "3 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", "pass": true, - "detail": "11 pieces placed during play session" + "detail": "10 pieces placed 
during play session" }, { "name": "line_clear", @@ -98,37 +98,36 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 35 pieces, no crashes" + "detail": "played for 30s, placed 34 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 13, - "failed": 3, - "score": 0.81 + "passed": 10, + "failed": 6, + "score": 0.63 }, "gameplay": { - "pieces_placed": 35, + "pieces_placed": 34, "lines_cleared": 1, "max_score_observed": 0, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 304, - "events_count": 9, - "pieces_spawned": 4, - "pieces_locked": 11, + "frames": 309, + "events_count": 4, + "pieces_spawned": 3, + "pieces_locked": 10, "lines_cleared": 1, "piece_types_seen": [ - "O", - "L", + "S", "unknown" ], "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 216 + "load_time_ms": 35 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -197,12 +197,12 @@ { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + "detail": "filled cells persist at bottom (grid-verified, 1 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "4 new piece(s) detected at top of grid" + "detail": "1 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", @@ -227,7 +227,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 39 pieces, no crashes" + 
"detail": "played for 30s, placed 38 pieces, no crashes" } ], "summary": { @@ -237,16 +237,16 @@ "score": 0.94 }, "gameplay": { - "pieces_placed": 39, + "pieces_placed": 38, "lines_cleared": 1, "max_score_observed": 0, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 298, - "events_count": 11, - "pieces_spawned": 4, + "frames": 295, + "events_count": 10, + "pieces_spawned": 1, "pieces_locked": 11, "lines_cleared": 1, "piece_types_seen": [ @@ -255,7 +255,7 @@ "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 303 + "load_time_ms": 31 }, "accessibility": { "issues": [], diff --git a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -68,12 +68,12 @@ { "name": "piece_locks", "pass": true, - "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + "detail": "filled cells persist at bottom (grid-verified, 1 lock event(s))" }, { "name": "new_piece_spawns", "pass": true, - "detail": "4 new piece(s) detected at top of grid" + "detail": "1 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", @@ -98,7 +98,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 39 pieces, no crashes" + "detail": "played for 30s, placed 38 pieces, no crashes" } ], "summary": { @@ -108,16 +108,16 @@ "score": 0.94 }, "gameplay": { - "pieces_placed": 39, + "pieces_placed": 38, "lines_cleared": 1, "max_score_observed": 0, "play_duration_seconds": 30, 
"errors_during_play": 0 }, "session": { - "frames": 298, - "events_count": 11, - "pieces_spawned": 4, + "frames": 295, + "events_count": 10, + "pieces_spawned": 1, "pieces_locked": 11, "lines_cleared": 1, "piece_types_seen": [ @@ -126,7 +126,7 @@ "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 303 + "load_time_ms": 31 }, "accessibility": { "issues": [], diff --git a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -95,10 +95,10 @@ }, "html_validation": { "valid": false, - "errors": 0 + "errors": 14 }, "duplication_percentage": 0.0, - "score": 0.4 + "score": 0.75 }, "transcript_analysis": { "total_events": 124, @@ -124,22 +124,163 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.94, + "total": 16, + "passed": 15, + "failed": 1, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 240, + "height": 480 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "x", + "drop": "Space" + }, + "start_mechanism": "click_canvas", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": true, + "detail": "no console errors" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via click_canvas" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 
seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_right", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "move_down", + "pass": true, + "detail": "grid state changed after key press (grid-verified)" + }, + { + "name": "rotate", + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + }, + { + "name": "all_pieces_rotate", + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" + }, + { + "name": "hard_drop", + "pass": true, + "detail": "piece immediately dropped to bottom (grid-verified)" + }, + { + "name": "piece_locks", + "pass": true, + "detail": "filled cells persist at bottom (grid-verified, 2 lock event(s))" + }, + { + "name": "new_piece_spawns", + "pass": true, + "detail": "8 new piece(s) detected at top of grid" + }, + { + "name": "multiple_pieces", + "pass": true, + "detail": "11 pieces placed during play session" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": true, + "detail": "score changed from 322 to 332" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": true, + "detail": "played for 30s, placed 23 pieces, no crashes" + } + ], + "summary": { + "total": 16, + "passed": 15, + "failed": 1, + "score": 0.94 + }, + "gameplay": { + "pieces_placed": 23, + "lines_cleared": 1, + "max_score_observed": 332, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 340, + "events_count": 9, + "pieces_spawned": 8, + "pieces_locked": 11, + "lines_cleared": 1, + "piece_types_seen": [ + "unknown" + ], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 77 + }, + "accessibility": { + "issues": 
[ + "canvas without aria-label or role", + "canvas without aria-label or role" + ], + "issue_count": 2, + "pass": false + } + } }, - "outcome_score": 0.375, - "score": 0.375, + "outcome_score": 0.595, + "score": 0.595, "sonarqube": { - "bugs": 0, + "bugs": 2, "vulnerabilities": 0, - "code_smells": 9, - "cognitive_complexity": 96, - "lines_of_code": 618, - "duplication_pct": 4.1, - "tech_debt_minutes": 39, + "code_smells": 18, + "cognitive_complexity": 202, + "lines_of_code": 1146, + "duplication_pct": 28.7, + "tech_debt_minutes": 64, "maintainability": "A", - "reliability": "A", + "reliability": "C", "security": "A", - "score": 0.75 + "score": 0.25 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=high_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -52,13 +52,13 @@ }, { "name": "rotate", - "pass": false, - "detail": "no shape change detected after rotate key" + "pass": true, + "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" }, { "name": "all_pieces_rotate", - "pass": false, - "detail": "could not detect any piece rotations via grid reader" + "pass": true, + "detail": "rotation confirmed but could not identify individual piece types" }, { "name": "hard_drop", @@ -73,7 +73,7 @@ { "name": "new_piece_spawns", "pass": true, - "detail": "4 new piece(s) detected at top of grid" + "detail": "8 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", @@ -82,13 +82,13 @@ }, { "name": "line_clear", - "pass": false, - 
"detail": "could not trigger or detect a line clear via grid reader" + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": true, - "detail": "score changed from 304 to 318" + "detail": "score changed from 322 to 332" }, { "name": "game_over", @@ -98,35 +98,35 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 22 pieces, no crashes" + "detail": "played for 30s, placed 23 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 12, - "failed": 4, - "score": 0.75 + "passed": 15, + "failed": 1, + "score": 0.94 }, "gameplay": { - "pieces_placed": 22, - "lines_cleared": 0, - "max_score_observed": 318, + "pieces_placed": 23, + "lines_cleared": 1, + "max_score_observed": 332, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 341, - "events_count": 7, - "pieces_spawned": 4, + "frames": 340, + "events_count": 9, + "pieces_spawned": 8, "pieces_locked": 11, - "lines_cleared": 0, + "lines_cleared": 1, "piece_types_seen": [ "unknown" ], "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 214 + "load_time_ms": 77 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -125,10 +125,10 @@ }, "gameplay_bot": { "pass": false, - "score": 0.81, + "score": 0.75, "total": 16, - "passed": 13, - "failed": 3, + "passed": 12, + "failed": 4, "report": { "implementation": { "renderer": "canvas", @@ -204,7 +204,7 @@ { 
"name": "new_piece_spawns", "pass": true, - "detail": "2 new piece(s) detected at top of grid" + "detail": "4 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", @@ -213,8 +213,8 @@ }, { "name": "line_clear", - "pass": true, - "detail": "1 line(s) cleared (grid-verified)" + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", @@ -229,35 +229,35 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 22 pieces, no crashes" + "detail": "played for 30s, placed 28 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 13, - "failed": 3, - "score": 0.81 + "passed": 12, + "failed": 4, + "score": 0.75 }, "gameplay": { - "pieces_placed": 22, - "lines_cleared": 1, + "pieces_placed": 28, + "lines_cleared": 0, "max_score_observed": 0, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 331, - "events_count": 9, - "pieces_spawned": 2, + "frames": 328, + "events_count": 8, + "pieces_spawned": 4, "pieces_locked": 11, - "lines_cleared": 1, + "lines_cleared": 0, "piece_types_seen": [ - "I" + "unknown" ], "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 222 + "load_time_ms": 66 }, "accessibility": { "issues": [ @@ -270,19 +270,19 @@ } } }, - "outcome_score": 0.63, - "score": 0.63, + "outcome_score": 0.65, + "score": 0.65, "sonarqube": { - "bugs": 2, + "bugs": 0, "vulnerabilities": 0, - "code_smells": 7, - "cognitive_complexity": 97, - "lines_of_code": 654, - "duplication_pct": 0.0, - "tech_debt_minutes": 39, + "code_smells": 31, + "cognitive_complexity": 236, + "lines_of_code": 1088, + "duplication_pct": 28.1, + "tech_debt_minutes": 141, "maintainability": "A", - "reliability": "C", + "reliability": "A", "security": "A", - "score": 0.45 + "score": 0.55 } } \ No newline at end of file diff --git 
a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -73,7 +73,7 @@ { "name": "new_piece_spawns", "pass": true, - "detail": "2 new piece(s) detected at top of grid" + "detail": "4 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", @@ -82,8 +82,8 @@ }, { "name": "line_clear", - "pass": true, - "detail": "1 line(s) cleared (grid-verified)" + "pass": false, + "detail": "could not trigger or detect a line clear via grid reader" }, { "name": "score_changes", @@ -98,35 +98,35 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 22 pieces, no crashes" + "detail": "played for 30s, placed 28 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 13, - "failed": 3, - "score": 0.81 + "passed": 12, + "failed": 4, + "score": 0.75 }, "gameplay": { - "pieces_placed": 22, - "lines_cleared": 1, + "pieces_placed": 28, + "lines_cleared": 0, "max_score_observed": 0, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 331, - "events_count": 9, - "pieces_spawned": 2, + "frames": 328, + "events_count": 8, + "pieces_spawned": 4, "pieces_locked": 11, - "lines_cleared": 1, + "lines_cleared": 0, "piece_types_seen": [ - "I" + "unknown" ], "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 222 + "load_time_ms": 66 }, "accessibility": { "issues": [ diff --git 
a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -125,11 +125,147 @@ }, "gameplay_bot": { "pass": false, - "score": 0, - "error": "Gameplay bot timed out after 180 seconds" + "score": 0.19, + "total": 16, + "passed": 3, + "failed": 13, + "report": { + "implementation": { + "renderer": "canvas", + "grid_detected": true, + "grid_bounds": { + "x": 0, + "y": 0, + "width": 75, + "height": 150 + }, + "controls": { + "left": "ArrowLeft", + "right": "ArrowRight", + "down": "ArrowDown", + "rotate": "ArrowUp", + "drop": "Space" + }, + "start_mechanism": "space", + "score_element_found": true, + "grid_confidence": 1 + }, + "tests": [ + { + "name": "game_loads", + "pass": false, + "detail": "1 console error(s): Unexpected token 'export'" + }, + { + "name": "game_starts", + "pass": true, + "detail": "started via space" + }, + { + "name": "auto_drop", + "pass": false, + "detail": "piece did not move down in 5 seconds (grid-verified)" + }, + { + "name": "move_left", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_right", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "move_down", + "pass": false, + "detail": "no grid change detected after key press" + }, + { + "name": "rotate", + "pass": false, + "detail": "no shape change detected after rotate key" + }, + { + "name": "all_pieces_rotate", + "pass": false, + "detail": "could not detect any piece rotations via grid reader" + }, + { + 
"name": "hard_drop", + "pass": false, + "detail": "no grid change with bottom cells detected after hard drop key" + }, + { + "name": "piece_locks", + "pass": false, + "detail": "10 lock event(s) but 0 spawns detected - likely false positive from UI misread" + }, + { + "name": "new_piece_spawns", + "pass": false, + "detail": "could not detect new piece spawning at top via grid reader" + }, + { + "name": "multiple_pieces", + "pass": false, + "detail": "only 10 piece(s) detected, need at least 3" + }, + { + "name": "line_clear", + "pass": true, + "detail": "1 line(s) cleared (grid-verified)" + }, + { + "name": "score_changes", + "pass": false, + "detail": "score stayed at 0" + }, + { + "name": "game_over", + "pass": true, + "detail": "game stopped after stacking to top" + }, + { + "name": "playable_30s", + "pass": false, + "detail": "5 console error(s), 0 play errors" + } + ], + "summary": { + "total": 16, + "passed": 3, + "failed": 13, + "score": 0.19 + }, + "gameplay": { + "pieces_placed": 20, + "lines_cleared": 1, + "max_score_observed": 0, + "play_duration_seconds": 30, + "errors_during_play": 0 + }, + "session": { + "frames": 345, + "events_count": 2, + "pieces_spawned": 0, + "pieces_locked": 10, + "lines_cleared": 1, + "piece_types_seen": [], + "grid_read_success_rate": 1 + }, + "performance": { + "load_time_ms": 20 + }, + "accessibility": { + "issues": [], + "issue_count": 0, + "pass": true + } + } }, - "outcome_score": 0.225, - "score": 0.225, + "outcome_score": 0.32, + "score": 0.32, "sonarqube": { "bugs": 2, "vulnerabilities": 0, diff --git a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json 
b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -115,7 +115,7 @@ "errors_during_play": 0 }, "session": { - "frames": 323, + "frames": 345, "events_count": 2, "pieces_spawned": 0, "pieces_locked": 10, @@ -124,7 +124,7 @@ "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 290 + "load_time_ms": 20 }, "accessibility": { "issues": [], diff --git a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -141,7 +141,7 @@ "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "z", + "rotate": "x", "drop": "Space" }, "start_mechanism": "auto", @@ -227,7 +227,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 36 pieces, no crashes" + "detail": "played for 30s, placed 33 pieces, no crashes" } ], "summary": { @@ -237,26 +237,26 @@ "score": 0.81 }, "gameplay": { - "pieces_placed": 36, + "pieces_placed": 33, "lines_cleared": 1, "max_score_observed": 0, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 306, + "frames": 309, "events_count": 8, "pieces_spawned": 9, "pieces_locked": 11, "lines_cleared": 1, "piece_types_seen": [ "unknown", - "O" + "T" ], "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 163 + "load_time_ms": 32 }, 
"accessibility": { "issues": [], @@ -265,19 +265,19 @@ } } }, - "outcome_score": 0.73, - "score": 0.73, + "outcome_score": 0.68, + "score": 0.68, "sonarqube": { - "bugs": 1, + "bugs": 0, "vulnerabilities": 0, - "code_smells": 5, - "cognitive_complexity": 78, - "lines_of_code": 698, - "duplication_pct": 0.0, - "tech_debt_minutes": 25, + "code_smells": 31, + "cognitive_complexity": 236, + "lines_of_code": 1088, + "duplication_pct": 28.1, + "tech_debt_minutes": 141, "maintainability": "A", - "reliability": "C", + "reliability": "A", "security": "A", - "score": 0.65 + "score": 0.55 } } \ No newline at end of file diff --git a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=haiku_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -12,7 +12,7 @@ "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "z", + "rotate": "x", "drop": "Space" }, "start_mechanism": "auto", @@ -98,7 +98,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 36 pieces, no crashes" + "detail": "played for 30s, placed 33 pieces, no crashes" } ], "summary": { @@ -108,26 +108,26 @@ "score": 0.81 }, "gameplay": { - "pieces_placed": 36, + "pieces_placed": 33, "lines_cleared": 1, "max_score_observed": 0, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 306, + "frames": 309, "events_count": 8, "pieces_spawned": 9, "pieces_locked": 11, "lines_cleared": 1, "piece_types_seen": [ "unknown", - "O" + "T" ], "grid_read_success_rate": 1 }, "performance": { - 
"load_time_ms": 163 + "load_time_ms": 32 }, "accessibility": { "issues": [], diff --git a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -215,7 +215,7 @@ { "name": "score_changes", "pass": true, - "detail": "score changed from 160 to 162" + "detail": "score changed from 194 to 216" }, { "name": "game_over", @@ -237,12 +237,12 @@ "gameplay": { "pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 162, + "max_score_observed": 216, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 336, + "frames": 351, "events_count": 2, "pieces_spawned": 0, "pieces_locked": 10, @@ -251,7 +251,7 @@ "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 402 + "load_time_ms": 86 }, "accessibility": { "issues": [], @@ -265,11 +265,11 @@ "sonarqube": { "bugs": 0, "vulnerabilities": 0, - "code_smells": 17, - "cognitive_complexity": 123, - "lines_of_code": 958, - "duplication_pct": 2.8, - "tech_debt_minutes": 63, + "code_smells": 12, + "cognitive_complexity": 144, + "lines_of_code": 826, + "duplication_pct": 0.0, + "tech_debt_minutes": 71, "maintainability": "A", "reliability": "A", "security": "A", diff --git a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json 
b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -88,7 +88,7 @@ { "name": "score_changes", "pass": true, - "detail": "score changed from 160 to 162" + "detail": "score changed from 194 to 216" }, { "name": "game_over", @@ -110,12 +110,12 @@ "gameplay": { "pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 162, + "max_score_observed": 216, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 336, + "frames": 351, "events_count": 2, "pieces_spawned": 0, "pieces_locked": 10, @@ -124,7 +124,7 @@ "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 402 + "load_time_ms": 86 }, "accessibility": { "issues": [], diff --git a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -121,10 +121,10 @@ }, "gameplay_bot": { "pass": false, - "score": 0.94, + "score": 0.81, "total": 16, - "passed": 15, - "failed": 1, + "passed": 13, + "failed": 3, "report": { "implementation": { "renderer": "canvas", @@ -139,7 +139,7 @@ "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "ArrowUp", + "rotate": "z", "drop": "Space" }, "start_mechanism": "auto", @@ -179,13 +179,13 @@ }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key (grid-verified, 
1 rotation(s))" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", - "pass": true, - "detail": "rotation observed, piece types seen: [unknown, S]" + "pass": false, + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", @@ -210,12 +210,12 @@ { "name": "line_clear", "pass": true, - "detail": "2 line(s) cleared (grid-verified)" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score stayed at 282" + "detail": "score stayed at 216" }, { "name": "game_over", @@ -230,31 +230,30 @@ ], "summary": { "total": 16, - "passed": 15, - "failed": 1, - "score": 0.94 + "passed": 13, + "failed": 3, + "score": 0.81 }, "gameplay": { "pieces_placed": 37, - "lines_cleared": 2, - "max_score_observed": 282, + "lines_cleared": 1, + "max_score_observed": 216, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 296, - "events_count": 11, + "frames": 309, + "events_count": 9, "pieces_spawned": 4, "pieces_locked": 11, - "lines_cleared": 2, + "lines_cleared": 1, "piece_types_seen": [ - "unknown", - "S" + "unknown" ], "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 141 + "load_time_ms": 38 }, "accessibility": { "issues": [ @@ -265,16 +264,16 @@ } } }, - "outcome_score": 0.795, - "score": 0.795, + "outcome_score": 0.73, + "score": 0.73, "sonarqube": { "bugs": 0, "vulnerabilities": 0, - "code_smells": 17, - "cognitive_complexity": 123, - "lines_of_code": 958, - "duplication_pct": 2.8, - "tech_debt_minutes": 63, + "code_smells": 13, + "cognitive_complexity": 105, + "lines_of_code": 616, + "duplication_pct": 0.0, + "tech_debt_minutes": 46, "maintainability": "A", "reliability": "A", "security": "A", diff --git 
a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -12,7 +12,7 @@ "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "ArrowUp", + "rotate": "z", "drop": "Space" }, "start_mechanism": "auto", @@ -52,13 +52,13 @@ }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", - "pass": true, - "detail": "rotation observed, piece types seen: [unknown, S]" + "pass": false, + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", @@ -83,12 +83,12 @@ { "name": "line_clear", "pass": true, - "detail": "2 line(s) cleared (grid-verified)" + "detail": "1 line(s) cleared (grid-verified)" }, { "name": "score_changes", "pass": false, - "detail": "score stayed at 282" + "detail": "score stayed at 216" }, { "name": "game_over", @@ -103,31 +103,30 @@ ], "summary": { "total": 16, - "passed": 15, - "failed": 1, - "score": 0.94 + "passed": 13, + "failed": 3, + "score": 0.81 }, "gameplay": { "pieces_placed": 37, - "lines_cleared": 2, - "max_score_observed": 282, + "lines_cleared": 1, + "max_score_observed": 216, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 296, - "events_count": 11, + "frames": 309, + "events_count": 9, "pieces_spawned": 4, "pieces_locked": 11, - "lines_cleared": 2, + "lines_cleared": 1, 
"piece_types_seen": [ - "unknown", - "S" + "unknown" ], "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 141 + "load_time_ms": 38 }, "accessibility": { "issues": [ diff --git a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -121,10 +121,10 @@ }, "gameplay_bot": { "pass": false, - "score": 0.94, + "score": 0.81, "total": 16, - "passed": 15, - "failed": 1, + "passed": 13, + "failed": 3, "report": { "implementation": { "renderer": "canvas", @@ -179,13 +179,13 @@ }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", - "pass": true, - "detail": "rotation observed, piece types seen: [unknown, T, S, I]" + "pass": false, + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", @@ -200,7 +200,7 @@ { "name": "new_piece_spawns", "pass": true, - "detail": "8 new piece(s) detected at top of grid" + "detail": "7 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", @@ -215,7 +215,7 @@ { "name": "score_changes", "pass": false, - "detail": "score stayed at 350" + "detail": "score stayed at 298" }, { "name": "game_over", @@ -225,38 +225,38 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 41 pieces, no crashes" + "detail": "played for 30s, placed 40 pieces, no crashes" } ], "summary": { "total": 16, - 
"passed": 15, - "failed": 1, - "score": 0.94 + "passed": 13, + "failed": 3, + "score": 0.81 }, "gameplay": { - "pieces_placed": 41, + "pieces_placed": 40, "lines_cleared": 1, - "max_score_observed": 350, + "max_score_observed": 298, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 289, + "frames": 300, "events_count": 10, - "pieces_spawned": 8, + "pieces_spawned": 7, "pieces_locked": 11, "lines_cleared": 1, "piece_types_seen": [ "unknown", - "T", + "Z", "S", "I" ], "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 100 + "load_time_ms": 39 }, "accessibility": { "issues": [], @@ -265,16 +265,16 @@ } } }, - "outcome_score": 0.795, - "score": 0.795, + "outcome_score": 0.73, + "score": 0.73, "sonarqube": { "bugs": 0, "vulnerabilities": 0, - "code_smells": 17, - "cognitive_complexity": 123, - "lines_of_code": 958, - "duplication_pct": 2.8, - "tech_debt_minutes": 63, + "code_smells": 12, + "cognitive_complexity": 144, + "lines_of_code": 826, + "duplication_pct": 0.0, + "tech_debt_minutes": 71, "maintainability": "A", "reliability": "A", "security": "A", diff --git a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=opus_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -52,13 +52,13 @@ }, { "name": "rotate", - "pass": true, - "detail": "piece shape changed after rotate key (grid-verified, 1 rotation(s))" + "pass": false, + "detail": "no shape change detected after rotate key" }, { "name": "all_pieces_rotate", - "pass": true, - "detail": "rotation observed, piece 
types seen: [unknown, T, S, I]" + "pass": false, + "detail": "could not detect any piece rotations via grid reader" }, { "name": "hard_drop", @@ -73,7 +73,7 @@ { "name": "new_piece_spawns", "pass": true, - "detail": "8 new piece(s) detected at top of grid" + "detail": "7 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", @@ -88,7 +88,7 @@ { "name": "score_changes", "pass": false, - "detail": "score stayed at 350" + "detail": "score stayed at 298" }, { "name": "game_over", @@ -98,38 +98,38 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 41 pieces, no crashes" + "detail": "played for 30s, placed 40 pieces, no crashes" } ], "summary": { "total": 16, - "passed": 15, - "failed": 1, - "score": 0.94 + "passed": 13, + "failed": 3, + "score": 0.81 }, "gameplay": { - "pieces_placed": 41, + "pieces_placed": 40, "lines_cleared": 1, - "max_score_observed": 350, + "max_score_observed": 298, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 289, + "frames": 300, "events_count": 10, - "pieces_spawned": 8, + "pieces_spawned": 7, "pieces_locked": 11, "lines_cleared": 1, "piece_types_seen": [ "unknown", - "T", + "Z", "S", "I" ], "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 100 + "load_time_ms": 39 }, "accessibility": { "issues": [], diff --git a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/eval_results.json @@ -139,7 +139,7 @@ "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - 
"rotate": "x", + "rotate": "ArrowUp", "drop": "Space" }, "start_mechanism": "auto", @@ -242,7 +242,7 @@ "errors_during_play": 0 }, "session": { - "frames": 327, + "frames": 346, "events_count": 2, "pieces_spawned": 0, "pieces_locked": 10, @@ -251,7 +251,7 @@ "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 133 + "load_time_ms": 46 }, "accessibility": { "issues": [], @@ -265,11 +265,11 @@ "sonarqube": { "bugs": 1, "vulnerabilities": 0, - "code_smells": 22, - "cognitive_complexity": 120, - "lines_of_code": 811, + "code_smells": 12, + "cognitive_complexity": 148, + "lines_of_code": 734, "duplication_pct": 0.0, - "tech_debt_minutes": 63, + "tech_debt_minutes": 85, "maintainability": "A", "reliability": "C", "security": "A", diff --git a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run1/gameplay-bot-report.json @@ -12,7 +12,7 @@ "left": "ArrowLeft", "right": "ArrowRight", "down": "ArrowDown", - "rotate": "x", + "rotate": "ArrowUp", "drop": "Space" }, "start_mechanism": "auto", @@ -115,7 +115,7 @@ "errors_during_play": 0 }, "session": { - "frames": 327, + "frames": 346, "events_count": 2, "pieces_spawned": 0, "pieces_locked": 10, @@ -124,7 +124,7 @@ "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 133 + "load_time_ms": 46 }, "accessibility": { "issues": [], diff --git 
a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/eval_results.json @@ -185,7 +185,7 @@ { "name": "all_pieces_rotate", "pass": true, - "detail": "rotation observed, piece types seen: [unknown, J]" + "detail": "rotation observed, piece types seen: [unknown, Z]" }, { "name": "hard_drop", @@ -200,7 +200,7 @@ { "name": "new_piece_spawns", "pass": true, - "detail": "8 new piece(s) detected at top of grid" + "detail": "10 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", @@ -215,7 +215,7 @@ { "name": "score_changes", "pass": true, - "detail": "score changed from 126 to 266" + "detail": "score changed from 36 to 176" }, { "name": "game_over", @@ -225,7 +225,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 43 pieces, no crashes" + "detail": "played for 30s, placed 44 pieces, no crashes" } ], "summary": { @@ -235,26 +235,26 @@ "score": 0.94 }, "gameplay": { - "pieces_placed": 43, + "pieces_placed": 44, "lines_cleared": 1, - "max_score_observed": 266, + "max_score_observed": 176, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { "frames": 281, "events_count": 10, - "pieces_spawned": 8, + "pieces_spawned": 10, "pieces_locked": 11, "lines_cleared": 1, "piece_types_seen": [ "unknown", - "J" + "Z" ], "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 146 + "load_time_ms": 37 }, "accessibility": { "issues": [], diff --git 
a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run2/gameplay-bot-report.json @@ -58,7 +58,7 @@ { "name": "all_pieces_rotate", "pass": true, - "detail": "rotation observed, piece types seen: [unknown, J]" + "detail": "rotation observed, piece types seen: [unknown, Z]" }, { "name": "hard_drop", @@ -73,7 +73,7 @@ { "name": "new_piece_spawns", "pass": true, - "detail": "8 new piece(s) detected at top of grid" + "detail": "10 new piece(s) detected at top of grid" }, { "name": "multiple_pieces", @@ -88,7 +88,7 @@ { "name": "score_changes", "pass": true, - "detail": "score changed from 126 to 266" + "detail": "score changed from 36 to 176" }, { "name": "game_over", @@ -98,7 +98,7 @@ { "name": "playable_30s", "pass": true, - "detail": "played for 30s, placed 43 pieces, no crashes" + "detail": "played for 30s, placed 44 pieces, no crashes" } ], "summary": { @@ -108,26 +108,26 @@ "score": 0.94 }, "gameplay": { - "pieces_placed": 43, + "pieces_placed": 44, "lines_cleared": 1, - "max_score_observed": 266, + "max_score_observed": 176, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { "frames": 281, "events_count": 10, - "pieces_spawned": 8, + "pieces_spawned": 10, "pieces_locked": 11, "lines_cleared": 1, "piece_types_seen": [ "unknown", - "J" + "Z" ], "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 146 + "load_time_ms": 37 }, "accessibility": { "issues": [], diff --git 
a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/eval_results.json @@ -216,7 +216,7 @@ { "name": "score_changes", "pass": true, - "detail": "score changed from 196 to 216" + "detail": "score changed from 230 to 262" }, { "name": "game_over", @@ -238,12 +238,12 @@ "gameplay": { "pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 216, + "max_score_observed": 262, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 325, + "frames": 351, "events_count": 2, "pieces_spawned": 0, "pieces_locked": 10, @@ -252,7 +252,7 @@ "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 160 + "load_time_ms": 44 }, "accessibility": { "issues": [], diff --git a/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json b/results/runs/tetris_context_file=provided_effort=high_human_language=en_language=typescript_linter=on_max_budget=low_model=sonnet_playwright=on_prompt_style=simple_sub_agents=on_tool_edit=on_tool_glob=on_tool_grep=on_tool_read=on_tool_write=on_web_search=on_run3/gameplay-bot-report.json @@ -88,7 +88,7 @@ { "name": "score_changes", "pass": true, - "detail": "score changed from 196 to 216" + "detail": "score changed from 230 to 262" }, { "name": "game_over", @@ -110,12 +110,12 @@ "gameplay": { 
"pieces_placed": 20, "lines_cleared": 1, - "max_score_observed": 216, + "max_score_observed": 262, "play_duration_seconds": 30, "errors_during_play": 0 }, "session": { - "frames": 325, + "frames": 351, "events_count": 2, "pieces_spawned": 0, "pieces_locked": 10, @@ -124,7 +124,7 @@ "grid_read_success_rate": 1 }, "performance": { - "load_time_ms": 160 + "load_time_ms": 44 }, "accessibility": { "issues": [],

Impressum · Datenschutz