scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (36123B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Enhancing LLM Code Generation: A Systematic Evaluation of Multi-Agent Collaboration and Runtime Debugging for Improved Accuracy, Reliability, and Latency",
      6     "authors": [
      7       "Nazmus Ashrafi",
      8       "Salah Bouktif",
      9       "Mohammed Mediani"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2505.02133",
     14     "doi": "10.48550/arXiv.2505.02133"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "Abstract claims about 85% confidence level for combined vs ACT alone, and inability to show significance over Debug alone, are both supported by the t-test results in Section 4.2. Claims about 0.68% improvement match Table 2 analysis.",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Causal claims like 'combining ACT and debugging improves accuracy' are supported by a controlled ablation design where components are systematically added/removed while other factors are held constant (same models, same datasets, same prompts). This single-variable manipulation is adequate for the causal claims made.",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The title claims 'Improved Accuracy, Reliability, and Latency' broadly. The conclusion speaks of 'organizations seeking robust AI-driven coding solutions' and 'real-world programming scenarios.' However, results are reported only on HumanEval/HumanEval+ (Python, function-level tasks), and the paper never bounds its claims to this narrow scope.",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper offers one interpretation for each finding (e.g., debugging provides 'rich context') but does not consider confounds such as the additional API calls providing more tokens/attempts regardless of the specific approach, or whether prompt design rather than multi-agent structure drives the differences.",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper measures pass@1 and frames it as 'functional accuracy,' and measures HE/HE+ drop as 'code rigorousness.' These claims match the granularity of the measurements without overclaiming broader code quality constructs.",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No dedicated limitations or threats-to-validity section. Some caveats are mentioned inline (e.g., 'A limitation of this approach is its reliance on a limited set of visible test cases' in Section 3; 'which may not be ideal' regarding same prompts for all models), but these are scattered and not substantive.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "A few specific caveats exist (same prompts for all models, visible test case limitation, n=1 design) but these are brief inline mentions, not substantive discussion. No systematic analysis of what could invalidate the findings.",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The paper does not explicitly state what the results do NOT show. No mention of limitations to Python, function-level tasks, HumanEval-specific characteristics, or the narrow scope of the tested approaches. The conclusion speaks broadly about 'real-world programming scenarios.'",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding information, acknowledgments section, or grant numbers are provided anywhere in the paper.",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations are clearly stated: all three authors are from the Department of Computer Science and Software Engineering, United Arab Emirates University. They are not affiliated with any LLM provider being evaluated.",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "No funding source is disclosed, making it impossible to assess funder independence. University affiliation suggests academic funding, but without explicit disclosure this cannot be confirmed.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial interests statement is present in the paper.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms are explicitly defined: multi-agent collaboration (Section 2.1), runtime execution information-based debugging (Section 2.2), functional accuracy as pass@1 (Section 4.1.1), and code rigorousness as the HumanEval-to-HumanEval+ accuracy drop (Section 4.4).",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper clearly states its contribution: an empirical evaluation of combining multi-agent collaboration with runtime debugging across 19 LLMs on two benchmarks, with identification of optimal combinations for accuracy, rigor, and latency trade-offs.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 2 provides a substantive literature review covering AgentCoder, MapCoder, LDB, self-collaboration, self-debugging, and related frameworks. The methodology explicitly builds on [21] (ACT structure) and [23] (LDB debugging), situating contributions relative to these prior works.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "A GitHub repository is provided (https://github.com/nazmus-ashrafi/multiagent_vs_debugger) with agent prompts. Referenced in Section 3 and Section 4.3.1.",
    123           "source": "opus"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "The paper uses HumanEval and HumanEval+, both publicly available benchmark datasets. No proprietary data was collected.",
    129           "source": "opus"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "No requirements.txt, Dockerfile, or detailed environment setup is provided. The paper does not describe library versions or dependencies needed to reproduce the experiments.",
    135           "source": "opus"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No step-by-step reproduction instructions are included in the paper or referenced. The GitHub repo contains prompts but no described workflow for replication.",
    141           "source": "opus"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "All results in Table 2 are point estimates (pass@1 scores) with no confidence intervals or error bars. The t-tests report p-values but no CIs on the accuracy differences.",
    149           "source": "opus"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "One-tailed paired t-tests are used (Section 4.2) to compare ACT+Debug vs ACT alone and ACT+Debug vs Debug alone, with explicit hypothesis formulation and p-value reporting.",
    155           "source": "opus"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Mean accuracy differences are reported with baseline context: e.g., ACT+Debug mean 64.82% vs ACT alone 57.16% vs Debug alone 63.86% (Section 4.2). Per-model improvements are given in Table 2 with specific percentage point changes.",
    161           "source": "opus"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "No justification for why 19 LLMs were selected as the sample size. No power analysis is discussed. The choice appears driven by availability rather than statistical planning.",
    167           "source": "opus"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "The paper explicitly uses n=1 (one sample per problem) per Section 4.1.1: 'we chose to generate only one sample per problem (n=1).' No variance across runs is reported.",
    173           "source": "opus"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Six approaches are compared: Basic, AC, ACT, Debugger, AC+Debugger, and ACT+Debugger (Section 4.1.3, Figure 8). These include individual components and combinations.",
    181           "source": "opus"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Baselines include the LDB debugger (2024) and self-collaboration framework (2023), both recent at time of writing. Models tested include GPT-4o, Claude 3.5 Sonnet, DeepSeek-V3, all contemporary.",
    187           "source": "opus"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "RQ2 (Section 4.3) systematically removes/adds components: Basic → AC → ACT → Debugger → AC+Debug → ACT+Debug, isolating the contribution of each component. Figure 8 shows the composition of each segment.",
    193           "source": "opus"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "The paper evaluates pass@1 accuracy, code rigorousness (HumanEval vs HumanEval+ accuracy drop, RQ3), and generation latency (RQ4, Table 3).",
    199           "source": "opus"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": false,
    204           "justification": "Evaluation is entirely automated via pass/fail on test suites (HumanEval and HumanEval+). No human evaluation of code quality, readability, or correctness is performed.",
    205           "source": "opus"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "The paper explicitly separates 'visible test cases' (used during the framework for debugging) from 'hidden test cases, reserved for evaluating the final output' (Section 3).",
    211           "source": "opus"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Table 2 provides per-model breakdowns for all 19 LLMs across all 6 approaches on both datasets. Figures 4, 5 give detailed per-provider analysis.",
    217           "source": "opus"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Section 4.2.3 discusses cases where ACT hurts performance (e.g., QwQ-Preview where AC/ACT performed significantly worse than Basic). Section 4.3.2 discusses specific failures like agentic complexity reducing accuracy for Llama, DeepSeek, etc.",
    223           "source": "opus"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Key negative findings reported: ACT+Debug does not significantly outperform Debug alone (Section 4.2.2); adding a Tester agent often reduces accuracy (Section 4.3.2); ACT+Debug shows highest robustness drop (Section 4.4.1, Figure 11).",
    229           "source": "opus"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": false,
    236           "justification": "Table 1 lists model names like 'GPT-4o', 'Claude 3.5 Sonnet', 'DeepSeek-V3' without specific version IDs or snapshot dates. The paper only states 'All APIs were accessed in the month of December 2024' which is insufficient per the schema requirement for exact versions.",
    237           "source": "opus"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Agent prompts are available in the GitHub repository (https://github.com/nazmus-ashrafi/multiagent_vs_debugger), referenced in Sections 3 and 4.3.1: 'All agent prompts, including those used in the debugging process, are can be found in our GitHub repository.'",
    243           "source": "opus"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "No temperature, top-p, max tokens, or other LLM sampling parameters are reported. Only iteration limits are stated (retriesCT=3, retriesD=4 or 10). These significantly affect output quality.",
    249           "source": "opus"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "The multi-agent scaffolding is described in detail in Section 3 with Figure 1 showing the architecture. Agent roles (Analyst, Coder, Tester), interaction flow, retry logic (retriesCT, retriesD), and the CFG-based debugging mechanism are all documented.",
    255           "source": "opus"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Section 3 describes how HumanEval is segmented into three components: task description, visible test cases (for execution within framework), and hidden test cases (for final evaluation).",
    261           "source": "opus"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "No raw experimental data (generated code, execution logs, per-problem pass/fail results) is released. Only aggregated pass@1 scores are shown in Table 2.",
    269           "source": "opus"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Data sources are well described: HumanEval (164 tasks, Section 4.1.2) and HumanEval+ (80x more tests). API access timing documented (December 2024, Table 1).",
    275           "source": "opus"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants. Data sources are standard public benchmarks (HumanEval, HumanEval+).",
    281           "source": "opus"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": false,
    286           "justification": "The agent pipeline is described but the data pipeline from raw outputs to final metrics is not documented. No information on how many problems each model solved at each stage, how errors were handled, or what intermediate outputs looked like.",
    287           "source": "opus"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "The paper states APIs were accessed in December 2024 but does not state training data cutoff dates for any of the 19 models tested.",
    295           "source": "opus"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "No discussion of whether HumanEval/HumanEval+ problems appear in the training data of the 19 models, despite HumanEval being published in 2021 and widely known to be contaminated in newer models.",
    301           "source": "opus"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "HumanEval was published in July 2021. All 19 models tested were trained well after this date and likely saw HumanEval solutions during training. This contamination risk is not addressed anywhere in the paper.",
    307           "source": "opus"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants in this study. All evaluation is automated benchmark-based.",
    315           "source": "opus"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants in this study.",
    321           "source": "opus"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants in this study.",
    327           "source": "opus"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants in this study.",
    333           "source": "opus"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants in this study.",
    339           "source": "opus"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants in this study.",
    345           "source": "opus"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants in this study.",
    351           "source": "opus"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": true,
    358           "justification": "Latency is reported in detail: Table 3 summarizes average time per approach (7.68 to 68.42 minutes), and Figure 13 shows per-model latency across approaches.",
    359           "source": "opus"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "No total API costs, GPU hours, or hardware specifications are provided. Running 19 models × 6 approaches × 2 datasets represents substantial compute, but the total budget is not quantified.",
    365           "source": "opus"
    366         }
    367       },
    368       "experimental_rigor": {
    369         "seed_sensitivity_reported": {
    370           "applies": true,
    371           "answer": false,
    372           "justification": "Only n=1 (one sample per problem) is generated. No multiple seeds or runs are conducted. Section 4.1.1: 'we chose to generate only one sample per problem (n=1).'",
    373           "source": "opus"
    374         },
    375         "number_of_runs_stated": {
    376           "applies": true,
    377           "answer": true,
    378           "justification": "Explicitly stated in Section 4.1.1: 'we chose to generate only one sample per problem (n=1) in our experiments.' This is clear, even though it's a single run.",
    379           "source": "opus"
    380         },
    381         "hyperparameter_search_budget": {
    382           "applies": true,
    383           "answer": false,
    384           "justification": "No hyperparameter search is described. The iteration limits (retriesCT=3, retriesD=4/10) appear to be set by design, but no search over LLM hyperparameters (temperature, etc.) is reported.",
    385           "source": "opus"
    386         },
    387         "best_config_selection_justified": {
    388           "applies": true,
    389           "answer": true,
    390           "justification": "All six configurations are reported transparently in Table 2 with full per-model results. The paper does not selectively show only the best configuration — all approaches and all models are presented.",
    391           "source": "opus"
    392         },
    393         "multiple_comparison_correction": {
    394           "applies": true,
    395           "answer": false,
    396           "justification": "Multiple t-tests are conducted (ACT+Debug vs ACT, ACT+Debug vs Debug) without correction for multiple comparisons. The already-lenient α=0.15 exacerbates this issue.",
    397           "source": "opus"
    398         },
    399         "self_comparison_bias_addressed": {
    400           "applies": true,
    401           "answer": false,
    402           "justification": "No discussion of author-evaluation bias. The authors implement their own version of the ACT and LDB frameworks and compare them, but don't acknowledge that their implementations of baselines may systematically differ from the original authors' implementations.",
    403           "source": "opus"
    404         },
    405         "compute_budget_vs_performance": {
    406           "applies": true,
    407           "answer": true,
    408           "justification": "RQ4 (Section 4.5, Table 3) explicitly compares latency vs accuracy across approaches: 'The AC + Debugger configuration attains the highest average accuracy (61.7%) across both datasets while maintaining a reasonable execution time of 38.42 minutes.'",
    409           "source": "opus"
    410         },
    411         "benchmark_construct_validity": {
    412           "applies": true,
    413           "answer": false,
    414           "justification": "No discussion of whether HumanEval/HumanEval+ actually measures the code generation quality the paper claims to evaluate. These are function-level Python tasks, but the paper claims implications for 'real-world programming scenarios' without questioning the benchmark's validity for that purpose.",
    415           "source": "opus"
    416         },
    417         "scaffold_confound_addressed": {
    418           "applies": true,
    419           "answer": true,
    420           "justification": "The same scaffolding (ACT framework, LDB-based debugger) is applied consistently across all 19 models. Model comparisons use identical prompts and identical agent configurations, controlling for the scaffold variable.",
    421           "source": "opus"
    422         }
    423       },
    424       "data_leakage": {
    425         "temporal_leakage_addressed": {
    426           "applies": true,
    427           "answer": false,
    428           "justification": "HumanEval was published in 2021 and all tested models were trained after 2021. Solutions and discussions of HumanEval problems are widely available online. This temporal leakage is not addressed.",
    429           "source": "opus"
    430         },
    431         "feature_leakage_addressed": {
    432           "applies": true,
    433           "answer": false,
    434           "justification": "The visible test cases used during the debugging phase provide information about expected behavior. No discussion of whether this constitutes feature leakage relative to real-world usage conditions.",
    435           "source": "opus"
    436         },
    437         "non_independence_addressed": {
    438           "applies": true,
    439           "answer": false,
    440           "justification": "No discussion of whether HumanEval problems or their variants appeared in model training data. Given the widespread use of HumanEval, near-duplicate problems likely exist in training corpora.",
    441           "source": "opus"
    442         },
    443         "leakage_detection_method": {
    444           "applies": true,
    445           "answer": false,
    446           "justification": "No leakage detection or prevention methods are applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines are used.",
    447           "source": "opus"
    448         }
    449       }
    450     }
    451   },
    452   "claims": [
    453     {
    454       "claim": "The combined ACT+Debugger approach significantly outperforms ACT alone (mean 64.82% vs 57.16% on HumanEval, α=0.15).",
    455       "evidence": "Paired one-tailed t-test on 19 LLMs, H0,1 rejected at the non-standard α=0.15 significance level.",
    456       "supported": "weak"
    457     },
    458     {
    459       "claim": "The combined ACT+Debugger approach does NOT significantly outperform Debugger alone (0.96% improvement, H0,2 not rejected).",
    460       "evidence": "Paired t-test at α=0.15 comparing ACT+Debug vs Debug alone; improvement too small to reach statistical significance even at this lenient threshold.",
    461       "supported": "strong"
    462     },
    463     {
    464       "claim": "Debugging-based approaches substantially outperform agentic workflows across most models (Debugger 63.86% vs ACT 57.16% mean on HumanEval).",
    465       "evidence": "Table 2 shows Debugger achieving higher accuracy than ACT for the majority of 19 LLMs across both datasets.",
    466       "supported": "moderate"
    467     },
    468     {
    469       "claim": "AC+Debugger achieves the optimal balance of accuracy (61.7%), code rigorousness, and latency (38.42 min) among all six configurations.",
    470       "evidence": "Table 3 and Sections 4.4–4.5 compare mean accuracy, HumanEval+ drop, and latency; AC+Debug achieves highest HumanEval+ accuracy among combination approaches at moderate latency.",
    471       "supported": "moderate"
    472     },
    473     {
    474       "claim": "Increasing agentic complexity reduces code rigorousness: ACT+Debug has the highest HumanEval-to-HumanEval+ accuracy drop (137.74).",
    475       "evidence": "Figure 11 shows ACT+Debug produces the largest accuracy drop vs AC+Debug (110.41) and Debugger alone (107.84).",
    476       "supported": "moderate"
    477     },
    478     {
    479       "claim": "The benefit of combining ACT with Debugger is greatest when ACT and Debugger performances are similar; large performance gaps between the two diminish the value of combining them.",
    480       "evidence": "Figures 6 and 7 show an inverse correlation between the ACT/AC-Debugger performance gap and the accuracy improvement from their combination across 38 data points.",
    481       "supported": "moderate"
    482     }
    483   ],
    484   "methodology_tags": [
    485     "benchmark-eval"
    486   ],
    487   "key_findings": "Across 19 LLMs evaluated on HumanEval and HumanEval+, runtime debugging alone (63.86% mean accuracy on HumanEval) outperforms multi-agent workflows (57.16%) for most models, and combining the two yields only a statistically non-significant 0.96% improvement over debugging alone. A simpler two-agent Analyst-Coder plus Debugger configuration achieves the best trade-off: the highest code rigorousness among the combined configurations and 61.7% mean accuracy at 38.42 minutes, compared to the full ACT+Debugger chain, which requires 68.42 minutes with lower rigorousness. The effectiveness of combining approaches is governed by the performance gap between the individual techniques — when debugging significantly outperforms the agentic workflow for a given model, adding agentic collaboration introduces noise rather than benefit.",
    488   "red_flags": [
    489     {
    490       "flag": "Non-standard α=0.15 significance threshold",
    491       "detail": "The paper uses α=0.15 (15% significance level) for all hypothesis tests, three times the conventional α=0.05. This inflates Type I error rate substantially and makes the primary positive claim (ACT+Debug outperforms ACT alone) unreliable. The justification given ('detecting small performance gains') does not address why a non-standard threshold is scientifically appropriate."
    492     },
    493     {
    494       "flag": "No variance across runs — single-run point estimates",
    495       "detail": "All accuracy results are single-run point estimates (n=1 sample per problem per model). No multiple experiment repetitions are reported, meaning differences between configurations could reflect API non-determinism rather than systematic improvement. Paired t-tests on these single-run results treat stochastic outputs as fixed measurements."
    496     },
    497     {
    498       "flag": "HumanEval contamination unaddressed",
    499       "detail": "HumanEval was published in 2021. Most of the 19 tested models were trained after 2021, and their training corpora likely contain HumanEval solutions. This contamination risk is never acknowledged, which undermines the validity of pass@1 as an independent capability measure."
    500     },
    501     {
    502       "flag": "Overgeneralization beyond benchmark scope",
    503       "detail": "Claims about 'organizations seeking robust AI-driven coding solutions' and 'real-world AI applications' are based on 164 short, self-contained Python functions in HumanEval. Generalizing to multi-file codebases, non-Python languages, or complex software engineering tasks is unwarranted, and the paper does not state this limitation."
    504     },
    505     {
    506       "flag": "Temperature and sampling hyperparameters absent",
    507       "detail": "Critical parameters (temperature, top-p, max tokens) are not reported, making it impossible to reproduce results, understand variance, or compare with other studies that used different sampling configurations on the same models."
    508     },
    509     {
    510       "flag": "Statistical inference treats heterogeneous LLMs as exchangeable units",
    511       "detail": "Paired t-tests use n=19 LLMs as statistical observations, conflating models of vastly different architectures, sizes (3.8B to 671B parameters), training paradigms, and domains as interchangeable data points. This violates exchangeability assumptions and inflates confidence in the significance results."
    512     }
    513   ],
    514   "cited_papers": [
    515     {
    516       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    517       "relevance": "Primary benchmark dataset and source of the pass@k metric central to the entire evaluation."
    518     },
    519     {
    520       "title": "AgentCoder: Multi-Agent-based Code Generation with Iterative Testing and Optimisation",
    521       "relevance": "Key multi-agent code generation framework directly related to the agentic approach being evaluated."
    522     },
    523     {
    524       "title": "MapCoder: Multi-Agent Code Generation for Competitive Problem Solving",
    525       "relevance": "Multi-agent system combining retrieval, planning, coding, and debugging agents; representative prior work."
    526     },
    527     {
    528       "title": "Debug like a Human: A Large Language Model Debugger via Verifying Runtime Execution Step-by-step (LDB)",
    529       "relevance": "Direct methodological basis for the debugging phase; block-level CFG decomposition adopted from this work."
    530     },
    531     {
    532       "title": "Self-collaboration Code Generation via ChatGPT",
    533       "relevance": "Direct basis for the ACT (Analyst-Coder-Tester) multi-agent structure implemented in this paper."
    534     },
    535     {
    536       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    537       "relevance": "When combined with LDB, Reflexion achieved 98.2% on HumanEval; cited as motivation for combining agentic and debugging approaches."
    538     },
    539     {
    540       "title": "Teaching Large Language Models to Self-Debug",
    541       "relevance": "Self-debugging framework using code explanation and execution feedback; directly relevant runtime debugging baseline."
    542     },
    543     {
    544       "title": "RGD: Multi-LLM Based Agent Debugger via Refinement and Generation Guidance",
    545       "relevance": "Multi-LLM three-agent debugging system; contemporary comparison point for combined agent+debugging approaches."
    546     },
    547     {
    548       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation (HumanEval+)",
    549       "relevance": "The stricter benchmark (80x more tests) used to assess code rigorousness — central to the paper's rigorousness metric."
    550     },
    551     {
    552       "title": "From Code to Correctness: Closing the Last Mile of Code Generation with Hierarchical Debugging (MGDebugger)",
    553       "relevance": "Hierarchical debugging system for LLM code generation using bottom-up block analysis; contemporary approach reviewed."
    554     }
    555   ],
    556   "engagement_factors": {
    557     "practical_relevance": {
    558       "score": 3,
    559       "justification": "Directly actionable: use AC+Debugger (not the more complex ACT+Debugger) for best accuracy-latency trade-off; debugging dominates agentic workflows; model-specific guidance provided for 19 real APIs practitioners can access today."
    560     },
    561     "surprise_contrarian": {
    562       "score": 2,
    563       "justification": "Counterintuitive finding that simpler agentic workflows outperform complex ones, and that adding a Tester agent reduces code rigorousness — challenges the prevailing assumption that more agentic collaboration equals better results."
    564     },
    565     "fear_safety": {
    566       "score": 0,
    567       "justification": "No safety concerns, security issues, or alignment risks are raised; this is a purely performance-oriented engineering study."
    568     },
    569     "drama_conflict": {
    570       "score": 1,
    571       "justification": "Mild tension between multi-agent hype and the finding that straightforward runtime debugging dominates; no major controversy or conflict with influential prior work beyond showing complexity has diminishing returns."
    572     },
    573     "demo_ability": {
    574       "score": 2,
    575       "justification": "GitHub repository with prompts is linked; practitioners with API access to any of the 19 listed models could replicate the AC+Debugger setup using the disclosed prompts and publicly available HumanEval benchmark."
    576     },
    577     "brand_recognition": {
    578       "score": 2,
    579       "justification": "Evaluates GPT-4o, Claude 3.5 Sonnet, DeepSeek-V3, Llama 3, and Gemini — all high-profile models from recognizable labs — lending practical relevance to engineers already using these systems."
    580     }
    581   },
    582   "hn_data": {
    583     "threads": [
    584       {
    585         "hn_id": "43390400",
    586         "title": "Deep Learning Is Not So Mysterious or Different",
    587         "points": 485,
    588         "comments": 126,
    589         "url": "https://news.ycombinator.com/item?id=43390400",
    590         "created_at": "2025-03-17T16:47:02Z"
    591       },
    592       {
    593         "hn_id": "45291024",
    594         "title": "Launch HN: Cactus (YC S25) – AI inference on smartphones",
    595         "points": 123,
    596         "comments": 63,
    597         "url": "https://news.ycombinator.com/item?id=45291024",
    598         "created_at": "2025-09-18T15:40:29Z"
    599       },
    600       {
    601         "hn_id": "44430311",
    602         "title": "Small language models are the future of agentic AI",
    603         "points": 113,
    604         "comments": 45,
    605         "url": "https://news.ycombinator.com/item?id=44430311",
    606         "created_at": "2025-07-01T03:33:49Z"
    607       },
    608       {
    609         "hn_id": "44659764",
    610         "title": "Mitigating Tool Squatting and Rug Pull Attacks in Model Context Protocol (MCP)",
    611         "points": 5,
    612         "comments": 0,
    613         "url": "https://news.ycombinator.com/item?id=44659764",
    614         "created_at": "2025-07-23T14:42:26Z"
    615       },
    616       {
    617         "hn_id": "44246361",
    618         "title": "Small Language Models Are the Future of Agentic AI",
    619         "points": 5,
    620         "comments": 0,
    621         "url": "https://news.ycombinator.com/item?id=44246361",
    622         "created_at": "2025-06-11T11:16:33Z"
    623       },
    624       {
    625         "hn_id": "44003454",
    626         "title": "Twist: Teleoperated Whole-Body Imitation System",
    627         "points": 2,
    628         "comments": 0,
    629         "url": "https://news.ycombinator.com/item?id=44003454",
    630         "created_at": "2025-05-16T09:44:32Z"
    631       },
    632       {
    633         "hn_id": "23087191",
    634         "title": "A Survey on Dialog Management: Recent Advances and Challenges",
    635         "points": 2,
    636         "comments": 0,
    637         "url": "https://news.ycombinator.com/item?id=23087191",
    638         "created_at": "2020-05-06T01:52:26Z"
    639       },
    640       {
    641         "hn_id": "45549900",
    642         "title": "Agentic web browsing can't scale with cloud LLMs",
    643         "points": 1,
    644         "comments": 0,
    645         "url": "https://news.ycombinator.com/item?id=45549900",
    646         "created_at": "2025-10-11T15:29:17Z"
    647       },
    648       {
    649         "hn_id": "43291939",
    650         "title": "Deep Learning Is Not So Mysterious or Different",
    651         "points": 1,
    652         "comments": 0,
    653         "url": "https://news.ycombinator.com/item?id=43291939",
    654         "created_at": "2025-03-07T17:11:27Z"
    655       }
    656     ],
    657     "top_points": 485,
    658     "total_points": 737,
    659     "total_comments": 234
    660   }
    661 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs