scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30602B)
      1 {
      2   "paper": {
      3     "title": "A Deep Dive Into Large Language Model Code Generation Mistakes: What and Why?",
      4     "authors": [
      5       "QiHong Chen",
      6       "Jiachen Yu",
      7       "Jiawei Li",
      8       "Jiecheng Deng",
      9       "Justin Tian Jin Chen",
     10       "Iftekhar Ahmed"
     11     ],
     12     "year": 2024,
     13     "venue": "arXiv",
     14     "arxiv_id": "2411.01414",
     15     "doi": "10.48550/arXiv.2411.01414"
     16   },
     17   "scan_version": 3,
     18   "active_modules": [
     19     "experimental_rigor",
     20     "data_leakage"
     21   ],
     22   "methodology_tags": [
     23     "benchmark-eval",
     24     "qualitative"
     25   ],
     26   "key_findings": "Identifies 17 types of non-syntactic mistakes in LLM-generated code (10 new vs prior work) across GPT-4 and Qwen2.5-Coder on HumanEval-X and MBXP datasets. Six underlying reasons are identified through manual analysis and semi-automated verification, with 'Misleading Coding Question Specification' accounting for 56% of mistakes. GPT-4 with ReAct prompting achieves F1 of 0.78 for automated reason identification, though positional sensitivity remains poorly detected (F1=0.65).",
     27   "checklist": {
     28     "artifacts": {
     29       "code_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "A replication package is provided on figshare (reference [19]: https://figshare.com/s/10e27d42bf537f6321f7), described as containing prompts, data, and results."
     33       },
     34       "data_released": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "The paper provides a benchmark of 202 coding questions via the figshare replication package. The base datasets HumanEval-X and MBXP are publicly available standard benchmarks."
     38       },
     39       "environment_specified": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No environment specifications, requirements files, or dependency lists are mentioned in the paper."
     43       },
     44       "reproduction_instructions": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "No step-by-step reproduction instructions are provided in the paper. The replication package is referenced but no specific instructions for reproducing experiments are described."
     48       }
     49     },
     50     "statistical_methodology": {
     51       "confidence_intervals_or_error_bars": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "All results (precision, coverage rate, F1 scores) are reported as point estimates with no confidence intervals or error bars."
     55       },
     56       "significance_tests": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No statistical significance tests are performed. Comparisons between prompting approaches (Base vs Advanced vs ReAct) rely on raw F1 score differences without any hypothesis testing."
     60       },
     61       "effect_sizes_reported": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "F1 scores are reported for all three approaches (0.64, 0.73, 0.78) with per-reason breakdowns in Table 2, providing baseline context for understanding the magnitude of improvements. Precision (0.95-0.97) and coverage rates (0.94) are reported with human baselines (1.0 and 0.98-0.99) for comparison."
     65       },
     66       "sample_size_justified": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No justification for the sample size of 202 instances in the benchmark. No power analysis or discussion of whether this sample is sufficient for the claims made."
     70       },
     71       "variance_reported": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No variance or standard deviation reported. The paper states 'we set the LLMs' temperature to 0 and executed multiple runs' but does not report how many runs or any spread measures across them."
     75       }
     76     },
     77     "evaluation_design": {
     78       "baselines_included": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Three prompting approaches are compared (Base Prompt, Advanced Prompt, Advanced+ReAct). Human evaluators serve as an upper baseline for mistake identification (human precision 1.0 and CR 0.98-0.99 vs GPT-4's precision 0.95-0.97 and CR 0.94)."
     82       },
     83       "baselines_contemporary": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Comparisons are against contemporary methods: ReAct prompting (2022), self-consistency (2022), and the paper extends prior work by Fan et al. (2023), Song et al. (2023), and Tambon et al. (2025). Models used (GPT-4, Qwen2.5-Coder) are contemporary."
     87       },
     88       "ablation_study": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "The three-level prompting comparison (Base → Advanced → Advanced+ReAct) functions as an ablation, showing the contribution of human-labeled reason definitions and tool-based reasoning. Per-reason F1 breakdown in Table 2 shows which components improve which reasons."
     92       },
     93       "multiple_metrics": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Multiple metrics are used: Precision and Coverage Rate (CR) for mistake identification; F1 score (with precision and recall in the replication package) for reason identification; and Jaccard similarity for APR validation."
     97       },
     98       "human_evaluation": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Four authors with 5+ years programming experience manually reviewed all incorrect LLM-generated code, conducted open coding with negotiated agreement, and served as the ground truth for evaluating GPT-4's identification capabilities."
    102       },
    103       "held_out_test_set": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "The 202-instance benchmark used for RQ3 evaluation was constructed from the same data analyzed in RQ1 and RQ2. There is no indication that any data was held out during prompt development for the Base, Advanced, and ReAct approaches."
    107       },
    108       "per_category_breakdown": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Table 1 provides per-category and per-type mistake frequency with severity breakdowns. Table 2 provides per-reason F1 scores for all three prompting approaches."
    112       },
    113       "failure_cases_discussed": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Detailed failure examples are shown in Figures 1-3 with code snippets and explanations. Specific failure patterns are discussed (e.g., positional sensitivity F1=0.25 with Base Prompt, edge case handling failures)."
    117       },
    118       "negative_results_reported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Positional sensitivity detection remains poor even with the best method (F1=0.65). LLMs struggle with mathematical concepts (71.43% of math knowledge mistakes are Divergent Algorithm Design Errors). The paper acknowledges GPT-4 hallucinated some mistakes (precision < 1.0)."
    122       }
    123     },
    124     "claims_and_evidence": {
    125       "abstract_claims_supported": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Abstract claims are supported: 17 mistake types (Table 1), 10 new (highlighted in Table 1), 6 reasons (Section 5.2), GPT-4 F1 of 0.78 for reason identification (Table 2). All claims have corresponding evidence in the paper."
    129       },
    130       "causal_claims_justified": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The six identified reasons are causal claims ('X causes mistake Y'). The paper uses a semi-automated verification method: modifying the suspected cause (rephrasing, repositioning, adding examples) and checking if regenerated code passes all tests. This controlled manipulation provides reasonable causal evidence."
    134       },
    135       "generalization_bounded": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The title 'A Deep Dive Into Large Language Model Code Generation Mistakes' frames results broadly, but the study tests only GPT-4 and Qwen2.5-Coder on Python and Java using two benchmarks. The threats section acknowledges this but the title and abstract overgeneralize beyond the tested setting."
    139       },
    140       "alternative_explanations_discussed": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The threats-to-validity section discusses methodology limitations (prompt design, test case coverage) but does not consider alternative explanations for the observed results, such as whether the 6 identified reasons are the actual causes vs artifacts of the analysis method, or whether the mistake taxonomy could be structured differently."
    144       },
    145       "proxy_outcome_distinction": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The paper measures test case failures and manually categorizes mistakes, framing findings as 'code generation mistakes.' The claims match the granularity of measurements — they study mistakes (test failures) and call them mistakes, without inflating to broader claims about LLM reasoning or intelligence."
    149       }
    150     },
    151     "setup_transparency": {
    152       "model_versions_specified": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Specific model versions are stated: 'GPT-4-0125-preview (GPT-4)' and 'qwen2.5-coder-14b-instruct' (Section 3)."
    156       },
    157       "prompts_provided": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "The paper states 'the full prompt is included in our replication package [19]' (figshare link provided). While not included in the paper body, the prompts are in the linked repository."
    161       },
    162       "hyperparameters_reported": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Temperature set to 0 for code generation and reason identification, 0.5 for paraphrasing (Section 3, Section 5.1.1). CHATREPAIR used with default settings, 30 trials maximum."
    166       },
    167       "scaffolding_described": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The ReAct scaffolding for RQ3 is described in detail: three custom tools (Function Call Analysis Tool, Function Signature Explainer Tool, Coding Question Specification Ambiguity Check Tool) with their implementation described in Section 6.1.2. CHATREPAIR's iterative conversation process is described in Section 4.1."
    171       },
    172       "data_preprocessing_documented": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Data preparation is documented: LLMs prompted on all coding questions, syntactic failures filtered (27 discarded from 25 questions), test case results collected, APR applied, Jaccard similarity computed for validation (Section 3, Section 4.1)."
    176       }
    177     },
    178     "limitations_and_scope": {
    179       "limitations_section_present": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 7 'Threats to Validity' discusses construct, internal, and external validity with substantive content across three paragraphs."
    183       },
    184       "threats_to_validity_specific": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Specific threats discussed: prompt design may affect code quality, test cases may not be comprehensive (construct); manual examination may introduce incorrect understanding and overlooked mistakes (internal); results may not generalize to other LLMs, languages, or benchmarks (external). These are specific to this study."
    188       },
    189       "scope_boundaries_stated": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "Explicit scope boundaries: 'Our findings may not be generalizable to LLM-generated code from other benchmarks or programming languages beyond Java and Python and all available LLMs.' Also: 'the identified reasons...are not exhaustive since, with the evolution of LLMs, their mistakes and underlying reasons also evolve.'"
    193       }
    194     },
    195     "data_integrity": {
    196       "raw_data_available": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "A replication package is provided on figshare (reference [19]) containing the 202 coding question benchmark with labeled reasons. The base datasets (HumanEval-X, MBXP) are publicly available."
    200       },
    201       "data_collection_described": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Data collection is described: LLMs prompted on 2,268 coding questions (328 HumanEval-X + 1,940 MBXP), test cases executed, 27 syntactic failures discarded, remaining failures retained with test case information (Section 3)."
    205       },
    206       "recruitment_methods_described": {
    207         "applies": false,
    208         "answer": false,
    209         "justification": "No human participants. Data sources are standard public benchmarks (HumanEval-X and MBXP)."
    210       },
    211       "data_pipeline_documented": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Pipeline documented: prompt LLMs → execute test cases → filter syntactic errors (27 removed) → collect failure information → apply APR (CHATREPAIR, 30 trials max) → compare with ground truth → compute Jaccard similarity → select reference code (APR preferred in 99% of cases)."
    215       }
    216     },
    217     "conflicts_of_interest": {
    218       "funding_disclosed": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No funding source is mentioned anywhere in the paper. No acknowledgments section listing grants or sponsors."
    222       },
    223       "affiliations_disclosed": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "Author affiliations are clearly listed: UC Irvine, UIUC, UC Riverside. Authors are not affiliated with the companies (OpenAI, Alibaba) whose models are evaluated."
    227       },
    228       "funder_independent_of_outcome": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "Funding is not disclosed, so independence cannot be verified. The authors are university-affiliated with no apparent commercial interest, but without explicit funding disclosure this cannot be confirmed."
    232       },
    233       "financial_interests_declared": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No competing interests or financial interests statement is present in the paper."
    237       }
    238     },
    239     "contamination": {
    240       "training_cutoff_stated": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No training data cutoff dates are stated for either GPT-4-0125-preview or Qwen2.5-Coder, despite both being evaluated on benchmarks that could be in their training data."
    244       },
    245       "train_test_overlap_discussed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "No discussion of whether HumanEval-X or MBXP problems appeared in the training data of GPT-4 or Qwen2.5-Coder. HumanEval was published in 2021, well before these models' training cutoffs."
    249       },
    250       "benchmark_contamination_addressed": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "HumanEval (published 2021) and MBXP (published 2022) were both available online before GPT-4 and Qwen2.5-Coder were trained. No contamination discussion despite this being a significant concern for interpreting which mistakes are 'real' vs artifacts of memorization."
    254       }
    255     },
    256     "human_studies": {
    257       "pre_registered": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants. The manual analysis is performed by the authors as researchers, not as study subjects."
    261       },
    262       "irb_or_ethics_approval": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in the study."
    266       },
    267       "demographics_reported": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in the study."
    271       },
    272       "inclusion_exclusion_criteria": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in the study."
    276       },
    277       "randomization_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in the study."
    281       },
    282       "blinding_described": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in the study."
    286       },
    287       "attrition_reported": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "No human participants in the study."
    291       }
    292     },
    293     "cost_and_practicality": {
    294       "inference_cost_reported": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "No inference cost or API costs reported despite extensive use of GPT-4 API for code generation, APR (CHATREPAIR with up to 30 trials), paraphrasing, test case generation, and reason identification."
    298       },
    299       "compute_budget_stated": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No total computational budget stated. The study involves thousands of GPT-4 API calls across multiple experiments but provides no cost or compute information."
    303       }
    304     },
    305     "experimental_rigor": {
    306       "seed_sensitivity_reported": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "Temperature is set to 0 for deterministic outputs, but the paper mentions 'executed multiple runs' without reporting any variance across runs or seed sensitivity analysis."
    310       },
    311       "number_of_runs_stated": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The paper says 'executed multiple runs for each coding problem' but does not state the exact number of runs."
    315       },
    316       "hyperparameter_search_budget": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "No hyperparameter search is described. CHATREPAIR uses 'default settings as recommended by Xia et al.' but no search over configurations is reported for the prompting approaches."
    320       },
    321       "best_config_selection_justified": {
    322         "applies": true,
    323         "answer": true,
    324         "justification": "All three configurations (Base, Advanced, Advanced+ReAct) are reported transparently in Table 2 with results for each. No selective reporting of only the best configuration."
    325       },
    326       "multiple_comparison_correction": {
    327         "applies": false,
    328         "answer": false,
    329         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    330       },
    331       "self_comparison_bias_addressed": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The authors evaluate their own mistake taxonomy and reason identification benchmark without acknowledging potential bias from being both the creators of the ground truth and the evaluators of automated performance."
    335       },
    336       "compute_budget_vs_performance": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "The ReAct approach uses significantly more compute (multiple tool calls with GPT-4) than the Base or Advanced prompts, but performance is not reported as a function of compute budget."
    340       },
    341       "benchmark_construct_validity": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "No discussion of whether HumanEval-X and MBXP problems represent real-world coding challenges. The paper does not question whether coding competition-style problems capture the types of mistakes that matter in practice."
    345       },
    346       "scaffold_confound_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "The ReAct approach adds three custom tools (scaffolding) on top of GPT-4, but the paper does not discuss whether the performance gains come from the tools vs the reasoning approach. For code generation, the same prompts are used for both models, but no scaffold confound discussion is present."
    350       }
    351     },
    352     "data_leakage": {
    353       "temporal_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of temporal leakage. HumanEval (2021) and MBXP (2022) were published before GPT-4 and Qwen2.5-Coder were trained. Models may have memorized solutions, affecting which mistakes they make."
    357       },
    358       "feature_leakage_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of feature leakage. The prompts include full coding question specifications, function signatures, and input-output examples — no discussion of whether this setup leaks answer information."
    362       },
    363       "non_independence_addressed": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No discussion of whether training data included HumanEval or MBXP problems or structurally similar problems."
    367       },
    368       "leakage_detection_method": {
    369         "applies": true,
    370         "answer": false,
    371         "justification": "No concrete leakage detection method is applied (no canary strings, membership inference, or decontamination)."
    372       }
    373     }
    374   },
    375   "claims": [
    376     {
    377       "claim": "17 types of non-syntactic mistakes identified in LLM-generated code, 10 of which were overlooked by previous studies",
    378       "evidence": "Table 1 lists all 17 mistake types across 7 categories with frequency percentages and severity distributions. Newly identified types are highlighted. Analysis based on GPT-4 and Qwen2.5-Coder generated code on HumanEval-X and MBXP (Section 4.2).",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Six underlying reasons identified for LLM code generation mistakes, with Misleading Coding Question Specification accounting for 56.19%",
    383       "evidence": "Section 5.2 describes all six reasons with examples (Figures 3a-d). Semi-automated verification: modifying the suspected cause and regenerating code to check if it passes tests. Frequency breakdown: MCQS 56.19%, PIOD 21.26%, ITK 5.71%, EC 4.86%, MFS 4.44%, PS 4.12%.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "GPT-4 achieves precision of 0.95-0.97 and coverage rate of 0.94 for mistake identification, comparable to human evaluators",
    388       "evidence": "Section 6.2.1: GPT-4 precision 0.97 (HumanEval-X), 0.95 (MBXP), CR 0.94 on both. Humans: precision 1.0, CR 0.98-0.99. Single model evaluation, no variance reported.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "GPT-4 with ReAct prompting achieves F1 of 0.78 for reason identification, outperforming Base (0.64) and Advanced (0.73) prompts",
    393       "evidence": "Table 2 shows per-reason F1 scores for all three approaches. ReAct improves across all reasons, with strongest gain in PIOD (0.67→0.87) and consistent but smaller gains elsewhere.",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "LLMs frequently misapply mathematical concepts, with 71.43% of math knowledge mistakes being Divergent Algorithm Design Errors (most severe category)",
    398       "evidence": "Table 1 severity frequency column: Incorrect Math Knowledge Error shows 7.69% FADE, 20.88% PADE, 71.43% DADE. Example in Figure 1(b) shows incorrect standard deviation calculation. Math knowledge errors account for 14.24% of all mistakes.",
    399       "supported": "moderate"
    400     },
    401     {
    402       "claim": "APR-fixed code showed higher similarity to LLM code than ground truth in 99% of cases",
    403       "evidence": "Section 4.1: Jaccard similarity 0.88 for Qwen2.5-Coder APR code, 0.72-0.86 for GPT-4 APR code, vs 0.22-0.44 between LLM code and ground truth. Quantitative comparison across all instances.",
    404       "supported": "strong"
    405     }
    406   ],
    407   "red_flags": [
    408     {
    409       "flag": "No contamination discussion",
    410       "detail": "HumanEval (2021) and MBXP (2022) were published years before GPT-4 and Qwen2.5-Coder were trained. If models memorized solutions, the 'mistakes' observed may not represent genuine generation failures — they could reflect partial memorization artifacts. This fundamentally affects the validity of the mistake taxonomy."
    411     },
    412     {
    413       "flag": "GPT-4 used to evaluate GPT-4",
    414       "detail": "GPT-4 is used to paraphrase coding questions (Section 5.1.1), generate test inputs (Section 5.1.1), summarize function signatures (Section 6.1.2), check specification ambiguity (Section 6.1.2), AND identify mistakes and reasons (RQ3). This creates circularity: GPT-4's failure patterns may be systematically invisible to GPT-4's evaluation."
    415     },
    416     {
    417       "flag": "No statistical tests for comparisons",
    418       "detail": "All comparisons between prompting approaches (Base vs Advanced vs ReAct) and between GPT-4 vs human evaluators are made by comparing raw numbers without any significance testing. The differences could be within noise."
    419     },
    420     {
    421       "flag": "Unspecified number of runs",
    422       "detail": "The paper states 'executed multiple runs for each coding problem' but never specifies the exact number. This makes it impossible to assess the reliability of the code generation results and whether the 'most deterministic outputs' selection introduced bias."
    423     },
    424     {
    425       "flag": "Majority of mistakes attributed to specification ambiguity, not LLM capability",
    426       "detail": "56.19% of mistakes are attributed to Misleading Coding Question Specification — essentially blaming the benchmarks rather than the models. This raises the question of whether the paper is studying LLM limitations or benchmark limitations, undermining the title's framing."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "Evaluating large language models trained on code",
    432       "authors": [
    433         "Mark Chen",
    434         "Jerry Tworek"
    435       ],
    436       "year": 2021,
    437       "arxiv_id": "2107.03374",
    438       "relevance": "Introduced HumanEval benchmark and Codex model, foundational for LLM code generation evaluation."
    439     },
    440     {
    441       "title": "GPT-4 technical report",
    442       "authors": [
    443         "Josh Achiam"
    444       ],
    445       "year": 2023,
    446       "arxiv_id": "2303.08774",
    447       "relevance": "Technical report for GPT-4, one of the two models evaluated in this study."
    448     },
    449     {
    450       "title": "Automated repair of programs from large language models",
    451       "authors": [
    452         "Zhiyu Fan",
    453         "Xiang Gao",
    454         "Martin Mirchev"
    455       ],
    456       "year": 2023,
    457       "relevance": "Prior work identifying error categories in LLM-generated code from LeetCode problems with Codex."
    458     },
    459     {
    460       "title": "An Empirical Study of Code Generation Errors made by Large Language Models",
    461       "authors": [
    462         "Da Song",
    463         "Zijie Zhou"
    464       ],
    465       "year": 2023,
    466       "relevance": "Identified syntactic and semantic mistakes in LLM-generated code on HumanEval, a direct predecessor to this study."
    467     },
    468     {
    469       "title": "Bugs in large language models generated code: An empirical study",
    470       "authors": [
    471         "Florian Tambon",
    472         "Arghavan Moradi-Dakhel"
    473       ],
    474       "year": 2025,
    475       "relevance": "Examined 333 bugs from LLM-generated code identifying ten mistake categories, directly compared in this paper."
    476     },
    477     {
    478       "title": "Large language models and simple, stupid bugs",
    479       "authors": [
    480         "Kevin Jesse",
    481         "Toufique Ahmed"
    482       ],
    483       "year": 2023,
    484       "relevance": "Found LLMs produce SStuBs due to training data quality issues, informing the training-induced mistake hypothesis."
    485     },
    486     {
    487       "title": "LLM hallucinations in practical code generation: Phenomena, mechanism, and mitigation",
    488       "authors": [
    489         "Ziyao Zhang",
    490         "Yanlin Wang"
    491       ],
    492       "year": 2024,
    493       "arxiv_id": "2409.20550",
    494       "relevance": "Categorized eight types of code generation mistakes from CoderEval, directly compared in this study."
    495     },
    496     {
    497       "title": "Keep the Conversation Going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT",
    498       "authors": [
    499         "Steven Xia",
    500         "Lingming Zhang"
    501       ],
    502       "year": 2023,
    503       "arxiv_id": "2304.00385",
    504       "relevance": "CHATREPAIR automated program repair tool used as core methodology for fixing incorrect LLM-generated code."
    505     },
    506     {
    507       "title": "React: Synergizing reasoning and acting in language models",
    508       "authors": [
    509         "Shunyu Yao",
    510         "Jeffrey Zhao"
    511       ],
    512       "year": 2022,
    513       "arxiv_id": "2210.03629",
    514       "relevance": "ReAct prompting technique used for the best-performing reason identification approach in this study."
    515     },
    516     {
    517       "title": "ClarifyGPT: Empowering LLM-based Code Generation with Intention Clarification",
    518       "authors": [
    519         "Fangwen Mu",
    520         "Lin Shi"
    521       ],
    522       "year": 2023,
    523       "arxiv_id": "2310.10996",
    524       "relevance": "Addressed ambiguous coding requirements as a cause of LLM mistakes, related to the specification ambiguity finding."
    525     },
    526     {
    527       "title": "Qwen2.5-coder technical report",
    528       "authors": [
    529         "Binyuan Hui",
    530         "Jian Yang"
    531       ],
    532       "year": 2024,
    533       "arxiv_id": "2409.12186",
    534       "relevance": "Technical report for Qwen2.5-Coder, one of the two models evaluated in this study."
    535     },
    536     {
    537       "title": "Do Large Language Models Pay Similar Attention Like Human Programmers When Generating Code?",
    538       "authors": [
    539         "Bonan Kou",
    540         "Shengmai Chen"
    541       ],
    542       "year": 2024,
    543       "relevance": "Analyzed attention patterns in LLM code generation, related to the positional sensitivity finding in this study."
    544     }
    545   ],
    546   "engagement_factors": {
    547     "practical_relevance": {
    548       "score": 1,
    549       "justification": "Taxonomy of LLM code mistakes is informative but not directly actionable as a tool or technique practitioners can apply."
    550     },
    551     "surprise_contrarian": {
    552       "score": 1,
    553       "justification": "The finding that 56% of mistakes stem from ambiguous specifications rather than model limitations is mildly surprising but not a strong contrarian claim."
    554     },
    555     "fear_safety": {
    556       "score": 0,
    557       "justification": "No safety, security, or risk angle is discussed."
    558     },
    559     "drama_conflict": {
    560       "score": 1,
    561       "justification": "Implicitly questions benchmark quality by attributing most failures to specification ambiguity rather than model capability, but doesn't frame this as a controversy."
    562     },
    563     "demo_ability": {
    564       "score": 1,
    565       "justification": "Replication package on Figshare exists but no live demo or easy-to-run tool."
    566     },
    567     "brand_recognition": {
    568       "score": 1,
    569       "justification": "Uses GPT-4 and mentions OpenAI, but the authors are from UC Irvine, UIUC, and UC Riverside, not major AI labs."
    570     }
    571   }
    572 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs