ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (32632B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "A Deep Dive Into Large Language Model Code Generation Mistakes: What and Why?",
      6     "authors": [
      7       "QiHong Chen",
      8       "Jiachen Yu",
      9       "Jiawei Li",
     10       "Jiecheng Deng",
     11       "Justin Tian Jin Chen",
     12       "Iftekhar Ahmed"
     13     ],
     14     "year": 2024,
     15     "venue": "arXiv.org",
     16     "arxiv_id": "2411.01414",
     17     "doi": "10.48550/arXiv.2411.01414"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Abstract claims are supported: 17 mistake types (Table 1), 10 new (highlighted in Table 1), 6 reasons (Section 5.2), GPT-4 F1 of 0.78 for reason identification (Table 2). All claims have corresponding evidence in the paper.",
     25         "source": "opus"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The six identified reasons are causal claims ('X causes mistake Y'). The paper uses a semi-automated verification method: modifying the suspected cause (rephrasing, repositioning, adding examples) and checking if regenerated code passes all tests. This controlled manipulation provides reasonable causal evidence.",
     31         "source": "opus"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The title 'A Deep Dive Into Large Language Model Code Generation Mistakes' frames results broadly, but the study tests only GPT-4 and Qwen2.5-Coder on Python and Java using two benchmarks. The threats section acknowledges this but the title and abstract overgeneralize beyond the tested setting.",
     37         "source": "opus"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The threats-to-validity section discusses methodology limitations (prompt design, test case coverage) but does not consider alternative explanations for the observed results, such as whether the 6 identified reasons are the actual causes vs artifacts of the analysis method, or whether the mistake taxonomy could be structured differently.",
     43         "source": "opus"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper measures test case failures and manually categorizes mistakes, framing findings as 'code generation mistakes.' The claims match the granularity of measurements — they study mistakes (test failures) and call them mistakes, without inflating to broader claims about LLM reasoning or intelligence.",
     49         "source": "opus"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 7 'Threats to Validity' discusses construct, internal, and external validity with substantive content across three paragraphs.",
     57         "source": "opus"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Specific threats discussed: prompt design may affect code quality, test cases may not be comprehensive (construct); manual examination may introduce incorrect understanding and overlooked mistakes (internal); results may not generalize to other LLMs, languages, or benchmarks (external). These are specific to this study.",
     63         "source": "opus"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Explicit scope boundaries: 'Our findings may not be generalizable to LLM-generated code from other benchmarks or programming languages beyond Java and Python and all available LLMs.' Also: 'the identified reasons...are not exhaustive since, with the evolution of LLMs, their mistakes and underlying reasons also evolve.'",
     69         "source": "opus"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding source is mentioned anywhere in the paper, and there is no acknowledgments section listing grants or sponsors.",
     77         "source": "opus"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations are clearly listed: UC Irvine, UIUC, UC Riverside. Authors are not affiliated with the companies (OpenAI, Alibaba) whose models are evaluated.",
     83         "source": "opus"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "Funding is not disclosed, so independence cannot be verified. The authors are university-affiliated with no apparent commercial interest, but without explicit funding disclosure this cannot be confirmed.",
     89         "source": "opus"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests or financial interests statement is present in the paper.",
     95         "source": "opus"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Key terms are defined: 'non-syntactic mistakes' (two categories: runtime errors and test failures), severity levels FADE/PADE/DADE, and coverage rate CR are all formally defined.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 1 explicitly lists four numbered contributions: a mistake taxonomy, a reasons list, a 202-question benchmark, and an empirical LLM self-identification evaluation.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 2 reviews Fan et al., Song et al., Zhang et al., and Tambon et al. and explicitly identifies their limitations (small datasets, earlier LLMs, Python-only), positioning this work as an extension along each dimension.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": true,
    125           "justification": "A replication package is provided on figshare (reference [19]: https://figshare.com/s/10e27d42bf537f6321f7), described as containing prompts, data, and results.",
    126           "source": "opus"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "The paper provides a benchmark of 202 coding questions via the figshare replication package. The base datasets HumanEval-X and MBXP are publicly available standard benchmarks.",
    132           "source": "opus"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "No environment specifications, requirements files, or dependency lists are mentioned in the paper.",
    138           "source": "opus"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No step-by-step reproduction instructions are provided in the paper. The replication package is referenced but no specific instructions for reproducing experiments are described.",
    144           "source": "opus"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "All results (precision, coverage rate, F1 scores) are reported as point estimates with no confidence intervals or error bars.",
    152           "source": "opus"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "No statistical significance tests are performed. Comparisons between prompting approaches (Base vs Advanced vs ReAct) rely on raw F1 score differences without any hypothesis testing.",
    158           "source": "opus"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "F1 scores are reported for all three approaches (0.64, 0.73, 0.78) with per-reason breakdowns in Table 2, providing baseline context for understanding the magnitude of improvements. Precision (0.95-0.97) and coverage rates (0.94) are reported with human baselines (1.0 and 0.98-0.99) for comparison.",
    164           "source": "opus"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "No justification for the sample size of 202 instances in the benchmark. No power analysis or discussion of whether this sample is sufficient for the claims made.",
    170           "source": "opus"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "No variance or standard deviation reported. The paper states 'we set the LLMs' temperature to 0 and executed multiple runs' but does not report how many runs or any spread measures across them.",
    176           "source": "opus"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Three prompting approaches are compared (Base Prompt, Advanced Prompt, Advanced+ReAct). Human evaluators serve as an upper baseline for mistake identification (human precision 1.0 and CR 0.98-0.99 vs GPT-4's precision 0.95-0.97 and CR 0.94).",
    184           "source": "opus"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Comparisons are against contemporary methods: ReAct prompting (2022), self-consistency (2022), and the paper extends prior work by Fan et al. (2023), Song et al. (2023), and Tambon et al. (2025). Models used (GPT-4, Qwen2.5-Coder) are contemporary.",
    190           "source": "opus"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "The three-level prompting comparison (Base → Advanced → Advanced+ReAct) functions as an ablation, showing the contribution of human-labeled reason definitions and tool-based reasoning. Per-reason F1 breakdown in Table 2 shows which components improve which reasons.",
    196           "source": "opus"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Multiple metrics are used: Precision and Coverage Rate (CR) for mistake identification; F1 score (with precision and recall in the replication package) for reason identification; and Jaccard similarity for APR validation.",
    202           "source": "opus"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Four authors with 5+ years programming experience manually reviewed all incorrect LLM-generated code, conducted open coding with negotiated agreement, and served as the ground truth for evaluating GPT-4's identification capabilities.",
    208           "source": "opus"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": false,
    213           "justification": "The 202-instance benchmark used for RQ3 evaluation was constructed from the same data analyzed in RQ1 and RQ2. There is no indication that any data was held out during prompt development for the Base, Advanced, and ReAct approaches.",
    214           "source": "opus"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Table 1 provides per-category and per-type mistake frequency with severity breakdowns. Table 2 provides per-reason F1 scores for all three prompting approaches.",
    220           "source": "opus"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Detailed failure examples are shown in Figures 1-3 with code snippets and explanations. Specific failure patterns are discussed (e.g., positional sensitivity F1=0.25 with Base Prompt, edge case handling failures).",
    226           "source": "opus"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "Positional sensitivity detection remains poor even with the best method (F1=0.65). LLMs struggle with mathematical concepts (71.43% of math knowledge mistakes fall in the divergent DADE severity level). The paper acknowledges GPT-4 hallucinated some mistakes (precision < 1.0).",
    232           "source": "opus"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": true,
    239           "justification": "Specific model versions are stated: 'GPT-4-0125-preview (GPT-4)' and 'qwen2.5-coder-14b-instruct' (Section 3).",
    240           "source": "opus"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "The paper states 'the full prompt is included in our replication package [19]' (figshare link provided). While not included in the paper body, the prompts are in the linked repository.",
    246           "source": "opus"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "Temperature is set to 0 for code generation and reason identification, and to 0.5 for paraphrasing (Section 3, Section 5.1.1). CHATREPAIR is used with its default settings and a maximum of 30 trials.",
    252           "source": "opus"
    253         },
    254         "scaffolding_described": {
    255           "applies": true,
    256           "answer": true,
    257           "justification": "The ReAct scaffolding for RQ3 is described in detail: three custom tools (Function Call Analysis Tool, Function Signature Explainer Tool, Coding Question Specification Ambiguity Check Tool) with their implementation described in Section 6.1.2. CHATREPAIR's iterative conversation process is described in Section 4.1.",
    258           "source": "opus"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Data preparation is documented: LLMs prompted on all coding questions, syntactic failures filtered (27 discarded from 25 questions), test case results collected, APR applied, Jaccard similarity computed for validation (Section 3, Section 4.1).",
    264           "source": "opus"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": true,
    271           "justification": "A replication package is provided on figshare (reference [19]) containing the 202 coding question benchmark with labeled reasons. The base datasets (HumanEval-X, MBXP) are publicly available.",
    272           "source": "opus"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Data collection is described: LLMs prompted on 2,268 coding questions (328 HumanEval-X + 1,940 MBXP), test cases executed, 27 syntactic failures discarded, remaining failures retained with test case information (Section 3).",
    278           "source": "opus"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": false,
    282           "answer": false,
    283           "justification": "No human participants. Data sources are standard public benchmarks (HumanEval-X and MBXP).",
    284           "source": "opus"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "Pipeline documented: prompt LLMs → execute test cases → filter syntactic errors (27 removed) → collect failure information → apply APR (CHATREPAIR, 30 trials max) → compare with ground truth → compute Jaccard similarity → select reference code (APR preferred in 99% of cases).",
    290           "source": "opus"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": false,
    297           "justification": "No training data cutoff dates are stated for either GPT-4-0125-preview or Qwen2.5-Coder, despite both being evaluated on benchmarks that could be in their training data.",
    298           "source": "opus"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "No discussion of whether HumanEval-X or MBXP problems appeared in the training data of GPT-4 or Qwen2.5-Coder. HumanEval was published in 2021, well before these models' training cutoffs.",
    304           "source": "opus"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "HumanEval (published 2021) and MBXP (published 2022) were both available online before GPT-4 and Qwen2.5-Coder were trained. No contamination discussion despite this being a significant concern for interpreting which mistakes are 'real' vs artifacts of memorization.",
    310           "source": "opus"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "No human participants. The manual analysis is performed by the authors as researchers, not as study subjects.",
    318           "source": "opus"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human participants in the study.",
    324           "source": "opus"
    325         },
    326         "demographics_reported": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participants in the study.",
    330           "source": "opus"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants in the study.",
    336           "source": "opus"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human participants in the study.",
    342           "source": "opus"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participants in the study.",
    348           "source": "opus"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participants in the study.",
    354           "source": "opus"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "No inference cost or API costs reported despite extensive use of GPT-4 API for code generation, APR (CHATREPAIR with up to 30 trials), paraphrasing, test case generation, and reason identification.",
    362           "source": "opus"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "No total computational budget stated. The study involves thousands of GPT-4 API calls across multiple experiments but provides no cost or compute information.",
    368           "source": "opus"
    369         }
    370       },
    371       "experimental_rigor": {
    372         "seed_sensitivity_reported": {
    373           "applies": true,
    374           "answer": false,
    375           "justification": "Temperature is set to 0 for deterministic outputs, but the paper mentions 'executed multiple runs' without reporting any variance across runs or seed sensitivity analysis.",
    376           "source": "opus"
    377         },
    378         "number_of_runs_stated": {
    379           "applies": true,
    380           "answer": false,
    381           "justification": "The paper says 'executed multiple runs for each coding problem' but does not state the exact number of runs.",
    382           "source": "opus"
    383         },
    384         "hyperparameter_search_budget": {
    385           "applies": true,
    386           "answer": false,
    387           "justification": "No hyperparameter search is described. CHATREPAIR uses 'default settings as recommended by Xia et al.' but no search over configurations is reported for the prompting approaches.",
    388           "source": "opus"
    389         },
    390         "best_config_selection_justified": {
    391           "applies": true,
    392           "answer": true,
    393           "justification": "All three configurations (Base, Advanced, Advanced+ReAct) are reported transparently in Table 2 with results for each. No selective reporting of only the best configuration.",
    394           "source": "opus"
    395         },
    396         "multiple_comparison_correction": {
    397           "applies": false,
    398           "answer": false,
    399           "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable.",
    400           "source": "opus"
    401         },
    402         "self_comparison_bias_addressed": {
    403           "applies": true,
    404           "answer": false,
    405           "justification": "The authors evaluate their own mistake taxonomy and reason identification benchmark without acknowledging potential bias from being both the creators of the ground truth and the evaluators of automated performance.",
    406           "source": "opus"
    407         },
    408         "compute_budget_vs_performance": {
    409           "applies": true,
    410           "answer": false,
    411           "justification": "The ReAct approach uses significantly more compute (multiple tool calls with GPT-4) than the Base or Advanced prompts, but performance is not reported as a function of compute budget.",
    412           "source": "opus"
    413         },
    414         "benchmark_construct_validity": {
    415           "applies": true,
    416           "answer": false,
    417           "justification": "No discussion of whether HumanEval-X and MBXP problems represent real-world coding challenges. The paper does not question whether coding competition-style problems capture the types of mistakes that matter in practice.",
    418           "source": "opus"
    419         },
    420         "scaffold_confound_addressed": {
    421           "applies": true,
    422           "answer": false,
    423           "justification": "The ReAct approach adds three custom tools (scaffolding) on top of GPT-4, but the paper does not discuss whether the performance gains come from the tools vs the reasoning approach. For code generation, the same prompts are used for both models, but no scaffold confound discussion is present.",
    424           "source": "opus"
    425         }
    426       },
    427       "data_leakage": {
    428         "temporal_leakage_addressed": {
    429           "applies": true,
    430           "answer": false,
    431           "justification": "No discussion of temporal leakage. HumanEval (2021) and MBXP (2022) were published before GPT-4 and Qwen2.5-Coder were trained. Models may have memorized solutions, affecting which mistakes they make.",
    432           "source": "opus"
    433         },
    434         "feature_leakage_addressed": {
    435           "applies": true,
    436           "answer": false,
    437           "justification": "No discussion of feature leakage. The prompts include full coding question specifications, function signatures, and input-output examples — no discussion of whether this setup leaks answer information.",
    438           "source": "opus"
    439         },
    440         "non_independence_addressed": {
    441           "applies": true,
    442           "answer": false,
    443           "justification": "No discussion of whether training data included HumanEval or MBXP problems or structurally similar problems.",
    444           "source": "opus"
    445         },
    446         "leakage_detection_method": {
    447           "applies": true,
    448           "answer": false,
    449           "justification": "No concrete leakage detection method is applied (no canary strings, membership inference, or decontamination).",
    450           "source": "opus"
    451         }
    452       }
    453     }
    454   },
    455   "claims": [
    456     {
    457       "claim": "LLMs make 17 types of non-syntactic mistakes, 10 of which were overlooked by prior studies",
    458       "evidence": "Table 1 lists all 17 types with the new ones highlighted; the paper explains that the dataset size (11x larger than prior work) and multi-language scope enabled finding new types",
    459       "supported": "strong"
    460     },
    461     {
    462       "claim": "Misleading coding question specification is the most common cause of LLM mistakes, accounting for 56.19% of cases",
    463       "evidence": "Section 5.2 reports this percentage and provides examples; interventions (paraphrasing) confirmed the causal link in individual cases",
    464       "supported": "moderate"
    465     },
    466     {
    467       "claim": "GPT-4 achieves near-human precision (0.97 on HumanEval-X, 0.95 on MBXP) in identifying mistakes in LLM-generated code",
    468       "evidence": "Section 6.2.1 reports precision and coverage rate for both GPT-4 and human evaluators, showing GPT-4 close to human performance",
    469       "supported": "moderate"
    470     },
    471     {
    472       "claim": "ReAct prompting improves reason identification F1 from 0.73 (Advanced Prompt) to 0.78",
    473       "evidence": "Table 2 shows per-reason and average F1 scores across three prompt conditions; improvement is consistent across five of six reason categories",
    474       "supported": "moderate"
    475     },
    476     {
    477       "claim": "Math knowledge mistakes are predominantly divergent (DADE at 71.43%), unlike most other categories",
    478       "evidence": "Table 1 severity distribution shows math knowledge errors diverge most from intended functionality, consistent with the authors' interpretation that LLMs fundamentally misapply mathematical concepts",
    479       "supported": "strong"
    480     },
    481     {
    482       "claim": "LLMs struggle most with identifying positional sensitivity as a reason for mistakes (F1=0.65 even with ReAct)",
    483       "evidence": "Table 2 shows positional sensitivity has the lowest F1 across all conditions (0.25 base, 0.62 advanced, 0.65 ReAct), and the paper calls this out for future work",
    484       "supported": "strong"
    485     }
    486   ],
    487   "methodology_tags": [
    488     "qualitative",
    489     "benchmark-eval",
    490     "case-study"
    491   ],
    492   "key_findings": "The paper identifies 17 types of non-syntactic mistakes in GPT-4 and Qwen2.5-Coder generated Python and Java code, with 10 types not previously reported in the literature; the most common mistake types are Conditional Misalignment Error (15.65%) and Incorrect Math Knowledge Error (14.24%). Six underlying reasons are identified, dominated by misleading coding question specification (56.19%) and poor input-output demonstrations (21.26%); causal validation via prompt intervention provides semi-experimental support. GPT-4 can identify mistakes with near-human precision (~0.96) and can identify reasons with F1=0.78 using ReAct prompting, though positional sensitivity remains the hardest reason to detect (F1=0.65).",
    493   "red_flags": [
    494     {
    495       "flag": "No statistical significance tests",
    496       "detail": "Comparative claims (e.g., ReAct improves F1 by 5 points) are made without any significance tests or confidence intervals, making it impossible to assess whether differences are meaningful given the 202-instance benchmark."
    497     },
    498     {
    499       "flag": "Benchmark contamination unaddressed",
    500       "detail": "HumanEval-X and MBXP are widely known benchmarks almost certainly in GPT-4's and Qwen2.5-Coder's training data; the paper never discusses whether contamination affects the types or rates of mistakes observed."
    501     },
    502     {
    503       "flag": "Manual coding subjectivity",
    504       "detail": "The open-coding process for categorizing 17 mistake types and 6 reasons is inherently subjective; while negotiated agreement is used, inter-rater reliability (kappa or percent agreement) is never reported."
    505     },
    506     {
    507       "flag": "Small reason benchmark",
    508       "detail": "The 202-instance reason identification benchmark is small for a 6-class classification task; some categories have very few instances (e.g., MFS 4.44% ≈ 9 instances), making per-category F1 estimates unreliable."
    509     },
    510     {
    511       "flag": "APR repair validity assumption",
    512       "detail": "The paper assumes CHATREPAIR-fixed code represents the LLM's intended solution (Jaccard similarity 0.72-0.88), but this assumption is not independently validated and could inflate the count of correctable mistakes."
    513     }
    514   ],
    515   "cited_papers": [
    516     {
    517       "title": "Automated repair of programs from large language models",
    518       "relevance": "Fan et al. 2023 — prior work on LLM code mistakes that this paper directly extends, used as baseline comparison for mistake taxonomy"
    519     },
    520     {
    521       "title": "An Empirical Study of Code Generation Errors made by Large Language Models",
    522       "relevance": "Song et al. 2023 — key prior work on non-syntactic mistake categorization that this paper claims to expand upon"
    523     },
    524     {
    525       "title": "Bugs in large language models generated code: An empirical study",
    526       "relevance": "Tambon et al. 2025 — directly comparable empirical study on LLM code bugs used as comparison baseline"
    527     },
    528     {
    529       "title": "Evaluating large language models trained on code",
    530       "relevance": "Codex/HumanEval paper (Chen et al. 2021) — foundational benchmark and model for code generation research"
    531     },
    532     {
    533       "title": "Multi-lingual evaluation of code generation models (MBXP)",
    534       "relevance": "Athiwaratkun et al. 2022 — primary dataset used in this study"
    535     },
    536     {
    537       "title": "ReAct: Synergizing reasoning and acting in language models",
    538       "relevance": "Yao et al. 2022 — prompting technique used for reason identification in RQ3"
    539     },
    540     {
    541       "title": "Keep the Conversation Going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT (CHATREPAIR)",
    542       "relevance": "Xia & Zhang 2023 — APR tool used to obtain corrected reference code for mistake analysis"
    543     },
    544     {
    545       "title": "CodeGeeX: A pre-trained model for code generation with multilingual benchmarking on HumanEval-X",
    546       "relevance": "Zheng et al. 2023 — source of HumanEval-X dataset used in evaluation"
    547     }
    548   ],
    549   "engagement_factors": {
    550     "practical_relevance": {
    551       "score": 1,
    552       "justification": "Taxonomy of LLM code mistakes is informative but not directly actionable as a tool or technique practitioners can apply."
    553     },
    554     "surprise_contrarian": {
    555       "score": 1,
    556       "justification": "The finding that 56% of mistakes stem from ambiguous specifications rather than model limitations is mildly surprising but not a strong contrarian claim."
    557     },
    558     "fear_safety": {
    559       "score": 0,
    560       "justification": "No safety, security, or risk angle is discussed."
    561     },
    562     "drama_conflict": {
    563       "score": 1,
    564       "justification": "Implicitly questions benchmark quality by attributing most failures to specification ambiguity rather than model capability, but doesn't frame this as a controversy."
    565     },
    566     "demo_ability": {
    567       "score": 1,
    568       "justification": "Replication package on Figshare exists but no live demo or easy-to-run tool."
    569     },
    570     "brand_recognition": {
    571       "score": 1,
    572       "justification": "Uses GPT-4 and mentions OpenAI but authors are from UC Irvine and UIUC, not major AI labs."
    573     }
    574   },
    575   "hn_data": {
    576     "threads": [
    577       {
    578         "hn_id": "42307849",
    579         "title": "\"Oh, shit I opened the document \": Suspicious Mail in VR Headsets[pdf]",
    580         "points": 2,
    581         "comments": 1,
    582         "url": "https://news.ycombinator.com/item?id=42307849",
    583         "created_at": "2024-12-03T16:22:05Z"
    584       },
    585       {
    586         "hn_id": "40263764",
    587         "title": "A scalable approach to network reconstruction",
    588         "points": 2,
    589         "comments": 0,
    590         "url": "https://news.ycombinator.com/item?id=40263764",
    591         "created_at": "2024-05-05T10:37:34Z"
    592       },
    593       {
    594         "hn_id": "42465432",
    595         "title": "Glider: Small model beats GPT on eval tasks",
    596         "points": 2,
    597         "comments": 0,
    598         "url": "https://news.ycombinator.com/item?id=42465432",
    599         "created_at": "2024-12-19T20:33:09Z"
    600       },
    601       {
    602         "hn_id": "38873897",
    603         "title": "Static Deadlock Detection for Rust Programs",
    604         "points": 1,
    605         "comments": 0,
    606         "url": "https://news.ycombinator.com/item?id=38873897",
    607         "created_at": "2024-01-04T23:55:09Z"
    608       },
    609       {
    610         "hn_id": "38870705",
    611         "title": "Scalable network reconstruction in subquadratic time",
    612         "points": 1,
    613         "comments": 0,
    614         "url": "https://news.ycombinator.com/item?id=38870705",
    615         "created_at": "2024-01-04T18:48:09Z"
    616       }
    617     ],
    618     "top_points": 2,
    619     "total_points": 8,
    620     "total_comments": 1
    621   }
    622 }

Impressum · Datenschutz