scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25665B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "A Deep Dive Into Large Language Model Code Generation Mistakes: What and Why?",
      6     "authors": [
      7       "QiHong Chen",
      8       "Jiachen Yu",
      9       "Jiawei Li",
     10       "Jiecheng Deng",
     11       "Justin Tian Jin Chen",
     12       "Iftekhar Ahmed"
     13     ],
     14     "year": 2024,
     15     "venue": "arXiv.org",
     16     "arxiv_id": "2411.01414",
     17     "doi": "10.48550/arXiv.2411.01414"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "All four abstract claims are supported: 17 mistake types in Table 1, 10 newly identified; 6 reasons in Section 5.2; GPT-4 mistake identification precision ~0.96; ReAct F1=0.78 in Table 2.",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The paper makes causal claims that specific prompt features cause mistakes, and validates them by modifying the causative factor (rephrasing, repositioning) and checking whether regenerated code passes tests — a reasonable intervention design for this context.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Section 7 explicitly bounds findings to Python and Java, two specific LLMs, and two specific benchmarks, acknowledging results may not generalize to other languages, benchmarks, or LLMs.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper proposes 6 reasons for mistakes and validates them via intervention, but does not systematically discuss alternative explanations for why these factors cause errors or whether multiple reasons may interact.",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper uses test case pass/fail as a proxy for code correctness and explicitly acknowledges in Section 7 that 'test cases might not be comprehensive,' distinguishing measured outcomes from broader correctness.",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 7 'Threats to Validity' has three dedicated subsections: Construct validity, Internal validity, and External validity.",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Specific threats named include: prompt design influence, incomplete test coverage, manual examination bias, non-exhaustive reason identification, and limitation to Java/Python only — these go beyond generic disclaimers.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Explicit scope boundaries stated: two LLMs (GPT-4, Qwen2.5-Coder), two programming languages (Python, Java), two datasets (HumanEval-X, MBXP), and non-syntactic mistakes only.",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding acknowledgment or disclosure appears anywhere in the paper.",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "All six authors' university affiliations are disclosed on the title page (UCI, UIUC, UCR).",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No funding is disclosed, so independence of funder from outcome cannot be assessed.",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests statement or declaration of financial interests appears in the paper.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 3 defines 'non-syntactic mistakes' precisely (two categories: runtime errors and functional failures), and all 17 mistake types plus 3 severity levels (FADE, PADE, DADE) are formally defined.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The introduction lists 4 explicit contributions: a derived list of mistakes, a derived list of reasons, a 202-instance benchmark, and an empirical investigation of LLM auto-identification.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 2 engages substantively with Fan et al., Song et al., Tambon et al., and others, explicitly contrasting this study's scope (more data, newer models, two languages, causal analysis) with prior limitations.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": true,
    125           "justification": "A replication package is linked at https://figshare.com/s/10e27d42bf537f6321f7, referenced repeatedly throughout the paper for code, prompts, and results.",
    126           "source": "haiku"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "HumanEval-X and MBXP are standard public benchmarks; the 202-instance reason-identification benchmark is included in the replication package.",
    132           "source": "haiku"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "No requirements.txt, Dockerfile, or explicit dependency list is mentioned; tools like ast, javalang, BeautifulSoup, and all-mpnet-base-v2 are referenced but no environment spec is provided.",
    138           "source": "haiku"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "The paper refers to the replication package for details but provides no step-by-step reproduction instructions within the paper itself.",
    144           "source": "haiku"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "All results (precision, coverage rate, F1) are reported as single point estimates with no confidence intervals or error bars.",
    152           "source": "haiku"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "No statistical significance tests are applied when comparing prompt approaches (Base vs Advanced vs Advanced+ReAct) or when comparing GPT-4 to human evaluators.",
    158           "source": "haiku"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Actual metric values (F1, precision, coverage rate) are reported for all comparisons in Table 2, conveying the magnitude of differences between approaches.",
    164           "source": "haiku"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "The 202-instance benchmark size is not justified by power analysis; dataset sizes derive from the chosen benchmarks, not from a principled sample size determination.",
    170           "source": "haiku"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "Temperature is set to 0 for determinism, and no variance across runs is reported for any result.",
    176           "source": "haiku"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "The Base Prompt serves as a baseline for reason identification in RQ3, with Advanced Prompt and Advanced+ReAct as progressively enhanced conditions.",
    184           "source": "haiku"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "GPT-4 and Qwen2.5-Coder are both contemporary, top-performing models at time of study; no weak or stale baselines are used.",
    190           "source": "haiku"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "The three prompting conditions (Base, Advanced, Advanced+ReAct) constitute an ablation of increasing prompt complexity for reason identification.",
    196           "source": "haiku"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Precision, Coverage Rate (CR), and F1 score are all used for evaluation in RQ3; Table 1 also reports severity frequency distributions.",
    202           "source": "haiku"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Human evaluators (all authors, each with 5+ years of Python/Java experience) independently reviewed and labeled mistakes and reasons using open coding and negotiated agreement, providing the gold standard.",
    208           "source": "haiku"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "A 202-instance benchmark is constructed from the full analysis and used as a held-out evaluation set for assessing GPT-4 reason identification performance in RQ3.",
    214           "source": "haiku"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Table 1 breaks down 17 mistake types by category with severity frequencies; Table 2 breaks down F1 scores per reason across all three prompting approaches.",
    220           "source": "haiku"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "The paper discusses that positional sensitivity is the hardest reason for GPT-4 to identify (Base F1=0.25) and attributes this to limitations in attention-related reasoning.",
    226           "source": "haiku"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "Positional sensitivity identification failure (F1=0.25 for base prompt) is explicitly reported as a negative result, and the paper calls for future work to address it.",
    232           "source": "haiku"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": true,
    239           "justification": "Exact model versions are specified: GPT-4-0125-preview and qwen2.5-coder-14b-instruct.",
    240           "source": "haiku"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": false,
    245           "justification": "Prompts are described at a high level but not reproduced in the paper; the paper repeatedly defers to the replication package for full prompt text.",
    246           "source": "haiku"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "Temperature=0 for code generation (determinism), temperature=0.5 for paraphrasing and ambiguity checking are explicitly reported.",
    252           "source": "haiku"
    253         },
    254         "scaffolding_described": {
    255           "applies": true,
    256           "answer": true,
    257           "justification": "The ReAct scaffolding is described in detail including all three tools (Function Call Analysis, Function Signature Explainer, Coding Question Specification Ambiguity Check) with implementation specifics.",
    258           "source": "haiku"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Data pipeline is documented: prompt LLMs → run test cases → filter syntactic mistakes → apply APR (CHATREPAIR) → validate via Jaccard similarity → collect failures for analysis.",
    264           "source": "haiku"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": true,
    271           "justification": "The replication package on figshare contains the raw LLM-generated code with non-syntactic mistakes and associated test failure information.",
    272           "source": "haiku"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Section 3 describes the full data collection: prompting procedure, test execution, syntactic filtering, APR repair, and final dataset composition.",
    278           "source": "haiku"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": false,
    282           "answer": false,
    283           "justification": "No human participant recruitment; authors serve as annotators and are not recruited subjects.",
    284           "source": "haiku"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "The full pipeline from dataset selection through LLM prompting, test execution, APR repair, Jaccard validation, manual annotation, and evaluation is documented across Sections 3–5.",
    290           "source": "haiku"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": false,
    297           "justification": "No training data cutoff is stated for GPT-4-0125-preview or Qwen2.5-Coder despite evaluating them on public benchmarks.",
    298           "source": "haiku"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "The paper does not discuss whether HumanEval-X or MBXP problems appeared in either model's training data, which is a significant omission for capability evaluation.",
    304           "source": "haiku"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "HumanEval-X and MBXP are widely-used public benchmarks predating both models; potential contamination is not acknowledged or addressed anywhere in the paper.",
    310           "source": "haiku"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "No human subjects study; authors are annotators, not participants.",
    318           "source": "haiku"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human participants; IRB not applicable.",
    324           "source": "haiku"
    325         },
    326         "demographics_reported": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participants; demographics not applicable.",
    330           "source": "haiku"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants; inclusion/exclusion criteria not applicable.",
    336           "source": "haiku"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human experimental study; randomization not applicable.",
    342           "source": "haiku"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human experimental study; blinding not applicable.",
    348           "source": "haiku"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participants; attrition not applicable.",
    354           "source": "haiku"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "No inference cost or latency is reported for running GPT-4 on 2,268 coding problems or for the reason identification experiments.",
    362           "source": "haiku"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "No total computational budget is stated anywhere in the paper.",
    368           "source": "haiku"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "LLMs make 17 types of non-syntactic mistakes, 10 of which were overlooked by prior studies.",
    376       "evidence": "Table 1 enumerates all 17 types; highlighted types are the 10 new ones; comparison to Fan et al., Song et al., and Tambon et al. is made in related work and Section 4.2.",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "Misleading Coding Question Specification is the most common cause of mistakes, accounting for 56.19% of cases.",
    381       "evidence": "Section 5.2 reports this figure from the reason analysis; validated by paraphrasing prompts and re-testing.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "GPT-4 identifies non-syntactic mistakes with precision ~0.96 and coverage rate ~0.94, comparable to human evaluators.",
    386       "evidence": "Section 6.2.1 reports precision 0.97 (HumanEval-X) / 0.95 (MBXP) and CR 0.94 for GPT-4 vs precision 1.0 and CR 0.98-0.99 for humans.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "GPT-4 using ReAct achieves F1=0.78 for identifying reasons behind LLM code generation mistakes.",
    391       "evidence": "Table 2 shows Advanced Prompt+ReAct average F1=0.78 across 6 reason categories, up from 0.64 (Base) and 0.73 (Advanced).",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "Positional sensitivity in prompts causes LLMs to miss conditions, and simply repositioning information in the prompt can fix these mistakes.",
    396       "evidence": "Section 5.2 provides Figure 3(d) showing a concrete example where repositioning the 'y as vowel at end' rule corrected the LLM output; verified across 4.12% of mistakes.",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "LLMs make math knowledge errors (14.24% of mistakes) that stem from incorrect knowledge learned during training.",
    401       "evidence": "Section 5.2 item 3 (ITK, 5.71%) and IMKE (14.24% in Table 1) with Figure 1(b) showing a concrete variance formula error; causal attribution relies on cross-language comparison.",
    402       "supported": "weak"
    403     }
    404   ],
    405   "methodology_tags": [
    406     "qualitative",
    407     "benchmark-eval",
    408     "case-study"
    409   ],
    410   "key_findings": "The paper identifies 17 types of non-syntactic mistakes in code generated by GPT-4 and Qwen2.5-Coder across HumanEval-X and MBXP datasets, with 10 types previously unreported in the literature. The dominant cause of mistakes is misleading prompt specifications (56.19%), followed by poor input-output demonstrations (21.26%). GPT-4 can automatically identify these mistakes with high precision (~0.96) and coverage (~0.94), and can identify underlying reasons with F1=0.78 using ReAct prompting, though positional sensitivity remains a challenging case (F1=0.25 without augmentation).",
    411   "red_flags": [
    412     {
    413       "flag": "No significance tests",
    414       "detail": "All comparisons between prompting approaches (Base vs Advanced vs ReAct) and between GPT-4 and human evaluators are made without statistical significance tests, making it impossible to determine whether differences are meaningful."
    415     },
    416     {
    417       "flag": "Contamination not addressed",
    418       "detail": "HumanEval-X and MBXP are widely-used public benchmarks that almost certainly appear in GPT-4 and Qwen2.5-Coder training data; this is not acknowledged, which may inflate the apparent correctness of LLM outputs."
    419     },
    420     {
    421       "flag": "Annotator conflict of interest",
    422       "detail": "The same authors who identified and labeled the reasons in RQ2 also evaluated whether GPT-4 correctly identified those same reasons in RQ3, creating circular validation risk despite use of negotiated agreement."
    423     },
    424     {
    425       "flag": "No confidence intervals",
    426       "detail": "All metrics (precision, coverage rate, F1) are single point estimates with no uncertainty quantification, limiting interpretability of findings."
    427     },
    428     {
    429       "flag": "Paper format anomaly",
    430       "detail": "The ACM reference format shows the year 2018 and a placeholder DOI, suggesting the paper is a preprint not yet formally published, but this is not clearly disclosed."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "An Empirical Study of Code Generation Errors made by Large Language Models",
    436       "relevance": "Direct prior work this paper extends; identified 7 syntactic and non-syntactic mistake categories on HumanEval using ChatGPT."
    437     },
    438     {
    439       "title": "Bugs in large language models generated code: An empirical study",
    440       "relevance": "Prior work identifying 10 bug categories from LLM-generated code on CoderEval; directly compared against in this paper."
    441     },
    442     {
    443       "title": "Automated repair of programs from large language models",
    444       "relevance": "Fan et al. 2023 — identified 4 syntactic mistake categories; a key baseline for comparison in this study."
    445     },
    446     {
    447       "title": "Large language models and simple, stupid bugs",
    448       "relevance": "Attributes LLM code errors to training data quality issues; one of the foundational hypotheses tested in RQ2."
    449     },
    450     {
    451       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    452       "relevance": "The prompting technique used in RQ3 for automated reason identification; achieves the best F1=0.78."
    453     },
    454     {
    455       "title": "Multi-lingual evaluation of code generation models",
    456       "relevance": "MBXP dataset used in this study — 1,940 multilingual coding questions across Python and Java."
    457     },
    458     {
    459       "title": "CodeGeeX: A pre-trained model for code generation with multilingual benchmarking on HumanEval-X",
    460       "relevance": "HumanEval-X dataset used in this study — 328 coding problems in Python and Java with test cases."
    461     },
    462     {
    463       "title": "LLM hallucinations in practical code generation: Phenomena, mechanism, and mitigation",
    464       "relevance": "Related work on categorizing LLM code mistakes; Zhang et al. identified 8 mistake types on CoderEval-generated code."
    465     }
    466   ],
    467   "engagement_factors": {
    468     "practical_relevance": {
    469       "score": 2,
    470       "justification": "Directly actionable for developers: rephrasing prompts, adding edge case examples, and repositioning key instructions are all concrete techniques practitioners can apply."
    471     },
    472     "surprise_contrarian": {
    473       "score": 1,
    474       "justification": "Misleading prompt wording causing 56% of mistakes is a notable finding, but the overall framing (LLMs make predictable errors) is not surprising."
    475     },
    476     "fear_safety": {
    477       "score": 1,
    478       "justification": "Paper notes that incorrect LLM-generated code used in production (e.g., at Google) poses software quality risks, but this is framed gently rather than alarmingly."
    479     },
    480     "drama_conflict": {
    481       "score": 0,
    482       "justification": "No controversy or conflict angle; paper is a straightforward empirical classification study."
    483     },
    484     "demo_ability": {
    485       "score": 1,
    486       "justification": "Replication package is publicly available on figshare, enabling reproduction, but no interactive demo is provided."
    487     },
    488     "brand_recognition": {
    489       "score": 1,
    490       "justification": "Uses GPT-4 (OpenAI) and Qwen2.5-Coder (Alibaba), well-known models, but no famous lab affiliation among authors."
    491     }
    492   },
    493   "hn_data": {
    494     "threads": [
    495       {
    496         "hn_id": "42307849",
    497         "title": "\"Oh, shit I opened the document \": Suspicious Mail in VR Headsets[pdf]",
    498         "points": 2,
    499         "comments": 1,
    500         "url": "https://news.ycombinator.com/item?id=42307849",
    501         "created_at": "2024-12-03T16:22:05Z"
    502       },
    503       {
    504         "hn_id": "40263764",
    505         "title": "A scalable approach to network reconstruction",
    506         "points": 2,
    507         "comments": 0,
    508         "url": "https://news.ycombinator.com/item?id=40263764",
    509         "created_at": "2024-05-05T10:37:34Z"
    510       },
    511       {
    512         "hn_id": "42465432",
    513         "title": "Glider: Small model beats GPT on eval tasks",
    514         "points": 2,
    515         "comments": 0,
    516         "url": "https://news.ycombinator.com/item?id=42465432",
    517         "created_at": "2024-12-19T20:33:09Z"
    518       },
    519       {
    520         "hn_id": "38873897",
    521         "title": "Static Deadlock Detection for Rust Programs",
    522         "points": 1,
    523         "comments": 0,
    524         "url": "https://news.ycombinator.com/item?id=38873897",
    525         "created_at": "2024-01-04T23:55:09Z"
    526       },
    527       {
    528         "hn_id": "38870705",
    529         "title": "Scalable network reconstruction in subquadratic time",
    530         "points": 1,
    531         "comments": 0,
    532         "url": "https://news.ycombinator.com/item?id=38870705",
    533         "created_at": "2024-01-04T18:48:09Z"
    534       }
    535     ],
    536     "top_points": 2,
    537     "total_points": 8,
    538     "total_comments": 1
    539   }
    540 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs