scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25669B)
      1 {
      2   "scan_version": 3,
      3   "active_modules": [
      4     "experimental_rigor",
      5     "data_leakage"
      6   ],
      7   "paper": {
      8     "title": "Detecting and Correcting Hallucinations in LLM-Generated Code via Deterministic AST Analysis",
      9     "authors": [
     10       "Dipin Khati",
     11       "Daniel Rodriguez-Cardenas",
     12       "Paul Pantzer",
     13       "Denys Poshyvanyk"
     14     ],
     15     "year": 2026,
     16     "venue": "FORGE '26 (IEEE/ACM International Conference on AI Foundation Models and Software Engineering)",
     17     "arxiv_id": "2601.19106",
     18     "doi": "10.1145/3793655.3793725"
     19   },
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Replication package provided at GitHub (ref [3]: https://github.com/WM-SEMERU/Hallucinations-in-Code), explicitly stated as publicly available in §1."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The 200-sample dataset is stated to be part of the replication package: 'All data, code, and experimental configurations are publicly available in our replication package [3]' (§1)."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No environment specifications, requirements.txt, or dependency versions are mentioned in the paper. Only a GitHub link is provided."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions are provided in the paper. The replication package is mentioned but no instructions for running it are given."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Only point estimates are reported (100% precision, 87.6% recall, 77.0% fix accuracy). No confidence intervals or error bars."
     48       },
     49       "significance_tests": {
     50         "applies": false,
     51         "answer": false,
     52         "justification": "The paper does not make comparative claims between systems. It evaluates a single deterministic system with no stochastic comparisons requiring significance tests."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": false,
     56         "answer": false,
     57         "justification": "No comparative claims are made between systems, so effect sizes are not applicable. The paper reports absolute performance of a single system."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The dataset has 200 samples (161 hallucinated, 39 clean) but no justification for why this size was chosen or whether it is adequate for the claims made."
     63       },
     64       "variance_reported": {
     65         "applies": false,
     66         "answer": false,
     67         "justification": "The system is fully deterministic, so repeated runs produce identical results and there are no stochastic runs to report variance across. (Runtime is reported separately: 'completed in under 0.2 seconds', §2.5.)"
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No baseline systems are compared against. The paper only reports its own framework's performance. Related work discusses PICARD, Synchromesh, LLM-in-the-loop repair, and Structural Trimming but does not compare against them experimentally."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "No baselines are included, so contemporaneity cannot be assessed."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "The framework has multiple components (AST parser, KB, validation rules for unknown API/bare calls/semantic inconsistency, correction module) but no ablation study isolating their contributions."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Reports precision, recall, F1-score for detection, and fix accuracy for correction (Tables 1-4, §3)."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No human evaluation of the corrections is mentioned. Evaluation is entirely automated. Human evaluation would be relevant to assess whether corrections are semantically appropriate."
     95       },
     96       "held_out_test_set": {
     97         "applies": false,
     98         "answer": false,
     99         "justification": "The system is not trained/tuned — it is a deterministic rule-based framework. There is no training/validation/test split concern."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Tables 3 and 4 provide breakdowns by KCH type (Missing Imports, Mis-typed API Calls, Contextual Mismatches) and by library (numpy, pandas, matplotlib, json, requests)."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "§4 provides manual analysis of 37 failed cases (20 false negatives, 17 failed corrections) with specific examples like 'plt.plotx instead of plt.plot' and the surface-typo-vs-semantic-error problem."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Contextual Mismatches had only 33.3% detection rate and 0.0% correction accuracy (Table 3). Pandas had only 56.2% correction accuracy (Table 4). These are clearly negative results."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Abstract claims of 100% precision, 87.6% recall, 0.934 F1, and 77.0% fix accuracy are all directly supported by results in §3 and Tables 1-4."
    122       },
    123       "causal_claims_justified": {
    124         "applies": false,
    125         "answer": false,
    126         "justification": "The paper does not make causal claims. It reports detection/correction performance of a deterministic tool without claiming causal relationships."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The abstract claims the framework offers 'a clear path toward trustworthy code generation' broadly, but it was tested only on 200 Python snippets across 5 libraries. The title says 'LLM-Generated Code' without bounding to Python. §4 acknowledges some limitations but the framing in abstract/title/conclusion overgeneralizes."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "No alternative explanations for the results are discussed. For instance, the high precision could partly reflect the simplicity of the curated dataset rather than inherent framework reliability."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The paper measures detection/correction on a curated dataset but frames results as 'trustworthy code generation' and 'reliable alternative to probabilistic repair' without discussing the gap between curated-dataset performance and real-world code generation trustworthiness."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "The dataset was generated using 'GPT-5' (§2.6) but no version, snapshot date, or API version is specified."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "Dataset was generated by 'prompting GPT-5 with task-oriented instructions' (§2.6) but the actual prompts used are not provided."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "No hyperparameters for GPT-5 generation (temperature, top-p, etc.) are reported. The edit-distance threshold for correction is also not specified."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. The framework is a deterministic static analysis pipeline."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "The paper states 200 samples were 'curated' from GPT-5 output but does not describe how many were initially generated, what curation criteria were applied, or how the 161/39 hallucinated/clean split was determined."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "§4 (Discussion and Future Work) contains a substantial paragraph beginning 'We must acknowledge the limitations of this study' discussing dataset size, library coverage, single-file analysis, and scope of targeted errors."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "§4 discusses specific threats: '200-sample dataset...is not exhaustive', 'error distribution may not reflect real-world prevalence', 'Knowledge Base was limited to five Python libraries', 'does not yet handle multi-module dataflows'."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "§4 explicitly states: 'our approach deliberately targets KCHs and does not attempt to solve more complex, multi-line logical errors' and 'currently focuses on single-file, function-level analysis.'"
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "Replication package [3] is stated to contain all data, code, and experimental configurations."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "§2.6 describes dataset construction: 200 Python samples generated by prompting GPT-5 for 5 target libraries, composed of 161 hallucinated (3 categories) and 39 clean samples."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants were involved. The data consists of code snippets generated by an LLM (GPT-5), so there is no participant recruitment to describe."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "The paper does not describe how GPT-5 outputs were selected, filtered, or curated into the final 200. How many were generated initially? What criteria determined inclusion? This is undocumented."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding source is mentioned anywhere in the paper."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "All four authors are listed with William & Mary affiliation. No product being evaluated is affiliated with the authors."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No funding is disclosed, so independence cannot be assessed. Absence of disclosure is not evidence of absence of conflict."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial interests statement is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "The system being evaluated is a deterministic static analysis tool, not a pre-trained model. Contamination of model training data is not relevant to evaluating this framework."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "The framework is rule-based, not trained. No train/test overlap concern exists."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "The framework is deterministic and not trained on any data. Benchmark contamination is not applicable."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "§2.5 reports: 'the end-to-end analysis of all 200 samples completed in under 0.2 seconds on a single laptop CPU.'"
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "The compute is minimal and stated: under 0.2 seconds on a single laptop CPU for all 200 samples (§2.5)."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": false,
    301         "answer": false,
    302         "justification": "The framework is fully deterministic with no random seeds. No stochastic component exists."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": false,
    306         "answer": false,
    307         "justification": "Deterministic system — a single run always produces the same output. Multiple runs are unnecessary."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "The framework likely has tunable parameters (e.g., edit distance threshold for fuzzy matching) but no hyperparameter search is described."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "No discussion of how design choices (e.g., edit distance thresholds, semantic cue definitions) were selected or validated."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": false,
    321         "answer": false,
    322         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors evaluate their own system on their own curated dataset without acknowledging the potential bias of designing both the tool and its evaluation data."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": false,
    331         "answer": false,
    332         "justification": "No comparison with other systems, so compute-matched comparison is not applicable."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "The paper does not analyze whether the 200-sample curated dataset is representative of the real-world KCH distribution. §4 only briefly acknowledges that the 'error distribution may not reflect real-world prevalence', without any construct-validity analysis."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "No scaffolding is involved. The system is a standalone static analysis tool."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": false,
    348         "answer": false,
    349         "justification": "The evaluated system is a deterministic rule-based tool, not a trained model. Temporal leakage of training data is not applicable."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": false,
    353         "answer": false,
    354         "justification": "The evaluated system is rule-based, not a trained model. Feature leakage is not applicable."
    355       },
    356       "non_independence_addressed": {
    357         "applies": false,
    358         "answer": false,
    359         "justification": "No trained model is evaluated. Non-independence of train/test data is not applicable."
    360       },
    361       "leakage_detection_method": {
    362         "applies": false,
    363         "answer": false,
    364         "justification": "No trained model is evaluated. Leakage detection is not applicable."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "The framework detects KCHs with 100% precision (zero false positives) and 87.6% recall (0.934 F1-score).",
    371       "evidence": "Table 1 confusion matrix: 141 TP, 0 FP, 20 FN, 39 TN on 200-sample dataset (§3).",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "The framework auto-corrects 77.0% (124/161) of identified hallucinations.",
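    376       "evidence": "Table 2 shows 124 successfully corrected out of all 161 hallucinated samples (§3). The denominator includes the 20 undetected cases; among the 141 detected hallucinations the correction rate is 124/141 (87.9%).",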
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "The deterministic approach is a viable and reliable alternative to probabilistic repair.",
    381       "evidence": "Results on the curated dataset, but no direct comparison with probabilistic methods. Claim is based on achieving high precision and reasonable correction rate (§3-4).",
    382       "supported": "weak"
    383     },
    384     {
    385       "claim": "Detection was near-perfect for Missing Imports (97.9%) and numpy (100.0%) but lowest for Contextual Mismatches (33.3%) and matplotlib (72.2%).",
    386       "evidence": "Tables 3 and 4 provide per-category and per-library breakdowns (§3).",
    387       "supported": "strong"
    388     }
    389   ],
    390   "methodology_tags": [
    391     "benchmark-eval"
    392   ],
    393   "key_findings": "A deterministic AST-based framework for detecting Knowledge Conflicting Hallucinations in LLM-generated code achieves 100% precision and 87.6% recall on a 200-sample curated dataset, with a 77% automatic correction rate. Performance varies substantially by error type and library — Missing Imports are detected almost perfectly (97.9%) while Contextual Mismatches are poorly detected (33.3%) and never corrected. The framework runs in under 0.2 seconds for all 200 samples, demonstrating practical efficiency.",
    394   "red_flags": [
    395     {
    396       "flag": "Self-curated evaluation dataset",
    397       "detail": "The 200-sample dataset was generated and curated by the authors specifically for this tool. No independent or real-world benchmark is used. The dataset construction process (how GPT-5 outputs were selected/filtered) is not fully documented, raising concerns about whether the dataset favors the tool's capabilities."
    398     },
    399     {
    400       "flag": "No baseline comparisons",
    401       "detail": "The paper claims the framework is 'a viable and reliable alternative to probabilistic repair' but does not compare against any existing tool (PICARD, Synchromesh, LLM-in-the-loop repair, Structural Trimming, mypy). Without such a comparison, the claim of being a viable alternative is argumentative, not empirical."
    402     },
    403     {
    404       "flag": "Small and imbalanced dataset",
    405       "detail": "200 samples total with only 39 clean (negative) samples and only 3 Contextual Mismatch examples. The 100% precision claim rests on 39 negatives. The category with worst performance has only 3 samples — too few for meaningful conclusions."
    406     },
    407     {
    408       "flag": "Overclaiming in title and abstract",
    409       "detail": "Title says 'LLM-Generated Code' generically but evaluation is limited to 200 Python snippets across 5 libraries. Abstract claims 'a clear path toward trustworthy code generation' from a narrow evaluation."
    410     }
    411   ],
    412   "cited_papers": [
    413     {
    414       "title": "Static Analysis as a Feedback Loop: Enhancing LLM-Generated Code Beyond Correctness",
    415       "authors": [
    416         "Scott Blyth",
    417         "Sherlock A. Licorish",
    418         "Christoph Treude",
    419         "Markus Wagner"
    420       ],
    421       "year": 2025,
    422       "arxiv_id": "2508.14419",
    423       "relevance": "LLM-in-the-loop repair using static analysis feedback, a direct comparison point for non-deterministic repair approaches."
    424     },
    425     {
    426       "title": "Mapping the Trust Terrain: LLMs in Software Engineering - Insights and Perspectives",
    427       "authors": [
    428         "Dipin Khati",
    429         "Yijin Liu",
    430         "David N. Palacio",
    431         "Yixuan Zhang",
    432         "Denys Poshyvanyk"
    433       ],
    434       "year": 2025,
    435       "doi": "10.1145/3771282",
    436       "relevance": "Empirical study on developer trust in LLM-generated code, directly relevant to understanding trust erosion from code hallucinations."
    437     },
    438     {
    439       "title": "Hallucination by Code Generation LLMs: Taxonomy, Benchmarks, Mitigation, and Challenges",
    440       "authors": [
    441         "Yunseo Lee",
    442         "John Youngeun Song",
    443         "Dongsun Kim"
    444       ],
    445       "year": 2025,
    446       "arxiv_id": "2504.20799",
    447       "relevance": "Taxonomy of code generation hallucinations with benchmarks, directly relevant to understanding and categorizing LLM code errors."
    448     },
    449     {
    450       "title": "Exploring and Evaluating Hallucinations in LLM-Powered Code Generation",
    451       "authors": [
    452         "Fang Liu",
    453         "Yang Liu",
    454         "Lin Shi",
    455         "Houkun Huang",
    456         "Ruifeng Wang"
    457       ],
    458       "year": 2024,
    459       "arxiv_id": "2404.00971",
    460       "relevance": "Defines Knowledge Conflicting Hallucinations (KCHs), the central concept this paper builds upon."
    461     },
    462     {
    463       "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
    464       "authors": [
    465         "Sida Peng",
    466         "Eirini Kalliamvakou",
    467         "Peter Cihon",
    468         "Mert Demirer"
    469       ],
    470       "year": 2023,
    471       "arxiv_id": "2302.06590",
    472       "relevance": "Key study on Copilot's productivity impact, relevant to understanding the LLM code generation landscape."
    473     },
    474     {
    475       "title": "Bugs in Large Language Models Generated Code: An Empirical Study",
    476       "authors": [
    477         "Florian Tambon",
    478         "Arghavan Moradi Dakhel",
    479         "Amin Nikanjam",
    480         "Foutse Khomh"
    481       ],
    482       "year": 2024,
    483       "arxiv_id": "2403.08937",
    484       "relevance": "Empirical study documenting bug patterns in LLM-generated code, establishing the taxonomy this paper targets."
    485     },
    486     {
    487       "title": "Towards Understanding the Characteristics of Code Generation Errors Made by Large Language Models",
    488       "authors": [
    489         "Zhijie Wang",
    490         "Zijie Zhou",
    491         "Da Song"
    492       ],
    493       "year": 2025,
    494       "arxiv_id": "2406.08731",
    495       "relevance": "Characterizes error types in LLM code generation, complementary to the KCH taxonomy."
    496     },
    497     {
    498       "title": "LLMLOOP: Improving LLM-Generated Code and Tests through Automated Iterative Feedback Loops",
    499       "authors": [
    500         "Ravin Ravi",
    501         "Dylan Bradshaw",
    502         "Stefano Ruberto"
    503       ],
    504       "year": 2025,
    505       "doi": "10.1109/ICSME64153.2025.00109",
    506       "relevance": "LLM-in-the-loop iterative repair approach, a non-deterministic alternative to the deterministic approach proposed here."
    507     },
    508     {
    509       "title": "Cutting the Root of Hallucination: Structural Trimming for Vulnerability Mitigation in Code LLMs",
    510       "authors": [
    511         "Yage Zhang"
    512       ],
    513       "year": 2025,
    514       "relevance": "AST-based pruning approach for code hallucinations — deletion-based rather than correction-based, a direct comparison point."
    515     },
    516     {
    517       "title": "Fixing Function-Level Code Generation Errors for Foundation Large Language Models",
    518       "authors": [
    519         "Hao Wen",
    520         "Yueheng Zhu",
    521         "Chao Liu"
    522       ],
    523       "year": 2025,
    524       "arxiv_id": "2409.00676",
    525       "relevance": "Addresses function-level code generation error fixing, related to the correction task in this paper."
    526     }
    527   ],
    528   "engagement_factors": {
    529     "practical_relevance": {
    530       "score": 2,
    531       "justification": "AST-based hallucination detection for LLM code is directly applicable to developer workflows, though the tool only covers 5 Python libraries currently."
    532     },
    533     "surprise_contrarian": {
    534       "score": 0,
    535       "justification": "The idea that static analysis can catch API misuse is well-understood; the results confirm expectations rather than challenging them."
    536     },
    537     "fear_safety": {
    538       "score": 0,
    539       "justification": "Addresses code correctness rather than safety, security, or misuse concerns."
    540     },
    541     "drama_conflict": {
    542       "score": 0,
    543       "justification": "No controversy or conflict; positions itself as complementary to existing approaches without challenging specific claims."
    544     },
    545     "demo_ability": {
    546       "score": 1,
    547       "justification": "Code is available on GitHub but requires setup with specific libraries and the custom dataset; not a quick-try tool."
    548     },
    549     "brand_recognition": {
    550       "score": 0,
    551       "justification": "From William & Mary's SEMERU lab, not a widely recognized institution in the AI/ML community."
    552     }
    553   }
    554 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs