scan.json (25669B)
1 { 2 "scan_version": 3, 3 "active_modules": [ 4 "experimental_rigor", 5 "data_leakage" 6 ], 7 "paper": { 8 "title": "Detecting and Correcting Hallucinations in LLM-Generated Code via Deterministic AST Analysis", 9 "authors": [ 10 "Dipin Khati", 11 "Daniel Rodriguez-Cardenas", 12 "Paul Pantzer", 13 "Denys Poshyvanyk" 14 ], 15 "year": 2026, 16 "venue": "FORGE '26 (IEEE/ACM International Conference on AI Foundation Models and Software Engineering)", 17 "arxiv_id": "2601.19106", 18 "doi": "10.1145/3793655.3793725" 19 }, 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": true, 25 "justification": "Replication package provided at GitHub (ref [3]: https://github.com/WM-SEMERU/Hallucinations-in-Code), explicitly stated as publicly available in §1." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "The 200-sample dataset is stated to be part of the replication package: 'All data, code, and experimental configurations are publicly available in our replication package [3]' (§1)." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "No environment specifications, requirements.txt, or dependency versions are mentioned in the paper. Only a GitHub link is provided." 36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step reproduction instructions are provided in the paper. The replication package is mentioned but no instructions for running it are given." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "Only point estimates are reported (100% precision, 87.6% recall, 77.0% fix accuracy). No confidence intervals or error bars." 48 }, 49 "significance_tests": { 50 "applies": false, 51 "answer": false, 52 "justification": "The paper does not make comparative claims between systems. 
It evaluates a single deterministic system with no stochastic comparisons requiring significance tests." 53 }, 54 "effect_sizes_reported": { 55 "applies": false, 56 "answer": false, 57 "justification": "No comparative claims are made between systems, so effect sizes are not applicable. The paper reports absolute performance of a single system." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "The dataset has 200 samples (161 hallucinated, 39 clean) but no justification for why this size was chosen or whether it is adequate for the claims made." 63 }, 64 "variance_reported": { 65 "applies": false, 66 "answer": false, 67 "justification": "The system is fully deterministic ('completed in under 0.2 seconds', §2.5). There are no stochastic runs to report variance across." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": false, 74 "justification": "No baseline systems are compared against. The paper only reports its own framework's performance. Related work discusses PICARD, Synchromesh, LLM-in-the-loop repair, and Structural Trimming but does not compare against them experimentally." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": false, 79 "justification": "No baselines are included, so contemporaneity cannot be assessed." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": false, 84 "justification": "The framework has multiple components (AST parser, KB, validation rules for unknown API/bare calls/semantic inconsistency, correction module) but no ablation study isolating their contributions." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Reports precision, recall, F1-score for detection, and fix accuracy for correction (Tables 1-4, §3)." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": false, 94 "justification": "No human evaluation of the corrections is mentioned. 
Evaluation is entirely automated. Human evaluation would be relevant to assess whether corrections are semantically appropriate." 95 }, 96 "held_out_test_set": { 97 "applies": false, 98 "answer": false, 99 "justification": "The system is not trained/tuned — it is a deterministic rule-based framework. There is no training/validation/test split concern." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Tables 3 and 4 provide breakdowns by KCH type (Missing Imports, Mis-typed API Calls, Contextual Mismatches) and by library (numpy, pandas, matplotlib, json, requests)." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "§4 provides manual analysis of 37 failed cases (20 false negatives, 17 failed corrections) with specific examples like 'plt.plotx instead of plt.plot' and the surface-typo-vs-semantic-error problem." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Contextual Mismatches had only 33.3% detection rate and 0.0% correction accuracy (Table 3). Pandas had only 56.2% correction accuracy (Table 4). These are clearly negative results." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "Abstract claims of 100% precision, 87.6% recall, 0.934 F1, and 77.0% fix accuracy are all directly supported by results in §3 and Tables 1-4." 122 }, 123 "causal_claims_justified": { 124 "applies": false, 125 "answer": false, 126 "justification": "The paper does not make causal claims. It reports detection/correction performance of a deterministic tool without claiming causal relationships." 
127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The abstract claims the framework offers 'a clear path toward trustworthy code generation' broadly, but it was tested only on 200 Python snippets across 5 libraries. The title says 'LLM-Generated Code' without bounding to Python. §4 acknowledges some limitations but the framing in abstract/title/conclusion overgeneralizes." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "No alternative explanations for the results are discussed. For instance, the high precision could partly reflect the simplicity of the curated dataset rather than inherent framework reliability." 137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": false, 141 "justification": "The paper measures detection/correction on a curated dataset but frames results as 'trustworthy code generation' and 'reliable alternative to probabilistic repair' without discussing the gap between curated-dataset performance and real-world code generation trustworthiness." 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": false, 148 "justification": "The dataset was generated using 'GPT-5' (§2.6) but no version, snapshot date, or API version is specified." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": false, 153 "justification": "Dataset was generated by 'prompting GPT-5 with task-oriented instructions' (§2.6) but the actual prompts used are not provided." 154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": false, 158 "justification": "No hyperparameters for GPT-5 generation (temperature, top-p, etc.) are reported. The edit-distance threshold for correction is also not specified." 159 }, 160 "scaffolding_described": { 161 "applies": false, 162 "answer": false, 163 "justification": "No agentic scaffolding is used. 
The framework is a deterministic static analysis pipeline." 164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": false, 168 "justification": "The paper states 200 samples were 'curated' from GPT-5 output but does not describe how many were initially generated, what curation criteria were applied, or how the 161/39 hallucinated/clean split was determined." 169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": true, 175 "justification": "§4 (Discussion and Future Work) contains a substantial paragraph beginning 'We must acknowledge the limitations of this study' discussing dataset size, library coverage, single-file analysis, and scope of targeted errors." 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": true, 180 "justification": "§4 discusses specific threats: '200-sample dataset...is not exhaustive', 'error distribution may not reflect real-world prevalence', 'Knowledge Base was limited to five Python libraries', 'does not yet handle multi-module dataflows'." 181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": true, 185 "justification": "§4 explicitly states: 'our approach deliberately targets KCHs and does not attempt to solve more complex, multi-line logical errors' and 'currently focuses on single-file, function-level analysis.'" 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": true, 192 "justification": "Replication package [3] is stated to contain all data, code, and experimental configurations." 193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "§2.6 describes dataset construction: 200 Python samples generated by prompting GPT-5 for 5 target libraries, composed of 161 hallucinated (3 categories) and 39 clean samples." 
198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "No human participants. Data is LLM-generated code snippets from a standard model, not a benchmark requiring recruitment description." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": false, 207 "justification": "The paper does not describe how GPT-5 outputs were selected, filtered, or curated into the final 200. How many were generated initially? What criteria determined inclusion? This is undocumented." 208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": false, 214 "justification": "No funding source is mentioned anywhere in the paper." 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "All four authors are listed with William & Mary affiliation. No product being evaluated is affiliated with the authors." 220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": false, 224 "justification": "No funding is disclosed, so independence cannot be assessed. Absence of disclosure is not evidence of absence of conflict." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests or financial interests statement is present in the paper." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": false, 235 "answer": false, 236 "justification": "The system being evaluated is a deterministic static analysis tool, not a pre-trained model. Contamination of model training data is not relevant to evaluating this framework." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": false, 240 "answer": false, 241 "justification": "The framework is rule-based, not trained. No train/test overlap concern exists." 
242 }, 243 "benchmark_contamination_addressed": { 244 "applies": false, 245 "answer": false, 246 "justification": "The framework is deterministic and not trained on any data. Benchmark contamination is not applicable." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in this study." 254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study." 259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this study." 279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this study." 284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": true, 290 "justification": "§2.5 reports: 'the end-to-end analysis of all 200 samples completed in under 0.2 seconds on a single laptop CPU.'" 291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": true, 295 "justification": "The compute is minimal and stated: under 0.2 seconds on a single laptop CPU for all 200 samples (§2.5)." 296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": false, 301 "answer": false, 302 "justification": "The framework is fully deterministic with no random seeds. No stochastic component exists." 
303 }, 304 "number_of_runs_stated": { 305 "applies": false, 306 "answer": false, 307 "justification": "Deterministic system — a single run always produces the same output. Multiple runs are unnecessary." 308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": false, 312 "justification": "The framework likely has tunable parameters (e.g., edit distance threshold for fuzzy matching) but no hyperparameter search is described." 313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": false, 317 "justification": "No discussion of how design choices (e.g., edit distance thresholds, semantic cue definitions) were selected or validated." 318 }, 319 "multiple_comparison_correction": { 320 "applies": false, 321 "answer": false, 322 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 323 }, 324 "self_comparison_bias_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "The authors evaluate their own system on their own curated dataset without acknowledging the potential bias of designing both the tool and its evaluation data." 328 }, 329 "compute_budget_vs_performance": { 330 "applies": false, 331 "answer": false, 332 "justification": "No comparison with other systems, so compute-matched comparison is not applicable." 333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": false, 337 "justification": "How well the 200-sample curated dataset represents the real-world KCH distribution is not analyzed. §4 briefly acknowledges that the 'error distribution may not reflect real-world prevalence' but does not examine construct validity further." 338 }, 339 "scaffold_confound_addressed": { 340 "applies": false, 341 "answer": false, 342 "justification": "No scaffolding is involved. The system is a standalone static analysis tool."
343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": false, 348 "answer": false, 349 "justification": "The evaluated system is a deterministic rule-based tool, not a trained model. Temporal leakage of training data is not applicable." 350 }, 351 "feature_leakage_addressed": { 352 "applies": false, 353 "answer": false, 354 "justification": "The evaluated system is rule-based, not a trained model. Feature leakage is not applicable." 355 }, 356 "non_independence_addressed": { 357 "applies": false, 358 "answer": false, 359 "justification": "No trained model is evaluated. Non-independence of train/test data is not applicable." 360 }, 361 "leakage_detection_method": { 362 "applies": false, 363 "answer": false, 364 "justification": "No trained model is evaluated. Leakage detection is not applicable." 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "The framework detects KCHs with 100% precision (zero false positives) and 87.6% recall (0.934 F1-score).", 371 "evidence": "Table 1 confusion matrix: 141 TP, 0 FP, 20 FN, 39 TN on 200-sample dataset (§3).", 372 "supported": "strong" 373 }, 374 { 375 "claim": "The framework auto-corrects 77.0% (124/161) of identified hallucinations.", 376 "evidence": "Table 2 shows 124 successfully corrected out of 161 hallucinated samples (§3).", 377 "supported": "strong" 378 }, 379 { 380 "claim": "The deterministic approach is a viable and reliable alternative to probabilistic repair.", 381 "evidence": "Results on the curated dataset, but no direct comparison with probabilistic methods. 
Claim is based on achieving high precision and reasonable correction rate (§3-4).", 382 "supported": "weak" 383 }, 384 { 385 "claim": "Detection was near-perfect for Missing Imports (97.9%) and numpy (100.0%) but lowest for Contextual Mismatches (33.3%) and matplotlib (72.2%).", 386 "evidence": "Tables 3 and 4 provide per-category and per-library breakdowns (§3).", 387 "supported": "strong" 388 } 389 ], 390 "methodology_tags": [ 391 "benchmark-eval" 392 ], 393 "key_findings": "A deterministic AST-based framework for detecting Knowledge Conflicting Hallucinations in LLM-generated code achieves 100% precision and 87.6% recall on a 200-sample curated dataset, with a 77.0% automatic correction rate. Performance varies substantially by error type and library — Missing Imports are nearly perfectly handled (97.9%) while Contextual Mismatches are poorly detected (33.3%) and never corrected. The framework runs in under 0.2 seconds for all 200 samples, demonstrating practical efficiency.", 394 "red_flags": [ 395 { 396 "flag": "Self-curated evaluation dataset", 397 "detail": "The 200-sample dataset was generated and curated by the authors specifically for this tool. No independent or real-world benchmark is used. The dataset construction process (how GPT-5 outputs were selected/filtered) is not fully documented, raising concerns about whether the dataset favors the tool's capabilities." 398 }, 399 { 400 "flag": "No baseline comparisons", 401 "detail": "The paper claims the framework is 'a viable and reliable alternative to probabilistic repair' but does not compare against any existing tool (PICARD, Synchromesh, LLM-in-the-loop repair, Structural Trimming, mypy). The claim is argumentative, not empirical." 402 }, 403 { 404 "flag": "Small and imbalanced dataset", 405 "detail": "200 samples total with only 39 clean (negative) samples and only 3 Contextual Mismatch examples. The 100% precision claim rests on 39 negatives.
The category with worst performance has only 3 samples — too few for meaningful conclusions." 406 }, 407 { 408 "flag": "Overclaiming in title and abstract", 409 "detail": "Title says 'LLM-Generated Code' generically but evaluation is limited to 200 Python snippets across 5 libraries. Abstract claims 'a clear path toward trustworthy code generation' from a narrow evaluation." 410 } 411 ], 412 "cited_papers": [ 413 { 414 "title": "Static Analysis as a Feedback Loop: Enhancing LLM-Generated Code Beyond Correctness", 415 "authors": [ 416 "Scott Blyth", 417 "Sherlock A. Licorish", 418 "Christoph Treude", 419 "Markus Wagner" 420 ], 421 "year": 2025, 422 "arxiv_id": "2508.14419", 423 "relevance": "LLM-in-the-loop repair using static analysis feedback, a direct comparison point for non-deterministic repair approaches." 424 }, 425 { 426 "title": "Mapping the Trust Terrain: LLMs in Software Engineering - Insights and Perspectives", 427 "authors": [ 428 "Dipin Khati", 429 "Yijin Liu", 430 "David N. Palacio", 431 "Yixuan Zhang", 432 "Denys Poshyvanyk" 433 ], 434 "year": 2025, 435 "doi": "10.1145/3771282", 436 "relevance": "Empirical study on developer trust in LLM-generated code, directly relevant to understanding trust erosion from code hallucinations." 437 }, 438 { 439 "title": "Hallucination by Code Generation LLMs: Taxonomy, Benchmarks, Mitigation, and Challenges", 440 "authors": [ 441 "Yunseo Lee", 442 "John Youngeun Song", 443 "Dongsun Kim" 444 ], 445 "year": 2025, 446 "arxiv_id": "2504.20799", 447 "relevance": "Taxonomy of code generation hallucinations with benchmarks, directly relevant to understanding and categorizing LLM code errors." 
448 }, 449 { 450 "title": "Exploring and Evaluating Hallucinations in LLM-Powered Code Generation", 451 "authors": [ 452 "Fang Liu", 453 "Yang Liu", 454 "Lin Shi", 455 "Houkun Huang", 456 "Ruifeng Wang" 457 ], 458 "year": 2024, 459 "arxiv_id": "2404.00971", 460 "relevance": "Defines Knowledge Conflicting Hallucinations (KCHs), the central concept this paper builds upon." 461 }, 462 { 463 "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot", 464 "authors": [ 465 "Sida Peng", 466 "Eirini Kalliamvakou", 467 "Peter Cihon", 468 "Mert Demirer" 469 ], 470 "year": 2023, 471 "arxiv_id": "2302.06590", 472 "relevance": "Key study on Copilot's productivity impact, relevant to understanding the LLM code generation landscape." 473 }, 474 { 475 "title": "Bugs in Large Language Models Generated Code: An Empirical Study", 476 "authors": [ 477 "Florian Tambon", 478 "Arghavan Moradi Dakhel", 479 "Amin Nikanjam", 480 "Foutse Khomh" 481 ], 482 "year": 2024, 483 "arxiv_id": "2403.08937", 484 "relevance": "Empirical study documenting bug patterns in LLM-generated code, establishing the taxonomy this paper targets." 485 }, 486 { 487 "title": "Towards Understanding the Characteristics of Code Generation Errors Made by Large Language Models", 488 "authors": [ 489 "Zhijie Wang", 490 "Zijie Zhou", 491 "Da Song" 492 ], 493 "year": 2025, 494 "arxiv_id": "2406.08731", 495 "relevance": "Characterizes error types in LLM code generation, complementary to the KCH taxonomy." 496 }, 497 { 498 "title": "LLMLOOP: Improving LLM-Generated Code and Tests through Automated Iterative Feedback Loops", 499 "authors": [ 500 "Ravin Ravi", 501 "Dylan Bradshaw", 502 "Stefano Ruberto" 503 ], 504 "year": 2025, 505 "doi": "10.1109/ICSME64153.2025.00109", 506 "relevance": "LLM-in-the-loop iterative repair approach, a non-deterministic alternative to the deterministic approach proposed here." 
507 }, 508 { 509 "title": "Cutting the Root of Hallucination: Structural Trimming for Vulnerability Mitigation in Code LLMs", 510 "authors": [ 511 "Yage Zhang" 512 ], 513 "year": 2025, 514 "relevance": "AST-based pruning approach for code hallucinations — deletion-based rather than correction-based, a direct comparison point." 515 }, 516 { 517 "title": "Fixing Function-Level Code Generation Errors for Foundation Large Language Models", 518 "authors": [ 519 "Hao Wen", 520 "Yueheng Zhu", 521 "Chao Liu" 522 ], 523 "year": 2025, 524 "arxiv_id": "2409.00676", 525 "relevance": "Addresses function-level code generation error fixing, related to the correction task in this paper." 526 } 527 ], 528 "engagement_factors": { 529 "practical_relevance": { 530 "score": 2, 531 "justification": "AST-based hallucination detection for LLM code is directly applicable to developer workflows, though the tool only covers 5 Python libraries currently." 532 }, 533 "surprise_contrarian": { 534 "score": 0, 535 "justification": "The idea that static analysis can catch API misuse is well-understood; the results confirm expectations rather than challenging them." 536 }, 537 "fear_safety": { 538 "score": 0, 539 "justification": "Addresses code correctness rather than safety, security, or misuse concerns." 540 }, 541 "drama_conflict": { 542 "score": 0, 543 "justification": "No controversy or conflict; positions itself as complementary to existing approaches without challenging specific claims." 544 }, 545 "demo_ability": { 546 "score": 1, 547 "justification": "Code is available on GitHub but requires setup with specific libraries and the custom dataset; not a quick-try tool." 548 }, 549 "brand_recognition": { 550 "score": 0, 551 "justification": "From William & Mary's SEMERU lab, not a widely recognized institution in the AI/ML community." 552 } 553 } 554 }