scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23893B)
      1 {
      2   "paper": {
      3     "title": "Code Hallucination",
      4     "authors": [
      5       "Mirza Masfiqur Rahman",
      6       "Ashish Kundu",
      7       "Ramana Kompella",
      8       "Elisa Bertino"
      9     ],
     10     "year": 2024,
     11     "venue": "arXiv preprint",
     12     "arxiv_id": "2407.04831"
     13   },
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL, Zenodo archive, or code release of any kind is mentioned in the paper. The HallTrigger technique is described only in natural language."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No dataset of hallucinated code examples, prompts used, or model outputs is released. The paper describes case studies but does not provide structured data."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specification, dependency list, or setup instructions are provided. The paper only mentions using ChatGPT (GPT-3.5, GPT-4), Gemini, and Copilot via their web/API interfaces."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No reproduction instructions are provided. The prompts used to trigger hallucinations are shown in figures but there is no systematic protocol for reproduction."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No confidence intervals or error bars are reported. The paper presents qualitative case studies without any quantitative metrics or uncertainty measures."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "The paper makes no comparative quantitative claims that would require significance tests. Results are presented as qualitative case studies."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No effect sizes are reported. The paper does not quantify the rate or magnitude of hallucinations beyond qualitative observations (e.g., Table 1 uses symbols for hallucination/no hallucination without counts or rates)."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The number of test cases, prompts, or trials per case study is not stated or justified. It is unclear how many prompts were tried for each case study or how often hallucinations occurred."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance or spread measures are reported. The paper does not indicate how many times each experiment was repeated or what the variation in outcomes was."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No baseline comparison is provided. The paper compares three models against each other (Table 1), but there is no baseline technique for detecting or triggering hallucinations to compare HallTrigger against."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The paper mentions CodeHalu and HalluCode as related work but does not compare HallTrigger against them quantitatively or systematically."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "HallTrigger uses three principles (interactive prompting, meta-prompts, RLHF-based reward prompting) but no ablation study is performed to measure the contribution of each component."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No quantitative metrics are used. The evaluation is entirely qualitative, with case studies showing individual examples of hallucinations. Table 1 uses symbols (hallucination/no hallucination) but no formal metrics."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "The paper implicitly involves manual inspection of model outputs (the authors determined whether outputs were hallucinated), but no formal human evaluation protocol is described. There are no inter-rater reliability measures or structured evaluation criteria."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "This is a qualitative case study paper, not a benchmark evaluation. There is no train/test split structure."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 1 provides a per-case-study breakdown showing which models exhibited hallucination for each of the 9 case types, separating code generation (G) from code analysis (A) tasks."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The entire paper is about failure cases (hallucinations). Cases where models did NOT hallucinate are also noted (e.g., Copilot successfully avoided several hallucination types, marked with open circles in Table 1)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports cases where models did not exhibit hallucination (Table 1 shows multiple 'no hallucination' results for Copilot and Gemini), and notes that syntax errors were not found despite being plausible (Section 5.1, Case 6)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The abstract claims HallTrigger is 'effective' and that 'pervasive LLM hallucination has sheer impact on software development,' but the evidence consists only of 9 qualitative case studies with no quantitative measure of effectiveness or impact. The word 'effective' is not grounded in any metric."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper claims the three dynamic attributes (interactive prompts, meta-prompts, RLHF reward) cause hallucinations, but no controlled experiment isolates these factors. The causal mechanism is conjectured (Section 4: 'We conjecture that this largely instigates the creativity') rather than demonstrated."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The abstract and conclusion claim hallucination is 'pervasive' and 'model-agnostic' based on testing only 3 models with 9 case studies. The title 'Code Hallucination' without qualification suggests generality beyond what was tested. Section 8 claims the approach 'demonstrates that code hallucination is prevalent for all black box large models' based on only 3 models."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for why models produced the observed outputs. For example, Case 3 (memorized LeetCode solutions) could be explained by training data contamination rather than hallucination per se, but this distinction is not explored."
    131       }
    132     },
    133     "setup_transparency": {
    134       "model_versions_specified": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper says 'ChatGPT (OpenAI GPT-3.5, 4)', 'Google Gemini', and 'Microsoft Copilot' without specific version numbers, API endpoints, or snapshot dates. No specific model version like 'gpt-4-0613' is provided."
    138       },
    139       "prompts_provided": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Several actual prompts are shown in the figures (Figures 1-7) and described in the case studies. For example, the exact prompt for Case 1 is shown in Figure 2, and the modified LeetCode prompt for Case 3 is quoted in the text."
    143       },
    144       "hyperparameters_reported": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "For Copilot, the paper states 'we kept the setting to balanced' (Section 3), but no temperature, top-p, or other API parameters are reported for ChatGPT or Gemini. These settings significantly affect output."
    148       },
    149       "scaffolding_described": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "HallTrigger uses sequential multi-turn prompting, meta-prompts, and reward-based prompting (Section 4), but the exact scaffolding/workflow is described only at a high level. The sequence of prompts, decision logic for adapting prompts, and specific meta-prompt templates are not fully specified."
    153       },
    154       "data_preprocessing_documented": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "No data preprocessing is documented. It is unclear how prompts were designed, how many variations were tested, or how representative examples were selected from potentially many attempts."
    158       }
    159     },
    160     "limitations_and_scope": {
    161       "limitations_section_present": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 7 is titled 'Limitations' and discusses two specific limitations: the manual effort required by HallTrigger and the difficulty of remediation."
    165       },
    166       "threats_to_validity_specific": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The Limitations section discusses manual effort and remediation difficulty, but does not address threats to the validity of the findings themselves — e.g., selection bias in case studies, lack of quantification, potential non-reproducibility due to model updates, or the small number of models tested."
    170       },
    171       "scope_boundaries_stated": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound findings to specific models, languages (all examples are Python), or types of code tasks. The conclusion makes broad claims without scoping them."
    175       }
    176     },
    177     "data_integrity": {
    178       "raw_data_available": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "No raw data (full conversation logs, complete model outputs, all attempted prompts) is made available. Only selected examples are shown in figures."
    182       },
    183       "data_collection_described": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The data collection procedure is vaguely described. It is unclear how many prompts were tested, over what time period, how case studies were selected, or what the selection criteria were for the examples shown."
    187       },
    188       "recruitment_methods_described": {
    189         "applies": false,
    190         "answer": false,
    191         "justification": "No human participants were recruited. The study involves the authors prompting LLMs directly."
    192       },
    193       "data_pipeline_documented": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "No data pipeline is documented. The path from prompt design to example selection to Table 1 results is not described. It is unclear how the authors determined whether a model response constituted hallucination versus a correct-but-different solution."
    197       }
    198     },
    199     "conflicts_of_interest": {
    200       "funding_disclosed": {
    201         "applies": true,
    202         "answer": false,
    203         "justification": "No funding acknowledgment or statement is present in the paper."
    204       },
    205       "affiliations_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Author affiliations are clearly listed: three authors from Cisco Research and one from Purdue University. The paper evaluates third-party models (OpenAI, Google, Microsoft) rather than Cisco products, so the affiliation does not create a direct conflict; any indirect competitive interest is assessed under funder_independent_of_outcome."
    209       },
    210       "funder_independent_of_outcome": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding source is disclosed, so independence cannot be assessed. Three of four authors are from Cisco Research, which is a competitor to the companies whose products are being evaluated (OpenAI, Google, Microsoft), creating a potential bias toward finding flaws."
    214       },
    215       "financial_interests_declared": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No competing interests or financial interests statement is present in the paper."
    219       }
    220     },
    221     "contamination": {
    222       "training_cutoff_stated": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "The paper does not evaluate model capability on a benchmark. It tests whether models can be prompted to produce hallucinated code — this is about triggering failure modes, not measuring knowledge. Contamination is not relevant to the claims."
    226       },
    227       "train_test_overlap_discussed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper does not evaluate model performance on a benchmark. However, Case 3 does implicitly involve contamination (LeetCode problems in training data), which the paper actually discusses as a hallucination type rather than a contamination issue."
    231       },
    232       "benchmark_contamination_addressed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No benchmark evaluation is performed. The study triggers hallucinations through crafted prompts, not through benchmark testing."
    236       }
    237     },
    238     "human_studies": {
    239       "pre_registered": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants were involved in this study. The authors manually interacted with LLMs."
    243       },
    244       "irb_or_ethics_approval": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants; no IRB approval needed."
    248       },
    249       "demographics_reported": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "inclusion_exclusion_criteria": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "randomization_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "blinding_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "attrition_reported": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       }
    274     },
    275     "cost_and_practicality": {
    276       "inference_cost_reported": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No inference cost, API cost, or time per experiment is reported, despite the method involving multiple rounds of prompting commercial APIs."
    280       },
    281       "compute_budget_stated": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No computational budget or total cost is reported."
    285       }
    286     }
    287   },
    288   "claims": [
    289     {
    290       "claim": "HallTrigger can effectively trigger arbitrary code hallucinations from black-box LLMs without accessing model parameters or architecture.",
    291       "evidence": "Section 4 describes the HallTrigger framework using three principles (interactive prompting, meta-prompts, RLHF-based reward). Sections 5.1 and 5.2 show 9 case studies. Table 1 shows hallucination results across 3 models. However, 'effectiveness' is not quantified — no success rate, no count of attempts vs. triggered hallucinations.",
    292       "supported": "weak"
    293     },
    294     {
    295       "claim": "Code hallucination is pervasive and model-agnostic across popular black-box LLMs.",
    296       "evidence": "Table 1 shows hallucinations were observed in all three tested models (GPT-4, Gemini, Copilot), but not for every case type on every model: several entries for Copilot and Gemini are marked 'no hallucination'. However, only 3 models were tested with 9 case studies, sample sizes per case are not reported, and 'pervasive' is not quantified.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "LLMs exhibit memorization behavior that causes them to produce solutions matching training data even when requirements are modified.",
    301       "evidence": "Case 3 (Section 5.1) shows GPT-4 generating a solution closely matching the standard LeetCode 'Merge k Sorted Lists' solution despite the output requirement being changed to 'return 5 in a list'. Variable and class names match LeetCode skeleton code.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Models can be repetitively trapped in hallucination loops through sequential prompting.",
    306       "evidence": "Case 8 (Section 5.1) shows Gemini-Advanced repeatedly failing to correctly count lines in generated code, even after correction prompts. Figure 5 illustrates the model claiming 5 lines is 10 lines.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "LLMs fail to identify logical flaws in code that closely resembles known algorithms.",
    311       "evidence": "Case 9 (Section 5.2) shows models failing to detect modified/incorrect logic in merge sort and pair sum code segments, treating them as correct versions of the algorithms.",
    312       "supported": "moderate"
    313     }
    314   ],
    315   "methodology_tags": [
    316     "case-study",
    317     "qualitative"
    318   ],
    319   "key_findings": "The paper presents a taxonomy of 9 types of code hallucination in LLMs and introduces HallTrigger, a technique to deliberately trigger hallucinations using interactive prompting, meta-prompts, and reward-based feedback. Testing on ChatGPT (GPT-3.5/4), Gemini, and Microsoft Copilot, the authors demonstrate that all models are susceptible to various hallucination types including incorrect algorithm suggestions, code bloating, imaginary methods, runtime errors, and failure to detect logical flaws. The results are entirely qualitative with no quantitative metrics of hallucination rates or effectiveness.",
    320   "red_flags": [
    321     {
    322       "flag": "No quantitative evaluation",
    323       "detail": "The entire evaluation consists of 9 qualitative case studies with cherry-picked examples. No hallucination rates, success rates for HallTrigger, or any quantitative metrics are reported. Table 1 uses symbols (hallucination/no hallucination) without counts or frequencies."
    324     },
    325     {
    326       "flag": "Unknown sample size and selection bias",
    327       "detail": "It is completely unclear how many prompts were tested per case study, how many attempts were needed to produce the shown examples, or whether the shown examples are representative. The cases appear to be selected to illustrate specific phenomena without systematic sampling."
    328     },
    329     {
    330       "flag": "Overclaiming from limited evidence",
    331       "detail": "The paper claims hallucination is 'pervasive' and 'model-agnostic' based on 3 models and 9 case studies. The abstract claims 'sheer impact on software development' without any evidence of real-world impact. The conclusion generalizes to 'all black box large models.'"
    332     },
    333     {
    334       "flag": "No model version specification",
    335       "detail": "Models are referred to only as 'ChatGPT (OpenAI GPT-3.5, 4)', 'Google Gemini', and 'Microsoft Copilot', without specific version numbers or snapshot dates. Model behavior changes across versions, so these results may not be reproducible."
    336     },
    337     {
    338       "flag": "Conflation of hallucination with other phenomena",
    339       "detail": "Some cases (e.g., Case 3 memorized LeetCode solutions, Case 6 runtime errors) may be better characterized as training data contamination, overfitting, or ordinary bugs rather than hallucination. The paper's definition of hallucination is broad enough to encompass many distinct failure modes."
    340     },
    341     {
    342       "flag": "Potential competitor bias",
    343       "detail": "Three of four authors are from Cisco Research, evaluating products from competitors (OpenAI, Google, Microsoft). No competing interests statement is provided."
    344     }
    345   ],
    346   "cited_papers": [
    347     {
    348       "title": "Evaluating large language models trained on code",
    349       "authors": ["Mark Chen", "Jerry Tworek"],
    350       "year": 2021,
    351       "arxiv_id": "2107.03374",
    352       "relevance": "Introduces HumanEval benchmark and Codex, foundational work for LLM code generation evaluation."
    353     },
    354     {
    355       "title": "CodeHalu: Code hallucinations in LLMs driven by execution-based verification",
    356       "authors": ["Yuchen Tian", "Weixiang Yan"],
    357       "year": 2024,
    358       "arxiv_id": "2405.00253",
    359       "relevance": "Directly related taxonomy of code hallucination types with execution-based verification methodology."
    360     },
    361     {
    362       "title": "Exploring and evaluating hallucinations in LLM-powered code generation",
    363       "authors": ["Fang Liu", "Yang Liu"],
    364       "year": 2024,
    365       "arxiv_id": "2404.00971",
    366       "relevance": "HalluCode: Related taxonomy and evaluation of hallucination recognition capabilities in code LLMs."
    367     },
    368     {
    369       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    370       "authors": ["Carlos E. Jimenez", "John Yang"],
    371       "year": 2024,
    372       "relevance": "Major benchmark for evaluating LLMs on real software engineering tasks from GitHub issues."
    373     },
    374     {
    375       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    376       "authors": ["Jiawei Liu", "Chunqiu Steven Xia"],
    377       "year": 2023,
    378       "relevance": "EvalPlus framework for rigorous evaluation of LLM code generation correctness, directly relevant to code quality assessment."
    379     },
    380     {
    381       "title": "Large language models for software engineering: A systematic literature review",
    382       "authors": ["Xinyi Hou", "Yanjie Zhao"],
    383       "year": 2024,
    384       "arxiv_id": "2308.10620",
    385       "relevance": "Comprehensive survey of LLMs in software engineering, relevant as a survey-of-surveys candidate."
    386     },
    387     {
    388       "title": "How secure is code generated by ChatGPT?",
    389       "authors": ["Raphaël Khoury", "Anderson R. Avila"],
    390       "year": 2023,
    391       "arxiv_id": "2304.09655",
    392       "relevance": "Evaluates security of ChatGPT-generated code, related to code quality and safety assessment."
    393     },
    394     {
    395       "title": "Hallucination is inevitable: An innate limitation of large language models",
    396       "authors": ["Ziwei Xu", "Sanjay Jain"],
    397       "year": 2024,
    398       "arxiv_id": "2401.11817",
    399       "relevance": "Theoretical work arguing hallucination cannot be completely eliminated from LLMs, foundational for understanding LLM limitations."
    400     },
    401     {
    402       "title": "INSIDE: LLMs' internal states retain the power of hallucination detection",
    403       "authors": ["Chao Chen", "Kai Liu"],
    404       "year": 2024,
    405       "relevance": "Hallucination detection method using LLM internal states, relevant to AI safety and reliability."
    406     },
    407     {
    408       "title": "Generate and pray: Using SALLMS to evaluate the security of LLM generated code",
    409       "authors": ["Mohammed Latif Siddiq", "Joanna C. S. Santos"],
    410       "year": 2023,
    411       "arxiv_id": "2311.00889",
    412       "relevance": "Security evaluation framework for LLM-generated code, related to code safety assessment methodology."
    413     },
    414     {
    415       "title": "Code security vulnerability repair using reinforcement learning with large language models",
    416       "authors": ["Nafis Tanveer Islam", "Mohammad Bahrami Karkevandi"],
    417       "year": 2024,
    418       "arxiv_id": "2401.07031",
    419       "relevance": "Uses LLMs for vulnerability repair, relevant to LLM-based code repair capabilities."
    420     }
    421   ]
    422 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs