scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24783B)
      1 {
      2   "paper": {
      3     "title": "Collu-Bench: A Benchmark for Predicting Language Model Hallucinations in Code",
      4     "authors": ["Nan Jiang", "Qi Li", "Lin Tan", "Tianyi Zhang"],
      5     "year": 2024,
      6     "venue": "Preprint Under Review",
      7     "arxiv_id": "2410.09997"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No source code repository URL is provided in the paper. The paper provides a HuggingFace dataset link (https://huggingface.co/datasets/lt-asset/collu-bench) but no code repository for the benchmark construction pipeline or prediction experiments."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The dataset is released on HuggingFace at https://huggingface.co/datasets/lt-asset/collu-bench, stated explicitly in Section 1: 'Availability: https://huggingface.co/datasets/lt-asset/collu-bench.'"
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or environment specification is provided. The paper mentions using scikit-learn and tree-sitter but does not provide version details or a reproducible environment setup."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README, or scripts are provided to reproduce the benchmark construction or prediction experiments."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Tables 2-5 report only point accuracy estimates with no confidence intervals, error bars, or uncertainty measures despite using 5-fold cross-validation."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper compares multiple ML and NN models (e.g., 'RF produces higher accuracy than SVC, AB, GB, and MLP') but no statistical significance tests are reported to support these comparative claims."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Results are reported as raw accuracy percentages (e.g., 33.09%, 33.15%) without effect sizes, baseline context for improvement magnitude, or standardized measures."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The benchmark contains 13,234 instances but no justification is given for why this sample size is sufficient. For the human evaluation of error rate, only 100 samples were checked with no power analysis or justification for that number."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Although 5-fold cross-validation is used, no standard deviation, variance, or spread measures across folds are reported. Only single accuracy numbers are shown in Tables 2-5."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Multiple baselines are included: five ML methods (SVC, AB, RF, GB, MLP) and four NN architectures (CNN, GRU, LSTM, Transformer) are compared against each other, plus a simple string matching baseline (57% vs 86% agreement) for the data collection pipeline."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The prediction baselines are all traditional ML/NN methods (Random Forest, SVC, LSTM, CNN, GRU, Transformer). No contemporary LLM-based or specialized hallucination detection methods are used as baselines, even though the paper's related work discusses methods like RARR and Chain-of-Verification."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper conducts implicit ablation through feature analysis: it tests different data split settings (All-in-one, One-per-dataset, One-per-LLM) and examines different feature types (log probabilities, token types). The three data split configurations serve as ablations over data organization."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "Only accuracy is reported as an evaluation metric. No other metrics such as precision, recall, F1, or AUC are used despite this being a prediction task where class imbalance is acknowledged (3:1 downsampling ratio mentioned in Appendix A.5)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Section 4.2 describes a human evaluation where two developers reviewed 100 randomly selected samples to assess the accuracy of the automated hallucination token localization pipeline, finding 86% agreement."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper uses 5-fold cross-validation with 80% training and 20% test data per fold, as described in Section 5."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down per dataset (MBPP, HumanEval, HumanEval-Java, Defects4J, SWE-bench) in Tables 2 and 4, and per LLM in Tables 3 and 5. Token type analysis is also broken down by category in Table 1."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 4.2 discusses the 14 error cases in hallucination labeling and identifies the cause (missing equivalent canonical solutions). Section 6 (Limitation) also discusses failure modes of the automated pipeline."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that overall accuracy is only 22.03-33.15%, explicitly framing this as a challenging task. GPT-4o-mini's unique patterns that cause cross-model prediction to fail (~0% accuracy) are also reported as negative results."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims about 13,234 instances, 11 LLMs, five datasets, 22.03-33.15% accuracy, and the benchmark's features (log probabilities, token types, execution feedback) are all supported by the experimental results in Sections 4 and 5."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper's causal claims are modest and well-supported. Claims like 'LLMs are less confident when hallucinating' are supported by probability distribution analysis (Figure 3). The paper avoids strong causal language, mostly using observational framing."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper is careful about generalization. Section 6 (Limitation) explicitly acknowledges limits in the range of selected LLMs and datasets. The title and claims are specifically about the benchmark, not about hallucination in general."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for its findings. For example, the observation that 'LLMs are less confident when hallucinating' could have alternative explanations (e.g., lower confidence may correlate with harder problems). No such alternatives are considered."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Model families and sizes are specified (e.g., DeepSeekCoder-1.3B/6.7B/33B, StarCoder2-3B/7B/15B, CodeLlama-7B/13B/34B, Llama3-8B, GPT-4o-mini) with release dates in Table 7, but specific model version identifiers or snapshot dates for GPT-4o-mini are not provided. For open-source models, the exact checkpoint versions are not specified."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Full few-shot prompt templates are provided in Appendix A.2 (Figures 4 and 5), including the system prompt, five-shot examples with actual code, and the test sample format for both code generation and automated program repair tasks."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Appendix A.5 reports hyperparameters for the ML models (default scikit-learn) and the NN architectures (number of layers, hidden dimensions, batch size, epochs, optimizer); generation parameters (temperature 0.8 for sampling, greedy decoding for generation) are given in Sections 3.1 and 3.3."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The paper uses direct LLM prompting for code generation/repair and standard ML/NN training for prediction."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 3 describes the full data preprocessing pipeline in detail: diverse canonical solution collection via sampling (Section 3.1), program normalization using tree-sitter (Section 3.1), hallucination token localization (Section 3.3), and additional signal collection (Section 3.4). Filtering criteria and the downsampling ratio (3:1) are also documented."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 6 is a dedicated 'Limitation' section that discusses two specific limitations: errors in automated hallucination token indices and the limited range of LLMs and datasets."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 6 identifies specific threats: (1) the automated pipeline's imperfect accuracy (14% error rate on a 100-sample check), and (2) the restriction to 11 LLMs and 5 datasets, which may not represent all patterns. These are specific to this study rather than generic disclaimers."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6 explicitly states what was not tested: 'There exist lots of different LLMs and code generation or program repair datasets, we select the set of state-of-the-art, widely-used LLMs.' The paper also clarifies its focus on token-level first hallucination prediction rather than all hallucinations."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The full benchmark dataset is available on HuggingFace (https://huggingface.co/datasets/lt-asset/collu-bench), which includes the raw LLM-generated code, log probabilities, token types, and hallucination indices."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 3 describes the data collection in detail: datasets selected (Section 3.2), LLMs used (Section 3.2), generation procedure with greedy decoding and few-shot prompting (Section 3.3), and hallucination token localization through normalization and comparison (Sections 3.1, 3.3)."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants were recruited as subjects. The two developers who reviewed 100 samples in Section 4.2 were for data validation, not as study participants. The data sources are standard public benchmarks."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The full pipeline from dataset selection to final benchmark is documented: Section 3.1 (code equivalence handling and normalization), Section 3.2 (dataset and LLM selection), Section 3.3 (generation and hallucination localization), Section 3.4 (additional signal collection). Table 8 shows instance counts per LLM and dataset."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The Acknowledgement section states: 'This research was supported in part by NSF 1901242 and 2006688 and a CFI fund.'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All authors are from Purdue University, USA, as stated in the author information. They are not affiliated with any of the companies whose models are evaluated."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The funders are NSF (National Science Foundation) and CFI, which are government/academic funding agencies with no financial stake in the evaluation of specific LLMs."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is included in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper lists release dates for the LLMs in Table 7 but does not state their training data cutoff dates. For GPT-4o-mini, even the training data and process are explicitly noted as 'unknown.'"
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "The paper explicitly addresses data leakage for Defects4J by including HumanEval-Java, which was 'transformed from HumanEval to overcome the data leakage threat of Defects4J' (Section 3.2). However, data leakage is only acknowledged as an issue here; it is not comprehensively addressed for all benchmarks."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "HumanEval (2021) and MBPP (2021) were published before most models' training cutoffs and could be in training data. While HumanEval-Java is used to mitigate Defects4J leakage, the paper does not discuss contamination risk for HumanEval and MBPP themselves, despite these being well-known contamination risks."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human subjects study was conducted. The two developers who reviewed 100 samples were performing data validation, not participating as research subjects."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human subjects study was conducted."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human subjects study was conducted."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human subjects study was conducted."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human subjects study was conducted."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human subjects study was conducted."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human subjects study was conducted."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference costs, API costs, or latency figures are reported despite using GPT-4o-mini API and running 11 LLMs for generation. The cost of sampling 100 programs per problem per LLM is not quantified."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No compute budget, GPU hours, or hardware specifications are mentioned despite the significant computational cost of running 11 LLMs across 5 datasets with extensive sampling (100 samples per problem per LLM)."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Collu-Bench includes 13,234 code hallucination instances collected from five datasets and 11 diverse LLMs.",
    286       "evidence": "Table 8 in Appendix A.4 provides detailed counts per LLM and dataset, totaling 13,234 instances.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "LLMs are less confident when hallucinating, as hallucinated tokens have lower probability than correct tokens.",
    291       "evidence": "Figure 3 shows probability distributions of correct vs. hallucinated tokens across all LLMs, datasets, and token types (Section 4.1).",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "LLMs are more likely to hallucinate when generating certain types of tokens such as Keyword, Identifier, and Type Identifier.",
    296       "evidence": "Table 1 (Section 4.1) shows hallucination rates by token type, with Keyword having the highest rate (8.24-15.35%) across most LLMs.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Random forest produces the highest overall accuracy of 33.09% for per-token prediction; LSTM produces the highest overall accuracy of 33.15% for per-sample prediction.",
    301       "evidence": "Tables 2 and 4 (Sections 5.1 and 5.2) show these results under the All-in-one setting.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "GPT-4o-mini has the most unique hallucination pattern compared to other LLMs.",
    306       "evidence": "Tables 3 and 5 show that predictors trained on other LLMs achieve near-0% accuracy on GPT-4o-mini's data, and vice versa. This is consistent across both per-token (RF) and per-sample (LSTM) predictions.",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "The proposed pipeline of sampling diverse canonical solutions and program normalization reduces the error rate from 43% to 14%.",
    311       "evidence": "Section 4.2 reports that simple string/token matching achieves only 57/100 agreement with human developers, while the proposed pipeline achieves 86/100.",
    312       "supported": "moderate"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval"],
    316   "key_findings": "Collu-Bench is a benchmark of 13,234 code hallucination instances from 11 LLMs across 5 datasets, designed for predicting and localizing the first hallucinated token in LLM-generated code. Key findings include that LLMs show lower confidence (log probability) when hallucinating and are more error-prone on Keyword tokens. Preliminary prediction experiments using ML and NN approaches achieve only 22-33% accuracy, indicating the task is challenging. GPT-4o-mini exhibits distinctly different hallucination patterns from open-source models, making cross-model prediction ineffective.",
    317   "red_flags": [
    318     {
    319       "flag": "No variance or uncertainty reported despite cross-validation",
    320       "detail": "All results use 5-fold cross-validation but only report single accuracy numbers with no standard deviation across folds. This makes it impossible to assess the reliability of comparisons between models."
    321     },
    322     {
    323       "flag": "Single metric evaluation",
    324       "detail": "Only accuracy is used to evaluate hallucination prediction despite acknowledged class imbalance (3:1 downsampling ratio). Precision, recall, and F1 would provide a more complete picture of model performance."
    325     },
    326     {
    327       "flag": "Incomplete contamination analysis",
    328       "detail": "HumanEval (2021) and MBPP (2021) are well-known contamination risks for models released in 2023-2024, but the paper does not discuss whether models may have memorized these benchmarks' solutions, which could affect what constitutes a 'hallucination.'"
    329     },
    330     {
    331       "flag": "No compute cost reported for extensive sampling",
    332       "detail": "The pipeline samples 100 programs per problem per LLM to collect canonical solutions, which is computationally expensive, but no cost or compute budget is reported."
    333     }
    334   ],
    335   "cited_papers": [
    336     {
    337       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    338       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"],
    339       "year": 2024,
    340       "relevance": "Major benchmark for project-level automated program repair used as one of the five datasets in Collu-Bench."
    341     },
    342     {
    343       "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    344       "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"],
    345       "year": 2024,
    346       "arxiv_id": "2405.15793",
    347       "relevance": "Prominent agentic approach to software engineering referenced as a key application of LLMs in code."
    348     },
    349     {
    350       "title": "AutoCodeRover: Autonomous Program Improvement",
    351       "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
    352       "year": 2024,
    353       "arxiv_id": "2404.05427",
    354       "relevance": "Autonomous program repair agent relevant to understanding LLM capabilities in code modification."
    355     },
    356     {
    357       "title": "Evaluating Large Language Models Trained on Code",
    358       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    359       "year": 2021,
    360       "relevance": "Introduced the HumanEval benchmark, a foundational dataset used in Collu-Bench and widely used for code generation evaluation."
    361     },
    362     {
    363       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation",
    364       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    365       "year": 2023,
    366       "relevance": "Introduced EvalPlus for rigorous LLM code evaluation, used as the test harness in Collu-Bench."
    367     },
    368     {
    369       "title": "Impact of Code Language Models on Automated Program Repair",
    370       "authors": ["Nan Jiang", "Kevin Liu", "Thibaud Lutellier", "Lin Tan"],
    371       "year": 2023,
    372       "doi": "10.1109/ICSE48619.2023.00125",
    373       "relevance": "Evaluates LLMs for automated program repair at ICSE, directly relevant to understanding LLM coding capabilities and limitations."
    374     },
    375     {
    376       "title": "A Deep Dive into Large Language Models for Automated Bug Localization and Repair",
    377       "authors": ["Soneya Binta Hossain", "Nan Jiang", "Qiang Zhou"],
    378       "year": 2024,
    379       "relevance": "Studies LLMs for bug localization and repair; pointer network approach from this work is adopted for per-sample prediction in Collu-Bench."
    380     },
    381     {
    382       "title": "Exploring and Evaluating Hallucinations in LLM-Powered Code Generation",
    383       "authors": ["Fang Liu"],
    384       "year": 2024,
    385       "arxiv_id": "2404.00971",
    386       "relevance": "HalluCode benchmark for code hallucination taxonomy, a key related work for understanding LLM hallucinations in code."
    387     },
    388     {
    389       "title": "CodeHalu: Code Hallucinations in LLMs Driven by Execution-Based Verification",
    390       "authors": ["Yuchen Tian", "Weixiang Yan", "Qian Yang"],
    391       "year": 2024,
    392       "arxiv_id": "2405.00253",
    393       "relevance": "Another code hallucination benchmark using execution-based verification, directly comparable to Collu-Bench."
    394     },
    395     {
    396       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    397       "authors": ["Daya Guo", "Qihao Zhu"],
    398       "year": 2024,
    399       "arxiv_id": "2401.14196",
    400       "relevance": "One of the main LLM families evaluated in Collu-Bench, relevant to understanding code-specialized LLM capabilities."
    401     },
    402     {
    403       "title": "StarCoder 2 and The Stack v2: The Next Generation",
    404       "authors": ["Anton Lozhkov", "Raymond Li"],
    405       "year": 2024,
    406       "arxiv_id": "2402.19173",
    407       "relevance": "Another major code LLM family evaluated in Collu-Bench."
    408     },
    409     {
    410       "title": "Can LLM Replace Stack Overflow? A Study on Robustness and Reliability of Large Language Model Code Generation",
    411       "authors": ["Li Zhong", "Zilong Wang"],
    412       "year": 2024,
    413       "doi": "10.1609/aaai.v38i19.30185",
    414       "relevance": "Studies API misuse in LLM code generation, relevant to understanding code hallucination patterns."
    415     }
    416   ]
    417 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs