scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24712B)
      1 {
      2   "paper": {
      3     "title": "Assessing the Answerability of Queries in Retrieval-Augmented Code Generation",
      4     "authors": ["Geonmin Kim", "Jaeyeon Kim", "Hancheol Park", "Wooksu Shin", "Tae-Ho Kim"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2411.05547",
      8     "doi": "10.48550/arXiv.2411.05547"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": true,
     15         "justification": "The paper provides a GitHub URL (https://github.com/Nota-NetsPresso/RaCGEval) in Section 1: 'RaCGEval benchmark dataset and pre-trained answerability assessment models are publicly available.'"
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The RaCGEval benchmark dataset is publicly released at the GitHub URL. The paper also uses publicly available datasets (CoNaLa, TorchDataEval, BeatNumEval, MonkeyEval from Zan et al., 2022)."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section is provided in the paper. The paper mentions using QLoRA and specific models but does not list library versions or dependencies."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No step-by-step reproduction instructions are provided in the paper. While the benchmark and models are released, the paper does not include a README with commands to run or a 'Reproducing Results' section."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Table 2 reports only point estimates of accuracy (e.g., 46.7%) with no confidence intervals or error bars. Figure 4 and Figure 5 also show point estimates without uncertainty quantification."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper makes comparative claims (e.g., fine-tuning improves over zero-shot, ICL improves accuracy) but provides no statistical significance tests. Differences between models and methods are compared by raw numbers only."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper reports raw accuracy percentages (e.g., 31.2%, 33.0%, 46.7%) but does not provide effect sizes with baseline context in a standardized format. While absolute differences can be inferred, no formal effect size measures are reported."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The benchmark has 1,016 total samples (Table 1) but no justification is given for why this size is adequate. There is no power analysis or discussion of whether the sample supports the statistical claims being made."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No variance, standard deviation, or spread measures are reported for any experimental results. It is unclear whether experiments were run multiple times or represent single runs."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper includes multiple baselines: zero-shot inference on three LLMs (gpt-3.5, llama3, gemma), fine-tuned versions, and in-context learning variants. Table 2 and Figure 4 compare these approaches."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The baselines use contemporary models including gpt-3.5-turbo-0613, llama3-instruct-8b, and gemma-1.1-7b-it, all of which were recent at the time of writing (2024)."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Section 5 provides ablation-like analysis: Section 5.1 examines the effect of in-context learning (comparing zero-shot vs. ICL, with and without fine-tuning), and Section 5.2 analyzes the trade-off between coverage and precision. These show which components matter."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper uses accuracy for the classification task (Table 2) and pass@k (k=10) for code generation quality (Figure 5). Coverage vs. precision trade-off is also analyzed."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Section 2.4 describes human annotation by four programming experts, with an additional two experts for inter-annotator agreement validation using Fleiss' Kappa (0.7408). The humans evaluate the benchmark labels, which are part of the system's ground truth."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "The paper does not explicitly describe a train/dev/test split for the RaCGEval benchmark. The training data (Section 3.2.2: 2192 answerable, 2191 unanswerable, 2185 partially answerable from CoNaLa) is separate from the benchmark, but there is no mention of a held-out dev set used for hyperparameter tuning vs. the test benchmark."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Figure 4 provides per-domain breakdowns (NetsPressoEval, TorchDataEval, BeatNumEval, MonkeyEval) for zero-shot, ICL, and fine-tuned settings across multiple LLM backbones."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The Limitations section discusses specific failure cases: unanswerable types not covered, the gap between verification accuracy and pass@k ('the LLM can generate code without the gold API documents, using its prior knowledge'), and the limitations of domain adaptation via random ICL examples."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper reports that zero-shot performance is near chance level (~33%) for all models (Table 2), that 'the highest verification accuracy does not always achieve the best pass@k' (Limitations), and discusses where fine-tuning alone is insufficient without domain adaptation."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims the task is 'very challenging' with baseline models at '46.7%' accuracy. Table 2 confirms this (gemma fine-tuned: 46.7%). The abstract also mentions methods to 'significantly improve performance,' supported by Section 5 (ICL improvements shown in Figure 4)."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper's main causal claims are that fine-tuning and ICL improve accuracy. These are supported by controlled comparisons: same models with/without fine-tuning (Table 2), same models with/without ICL (Figure 4). The ablation design is adequate for these claims."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The abstract and title frame the work broadly as 'Retrieval-Augmented Code Generation' but the benchmark only covers four specific libraries (NetsPresso, TorchData, BeatNum, Monkey), three of which are modified/private versions. The paper does not bound its generalization claims to these specific domains."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper does not substantively discuss alternative explanations for its results. For example, the poor zero-shot performance could be due to prompt design, model unfamiliarity with modified API names (BeatNum, Monkey), or the three-class nature of the task. These alternatives are not considered."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Section 4.1 specifies exact model versions: 'gemma-1.1-7b-it', 'llama3-instruct-8b', and 'gpt-3.5-turbo-0613'. These are specific enough to identify the exact models."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The prompt template is provided in full in Table 4 (Appendix B), and the few-shot examples are provided in Tables 5-8 (Appendix D). The actual prompt text is given, not just a natural language description."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper mentions using QLoRA for fine-tuning but does not report key hyperparameters such as learning rate, number of epochs, LoRA rank, temperature for inference, or other sampling parameters. Section 4.1 states the method but omits these details."
    144       },
    145       "scaffolding_described": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No agentic scaffolding is used. The system is a straightforward classification pipeline: prompt -> LLM -> prediction token."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 2.2 describes the API documentation sources. Section 2.3 describes in detail the three methods for generating partially answerable and unanswerable samples, including similarity thresholds (tau_r = 0.8, tau_i = 0.1) and the CodeT5+ encoder used. Section 3.2 describes training data construction."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "There is a dedicated 'Limitations' section after the Conclusion that discusses three specific limitations."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "The Limitations section raises specific threats: (1) RaCGEval may not cover all real-world unanswerable types, (2) random ICL examples may not be optimal for domain adaptation, (3) verification accuracy and pass@k are not perfectly correlated because LLMs can use prior knowledge. These are specific to this study."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "The paper does not explicitly state what the results do NOT show or what settings are excluded. While the Limitations section discusses potential improvements, it does not delineate clear scope boundaries (e.g., 'our results apply only to these four libraries' or 'we did not test larger models')."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The RaCGEval benchmark dataset is publicly released at the GitHub URL (https://github.com/Nota-NetsPresso/RaCGEval), including pre-trained models."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 2.2 describes the four API documentation sources in detail. Sections 2.3 and 2.4 describe how samples were generated and annotated, including the similarity thresholds and methods used."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "Section 2.4 mentions 'four programming experts' and 'additional two experts' for annotation but does not describe how these annotators were recruited, their qualifications, or potential selection bias. The paper only states they are 'programming experts' without further characterization."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The full pipeline is documented: API documentation sources (Section 2.2), answerable sample construction, three methods for generating unanswerable/partially answerable samples with similarity thresholds (Section 2.3), annotation process with inter-annotator agreement (Section 2.4), and final dataset statistics (Table 1)."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The Acknowledgments section states: 'This work was supported by Artificial intelligence industrial convergence cluster development project funded by the Ministry of Science and ICT(MSIT, Korea) & Gwangju Metropolitan City.'"
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "All authors are listed as affiliated with Nota Inc. The paper header clearly states 'Nota Inc.' with author emails at nota.ai."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The funder is a Korean government project (Ministry of Science and ICT & Gwangju Metropolitan City), which appears independent of the specific outcomes of this benchmark study."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests statement is present. Notably, all authors are from Nota Inc., the company that develops NetsPresso, which is one of the four libraries in the benchmark. This potential conflict is not explicitly acknowledged."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "The paper uses gpt-3.5-turbo-0613, llama3-instruct-8b, and gemma-1.1-7b-it on the benchmark but does not state the training data cutoff dates for any of these models."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "Section 2.2 explicitly addresses this: 'the benchmark dataset includes APIs that most language models are unlikely to have encountered during training.' They use private (NetsPressoEval) or modified (BeatNumEval, MonkeyEval with rephrased keywords) API documentation specifically to avoid overlap."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "The benchmark is designed to mitigate contamination: NetsPressoEval uses private API docs, BeatNumEval and MonkeyEval use 'keywords and structure rephrased' versions of NumPy/Pandas. Section 2.2 explains this was done 'to ensure a fair assessment that excludes prior knowledge of each LLM.'"
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "The human involvement is limited to expert annotation of the benchmark dataset, not a human subjects study. No participant-level data is collected about the annotators."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "The annotators are programming experts performing a professional task (labeling data), not human subjects in a research study."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human subjects study. The annotators are described only as 'programming experts' in their professional capacity."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human subjects study. Annotator selection is a professional staffing decision, not participant recruitment."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human subjects study requiring randomization."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human subjects study requiring blinding."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human subjects study. All 6 annotators completed their annotation tasks."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "The paper motivates the task by mentioning computational cost savings ('reduce computational costs for LLM service providers') but does not report actual inference costs, API costs, tokens consumed, or latency for any of the models evaluated."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No information is provided about GPU hours, training time, total API spend, or hardware used for the experiments. The paper mentions using QLoRA to reduce VRAM but does not quantify the actual compute budget."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "The answerability assessment task in RaCG remains very challenging, with baseline models exhibiting a low performance of 46.7%.",
    287       "evidence": "Table 2 shows the best model (gemma fine-tuned) achieves 46.7% accuracy. Zero-shot baselines range from 31.2% to 36.9%, near the 33.3% chance level for 3-way classification.",
    288       "supported": "strong"
    289     },
    290     {
    291       "claim": "In-context learning substantially improves accuracy compared to zero-shot inference, suggesting domain adaptation is crucial.",
    292       "evidence": "Figure 4 shows ICL-3w1s improves accuracy across all backbones and domains compared to zero-shot, both with and without fine-tuning. The paper states 'the RaCG benchmark prioritizes LLM learning domain information (i.e., by ICL) over learning task information (i.e., by fine-tuning).'",
    293       "supported": "moderate"
    294     },
    295     {
    296       "claim": "Fine-tuning LLMs improves answerability assessment over zero-shot inference.",
    297       "evidence": "Table 2: llama3 improves from 33.0% (zero-shot) to 36.5% (fine-tuned), gemma from 36.9% to 46.7%.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "The RaCGEval benchmark annotation is reliable, with substantial inter-annotator agreement (Fleiss' Kappa = 0.7408).",
    302       "evidence": "Section 2.4 reports Fleiss' Kappa of 0.7408 across three sets of labels from 6 total annotators, interpreted as 'substantial agreement' per Landis & Koch (1977).",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "Introducing answerability assessment creates a controllable trade-off between coverage and code generation precision.",
    307       "evidence": "Figure 5 shows pass@k vs. coverage curves for various verification models, demonstrating that reducing coverage (accepting fewer queries as answerable) increases the precision of generated code.",
    308       "supported": "moderate"
    309     }
    310   ],
    311   "methodology_tags": ["benchmark-eval"],
    312   "key_findings": "The paper introduces RaCGEval, a benchmark for evaluating whether LLMs can determine if a user query is answerable given retrieved API descriptions in retrieval-augmented code generation. Zero-shot LLM performance is near chance level (~33%) on the 3-way classification task, while fine-tuning reaches 46.7%, indicating the task is challenging. In-context learning with domain-specific examples provides larger improvements than task-specific fine-tuning alone, suggesting domain adaptation is the primary bottleneck. The paper also demonstrates a practical coverage-precision trade-off: filtering queries through an answerability model before code generation improves code quality at the cost of coverage.",
    313   "red_flags": [
    314     {
    315       "flag": "Company evaluating its own product",
    316       "detail": "All authors are from Nota Inc., which develops NetsPresso. NetsPressoEval is one of the four benchmark subsets, and the paper promotes NetsPresso API documentation as a use case. This conflict of interest is not explicitly acknowledged."
    317     },
    318     {
    319       "flag": "No error bars or uncertainty quantification",
    320       "detail": "All results (Table 2, Figures 4-5) are reported as point estimates with no confidence intervals, error bars, or indication of variance across runs. It is unclear if experiments were run multiple times."
    321     },
    322     {
    323       "flag": "Missing hyperparameter details",
    324       "detail": "Key training hyperparameters (learning rate, epochs, LoRA rank, inference temperature) are not reported despite using QLoRA fine-tuning, making reproduction difficult."
    325     },
    326     {
    327       "flag": "Obfuscated API names may confound difficulty assessment",
    328       "detail": "BeatNumEval and MonkeyEval are modified versions of NumPy and Pandas with rephrased keywords. While this mitigates contamination, it also means LLMs face an artificially difficult task due to unfamiliar naming rather than genuine reasoning difficulty. This confound is not discussed."
    329     }
    330   ],
    331   "cited_papers": [
    332     {
    333       "title": "Evaluating large language models trained on code",
    334       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    335       "year": 2021,
    336       "arxiv_id": "2107.03374",
    337       "relevance": "Introduced HumanEval and pass@k metric, foundational for LLM code generation evaluation."
    338     },
    339     {
    340       "title": "QLoRA: Efficient Finetuning of Quantized LLMs",
    341       "authors": ["Tim Dettmers", "Artidoro Pagnoni", "Ari Holtzman", "Luke Zettlemoyer"],
    342       "year": 2023,
    343       "relevance": "Parameter-efficient fine-tuning method used as the training approach in this paper's experiments."
    344     },
    345     {
    346       "title": "RAGAS: Automated Evaluation of Retrieval Augmented Generation",
    347       "authors": ["Shahul Es", "Jithin James", "Luis Espinosa Anke", "Steven Schockaert"],
    348       "year": 2024,
    349       "relevance": "Method for evaluating RAG hallucinations, directly compared in the related work as an alternative approach."
    350     },
    351     {
    352       "title": "SelfCheckGPT: Zero-Resource Black-Box Hallucination Detection for Generative Large Language Models",
    353       "authors": ["Potsawee Manakul", "Adian Liusie", "Mark Gales"],
    354       "year": 2023,
    355       "relevance": "Hallucination detection method using response consistency, discussed as a related approach for detecting unreliable LLM outputs."
    356     },
    357     {
    358       "title": "LLatrieval: LLM-Verified Retrieval for Verifiable Generation",
    359       "authors": ["Xiaonan Li", "Changtai Zhu", "Linyang Li"],
    360       "year": 2024,
    361       "relevance": "Provides the prompt template used for zero-shot answerability assessment in this paper's experiments."
    362     },
    363     {
    364       "title": "CodeHalu: Code Hallucinations in LLMs Driven by Execution-Based Verification",
    365       "authors": ["Yuchen Tian", "Weixiang Yan", "Qian Yang"],
    366       "year": 2024,
    367       "arxiv_id": "2405.00253",
    368       "relevance": "Benchmark for evaluating code hallucinations in LLMs, directly relevant to the code generation reliability topic."
    369     },
    370     {
    371       "title": "CodeT5+: Open Code Large Language Models for Code Understanding and Generation",
    372       "authors": ["Yue Wang", "Hung Le", "Akhilesh Gotmare"],
    373       "year": 2023,
    374       "relevance": "Code LLM whose encoder is used for computing semantic similarity between queries and APIs in the benchmark construction."
    375     },
    376     {
    377       "title": "When Language Model Meets Private Library",
    378       "authors": ["Daoguang Zan", "Bei Chen", "Zeqi Lin"],
    379       "year": 2022,
    380       "relevance": "Source of three of the four benchmark datasets (TorchDataEval, BeatNumEval, MonkeyEval) used in RaCGEval."
    381     },
    382     {
    383       "title": "Private-Library-Oriented Code Generation with Large Language Models",
    384       "authors": ["Daoguang Zan"],
    385       "year": 2023,
    386       "arxiv_id": "2307.15370",
    387       "relevance": "Follow-up work on private library code generation, directly relevant to the RaCG task setting."
    388     },
    389     {
    390       "title": "HaluEval: A Large-Scale Hallucination Evaluation Benchmark for Large Language Models",
    391       "authors": ["Junyi Li", "Xiaoxue Cheng", "Xin Zhao"],
    392       "year": 2023,
    393       "relevance": "Large-scale hallucination benchmark for LLMs, related to the broader goal of detecting unreliable LLM outputs."
    394     },
    395     {
    396       "title": "Retrieval Augmentation Reduces Hallucination in Conversation",
    397       "authors": ["Kurt Shuster", "Spencer Poff", "Moya Chen"],
    398       "year": 2021,
    399       "relevance": "Foundational work on RAG showing it reduces hallucination, motivating the RaCG setting studied in this paper."
    400     }
    401   ]
    402 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs