scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20528B)
      1 {
      2   "paper": {
      3     "title": "Designing Empirical Studies on LLM-Based Code Generation: Towards a Reference Framework",
      4     "authors": ["Nathalia Nascimento", "Everton Guimaraes", "Paulo Alencar"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2510.03862"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["survey_methodology"],
     11   "methodology_tags": ["meta-analysis", "theoretical"],
     12   "key_findings": "The paper proposes a bottom-up theoretical framework for designing empirical studies on LLM-based code generation, organized into six core components: Coding Task, Quality and Metrics Evaluation, Empirical Research, Environment, LLM Model, and Generated Output. The framework was grounded in analysis of 13 papers (11 most cited from a 32-paper dataset plus 2 snowballed), with 9 used for construction and 2 for validation. The validation mappings revealed extension opportunities such as formalizing non-determinism and prompt chaining as framework elements.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No source code or repository URL is provided. The framework is presented only as a figure and textual description."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper references a publicly available curated bibliography dataset at doi:10.5281/zenodo.17230476 (reference [1])."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment or dependency specifications are provided. Although the work is primarily conceptual, the paper could still have specified the tooling used for its literature analysis."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No reproduction instructions are provided for replicating the framework construction or paper selection process."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": false,
     39         "answer": false,
     40         "justification": "This is a theoretical framework paper with no quantitative experiments or statistical results."
     41       },
     42       "significance_tests": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "No comparative quantitative claims are made that would require significance tests."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "No quantitative experiments are conducted; the paper proposes a conceptual framework."
     51       },
     52       "sample_size_justified": {
     53         "applies": false,
     54         "answer": false,
     55         "justification": "No experimental sample sizes to justify; this is a framework paper."
     56       },
     57       "variance_reported": {
     58         "applies": false,
     59         "answer": false,
     60         "justification": "No experimental runs producing variance; this is a theoretical/conceptual contribution."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Section 2 (Related Work) compares against four prior frameworks: Schneider et al. [17], Yeo et al. [23], De Martino et al. [4] (PRIMES), and Wagner et al. [20], explaining how the proposed framework differs from each."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "All compared frameworks are from 2024-2025, which is contemporary to this 2025 paper."
     73       },
     74       "ablation_study": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "The framework is a conceptual artifact with no separable components to ablate."
     78       },
     79       "multiple_metrics": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "No quantitative evaluation of the framework is performed; validation is done via qualitative case mappings."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No human evaluation or expert review of the framework's utility or completeness is reported. The validation consists only of the authors mapping two studies to the framework."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Section 6 uses 2 randomly selected studies that were not used in framework construction as held-out validation instances."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Table 1 provides per-component breakdowns of how each validation instance maps to the framework, plus extension opportunities per component."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The validation instances reveal gaps — elements not covered by the framework (e.g., stability as a quality attribute, prompt chaining under prompt engineering), which are discussed as extension opportunities in Section 6 and Table 1."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper identifies framework limitations through the validation mappings — elements that the framework does not yet cover, such as non-determinism analysis and specification conformance metrics."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims the framework 'organizes evaluation around core components' and they 'demonstrate its applicability through representative case mappings.' Both are supported by Figure 1, Table 1, and Section 6."
    115       },
    116       "causal_claims_justified": {
    117         "applies": false,
    118         "answer": false,
    119         "justification": "The paper makes no causal claims. It proposes a framework and demonstrates applicability through case mappings."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title says 'Towards a Reference Framework' suggesting generality, but the framework is grounded in only 9 papers from a single database (ACM DL), with validation on only 2 instances. The paper does not explicitly bound the generalization to LLM code generation studies found in ACM DL."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "No discussion of alternative framework designs, whether different paper selections would yield different components, or whether the bottom-up approach introduces selection bias."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper claims the framework supports 'standardized and comprehensive experimentation' but measures applicability only through two author-performed case mappings, without discussing the gap between mapping coverage and actual standardization utility."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "No LLM models are used in this study; it is a framework proposal paper."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "No prompting is used in this study."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "No model inference or training is performed."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "Section 3 provides the search string and states 75 papers were retrieved, 32 retained after inclusion/exclusion, and 13 selected (11 most cited + 2 snowballed). However, the actual filtering criteria for reducing 75 to 32 are stated only generically. The criteria for 'most cited' selection threshold and snowballing source are not detailed."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated limitations or threats-to-validity section. The paper moves from Section 6 (framework instances) directly to Section 7 (Conclusion) and Section 8 (Future Plans) without discussing limitations."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats to validity are discussed anywhere in the paper."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what the framework does NOT cover or what settings it should NOT be applied to, beyond vague mentions of future extension to other SE tasks in Section 8."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The curated bibliography dataset is available at doi:10.5281/zenodo.17230476 (reference [1]), described as 'publicly available' with 'selection justifications.'"
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 3 describes the search string, database (ACM DL), time range (2023-2025), and selection process (75 retrieved, 32 retained, 13 selected)."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants; data source is a literature search of a standard database."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "The pipeline from 75 to 32 papers lacks detail on how many were excluded at each criterion. The further reduction from 32 to 13 is described as '11 most cited + 2 snowballed' but without stating the citation threshold or snowballing procedure."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding information or acknowledgments section is present in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are listed: Pennsylvania State University and University of Waterloo."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding disclosure, so independence cannot be assessed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "The paper does not evaluate any pre-trained model on a benchmark. It is a framework proposal based on literature analysis."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "No model evaluation on benchmarks is performed."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "No model evaluation on benchmarks is performed."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "This is a theoretical framework paper; no inference or computation is performed."
    284       },
    285       "compute_budget_stated": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "This is a theoretical framework paper with no computational experiments."
    289       }
    290     },
    291     "survey_methodology": {
    292       "prisma_or_structured_protocol": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Section 3 describes a structured search with a boolean query and inclusion/exclusion criteria, but there is no PRISMA flow diagram, no protocol registration, and the filtering steps lack the detail needed for a reproducible systematic protocol. Only 13 papers were analyzed: 11 of the 32 retained papers, selected by citation count rather than a systematic criterion, plus 2 added by snowballing."
    296       },
    297       "quality_assessment_of_sources": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The paper does not assess the methodological quality of the 13 source papers it analyzes. All papers are treated equally in extracting framework components regardless of their rigor."
    301       },
    302       "publication_bias_discussed": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No discussion of publication bias. The selection of '11 most cited papers' actively introduces citation bias (well-cited ≠ rigorous), and this is not acknowledged."
    306       }
    307     }
    308   },
    309   "claims": [
    310     {
    311       "claim": "Empirical evaluation of LLM-based code generation lacks standardization, with studies varying widely in goals, tasks, and metrics.",
    312       "evidence": "Stated in abstract and Section 1 as motivation, supported by the diversity shown across the 13 analyzed papers in Sections 4-5.",
    313       "supported": "moderate"
    314     },
    315     {
    316       "claim": "The proposed framework organizes evaluation around six core components and supports structured experimentation.",
    317       "evidence": "Figure 1 presents the framework structure. Sections 4-5 describe the components. The framework is grounded in analysis of 9 papers and validated on 2 held-out papers (Section 6, Table 1).",
    318       "supported": "moderate"
    319     },
    320     {
    321       "claim": "The framework's applicability is demonstrated through representative case mappings that reveal extension opportunities.",
    322       "evidence": "Table 1 maps two held-out papers (Ouyang et al. [14] and Ren et al. [16]) to framework components, identifying gaps such as non-determinism and prompt chaining. However, having only 2 validation instances drawn from a narrow sample limits the strength of this evidence.",
    323       "supported": "weak"
    324     }
    325   ],
    326   "red_flags": [
    327     {
    328       "flag": "Self-citation dominance",
    329       "detail": "Three of the authors' own papers [8, 11, 12] (plus [13]) are among the 13 papers used to construct and ground the framework. The framework is partly built on the authors' own experimental patterns, which could bias the framework toward their practices rather than community-wide patterns."
    330     },
    331     {
    332       "flag": "Tiny validation sample",
    333       "detail": "The framework is validated on only 2 papers, both from the same 32-paper dataset. This is insufficient to claim generalizability. The paper acknowledges this only implicitly through future work plans."
    334     },
    335     {
    336       "flag": "No limitations section",
    337       "detail": "The paper has no limitations or threats-to-validity discussion despite being a methodology paper about empirical rigor. This is a significant omission given the paper's own subject matter."
    338     },
    339     {
    340       "flag": "Citation-count selection bias",
    341       "detail": "Selecting the '11 most cited papers' for framework construction introduces bias toward well-known, mainstream approaches. Novel or minority experimental designs are systematically excluded. This is not acknowledged."
    342     },
    343     {
    344       "flag": "Single database search",
    345       "detail": "The search was conducted only in ACM Digital Library, excluding IEEE Xplore, arXiv, Scopus, and other major databases and repositories where LLM code generation papers are indexed or published."
    346     }
    347   ],
    348   "cited_papers": [
    349     {
    350       "title": "Guidelines for Empirical Studies in Software Engineering involving Large Language Models",
    351       "authors": ["Sebastian Baltes", "Florian Angermeir", "Chetan Arora"],
    352       "year": 2025,
    353       "relevance": "Meta-research on empirical methodology for LLM studies in software engineering."
    354     },
    355     {
    356       "title": "RMCBench: Benchmarking Large Language Models' Resistance to Malicious Code",
    357       "authors": ["Jiachi Chen", "Qingyuan Zhong", "Yanlin Wang"],
    358       "year": 2024,
    359       "doi": "10.1145/3691620.3695480",
    360       "relevance": "Benchmark for evaluating LLM safety in code generation contexts."
    361     },
    362     {
    363       "title": "Security Weaknesses of Copilot-Generated Code in GitHub Projects: An Empirical Study",
    364       "authors": ["Yujia Fu", "Peng Liang", "Amjed Tahir"],
    365       "year": 2025,
    366       "doi": "10.1145/3716848",
    367       "relevance": "Empirical study of security vulnerabilities in AI-generated code."
    368     },
    369     {
    370       "title": "On the Effectiveness of Large Language Models in Domain-Specific Code Generation",
    371       "authors": ["Xiaodong Gu", "Meng Chen", "Yalan Lin"],
    372       "year": 2025,
    373       "doi": "10.1145/3697012",
    374       "relevance": "Empirical evaluation of LLMs across code generation domains."
    375     },
    376     {
    377       "title": "Bias Testing and Mitigation in LLM-based Code Generation",
    378       "authors": ["Dong Huang", "Jie M. Zhang", "Qingwen Bu"],
    379       "year": 2025,
    380       "doi": "10.1145/3724117",
    381       "relevance": "Addresses bias in LLM code generation, relevant to evaluation methodology."
    382     },
    383     {
    384       "title": "An Empirical Study of the Non-Determinism of ChatGPT in Code Generation",
    385       "authors": ["Shuyin Ouyang", "Jie M. Zhang", "Mark Harman"],
    386       "year": 2025,
    387       "doi": "10.1145/3697010",
    388       "relevance": "Studies non-determinism in LLM code generation — key methodological concern for reproducibility."
    389     },
    390     {
    391       "title": "SALLM: Security Assessment of Generated Code",
    392       "authors": ["Mohammed Latif Siddiq", "Joanna Cecilia da Silva Santos"],
    393       "year": 2024,
    394       "doi": "10.1145/3691621.3694934",
    395       "relevance": "Security assessment framework for LLM-generated code."
    396     },
    397     {
    398       "title": "A Reference Model for Empirically Comparing LLMs with Humans",
    399       "authors": ["Kurt Schneider", "Farnaz Fotrousi", "Rebekka Wohlrab"],
    400       "year": 2025,
    401       "relevance": "Reference model for LLM-vs-human empirical comparisons, directly related framework."
    402     },
    403     {
    404       "title": "Framework for evaluating code generation ability of large language models",
    405       "authors": ["Sangyeop Yeo", "Yu-Seung Ma", "Sang Cheol Kim"],
    406       "year": 2024,
    407       "relevance": "Evaluation framework for LLM code generation with task taxonomy."
    408     },
    409     {
    410       "title": "Towards evaluation guidelines for empirical studies involving LLMs",
    411       "authors": ["Stefan Wagner", "Marvin Muñoz Barón", "Davide Falessi", "Sebastian Baltes"],
    412       "year": 2025,
    413       "relevance": "Guidelines for empirical LLM studies — directly relevant meta-research."
    414     },
    415     {
    416       "title": "CoderUJB: An Executable and Unified Java Benchmark for Practical Programming Scenarios",
    417       "authors": ["Zhengran Zeng", "Yidong Wang", "Rui Xie"],
    418       "year": 2024,
    419       "doi": "10.1145/3650212.3652115",
    420       "relevance": "Unified benchmark for LLM code generation evaluation."
    421     }
    422   ]
    423 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs