scan-v5.json (18629B)
1 { 2 "scan_version": 5, 3 "paper_type": "survey", 4 "paper": { 5 "title": "Designing Empirical Studies on LLM-Based Code Generation: Towards a Reference Framework", 6 "authors": [ 7 "Nathalia Nascimento", 8 "Everton Guimaraes", 9 "Paulo Alencar" 10 ], 11 "year": 2025, 12 "venue": "arXiv", 13 "arxiv_id": "2510.03862" 14 }, 15 "checklist": { 16 "claims_and_evidence": { 17 "abstract_claims_supported": { 18 "applies": true, 19 "answer": true, 20 "justification": "Abstract claims grounding in prior experience ([8,11,12]) and comparative analysis are supported by Section 3's documented search (75 papers, 32 retained, 13 analyzed).", 21 "source": "haiku" 22 }, 23 "causal_claims_justified": { 24 "applies": false, 25 "answer": false, 26 "justification": "This is a framework-design paper, not an empirical study making causal claims about experimental outcomes.", 27 "source": "haiku" 28 }, 29 "generalization_bounded": { 30 "applies": true, 31 "answer": true, 32 "justification": "Framework explicitly scoped to 'LLM-based code generation' studies. Section 8 acknowledges future extension to other SE tasks, defining current boundaries.", 33 "source": "haiku" 34 }, 35 "alternative_explanations_discussed": { 36 "applies": false, 37 "answer": false, 38 "justification": "Framework-design paper with no empirical claims requiring alternative explanations.", 39 "source": "haiku" 40 }, 41 "proxy_outcome_distinction": { 42 "applies": false, 43 "answer": false, 44 "justification": "No empirical claims about measured vs. claimed outcomes; framework paper.", 45 "source": "haiku" 46 } 47 }, 48 "limitations_and_scope": { 49 "limitations_section_present": { 50 "applies": true, 51 "answer": false, 52 "justification": "No dedicated Limitations or Threats-to-Validity section. Section 8 (Future Plans) acknowledges framework needs refinement but doesn't formally assess current limitations.", 53 "source": "haiku" 54 }, 55 "threats_to_validity_specific": { 56 "applies": false, 57 "answer": false, 58 "justification": "Framework-design paper without empirical threats.", 59 "source": "haiku" 60 }, 61 "scope_boundaries_stated": { 62 "applies": true, 63 "answer": true, 64 "justification": "Framework explicitly scoped to LLM-based code generation (title, abstract, introduction). Future extension to other SE tasks is mentioned, defining current boundaries.", 65 "source": "haiku" 66 } 67 }, 68 "conflicts_of_interest": { 69 "funding_disclosed": { 70 "applies": true, 71 "answer": false, 72 "justification": "No funding acknowledgment section or statement present.", 73 "source": "haiku" 74 }, 75 "affiliations_disclosed": { 76 "applies": true, 77 "answer": true, 78 "justification": "All three authors' institutional affiliations clearly listed (Penn State, Waterloo).", 79 "source": "haiku" 80 }, 81 "funder_independent_of_outcome": { 82 "applies": false, 83 "answer": false, 84 "justification": "No funding disclosed.", 85 "source": "haiku" 86 }, 87 "financial_interests_declared": { 88 "applies": true, 89 "answer": false, 90 "justification": "No competing interests statement provided.", 91 "source": "haiku" 92 } 93 }, 94 "scope_and_framing": { 95 "key_terms_defined": { 96 "applies": true, 97 "answer": true, 98 "justification": "Framework components (Problem Sources, Quality Attributes, Metrics, Environment, etc.) explicitly defined in Section 5. Quality attributes grounded in ISO/IEC 25010.", 99 "source": "haiku" 100 }, 101 "intended_contribution_clear": { 102 "applies": true, 103 "answer": true, 104 "justification": "Abstract and introduction explicitly state: 'we propose a theoretical framework for designing and reporting empirical studies on LLM-based code generation.' Contribution is unambiguous.", 105 "source": "haiku" 106 }, 107 "engagement_with_prior_work": { 108 "applies": true, 109 "answer": true, 110 "justification": "Section 2 systematically contrasts this work with Schneider et al., Yeo et al., De Martino et al., and Wagner et al., showing how this framework differs (e.g., 'our approach provides a structured, bottom-up framework').", 111 "source": "haiku" 112 } 113 } 114 }, 115 "type_checklist": { 116 "survey": { 117 "search_and_selection": { 118 "search_strategy_reproducible": { 119 "applies": true, 120 "answer": true, 121 "justification": "Exact boolean search string provided: '((LLM OR LLMs...) AND (\"code generation\"...) AND (empirical AND (compar* OR...)))' in ACM Digital Library.", 122 "source": "haiku" 123 }, 124 "inclusion_exclusion_explicit": { 125 "applies": true, 126 "answer": true, 127 "justification": "Stated explicitly: included 'empirical evaluations of LLMs on code generation tasks'; excluded 'education, user perception, tasks unrelated to code generation, non-empirical position/vision papers.'", 128 "source": "haiku" 129 }, 130 "prisma_or_structured_protocol": { 131 "applies": true, 132 "answer": false, 133 "justification": "No PRISMA checklist, Cochrane protocol, or structured systematic review methodology cited. Approach is ad hoc.", 134 "source": "haiku" 135 }, 136 "search_terms_provided": { 137 "applies": true, 138 "answer": true, 139 "justification": "Full search string provided in Section 3 with all boolean operators and field specifications.", 140 "source": "haiku" 141 }, 142 "databases_listed": { 143 "applies": true, 144 "answer": true, 145 "justification": "ACM Digital Library explicitly named. Only one database searched, limiting comprehensiveness.", 146 "source": "haiku" 147 }, 148 "screening_process_documented": { 149 "applies": true, 150 "answer": true, 151 "justification": "Screening counts documented: 75 initial → 32 retained → 13 analyzed (11 most-cited + 2 snowballed). Counts provided but filtering methodology is sparse.", 152 "source": "haiku" 153 }, 154 "review_scope_justified": { 155 "applies": true, 156 "answer": false, 157 "justification": "No justification for 2023-2025 date range, single-database scope, or why ACM-only (ignoring arXiv, IEEE, others in the field). Scope is stated but not reasoned.", 158 "source": "haiku" 159 } 160 }, 161 "synthesis_quality": { 162 "conflicting_findings_acknowledged": { 163 "applies": false, 164 "answer": false, 165 "justification": "Framework distillation paper, not a synthesis of empirical findings across papers. No discussion of conflicting results or disagreements in the literature.", 166 "source": "haiku" 167 }, 168 "quality_assessment_of_sources": { 169 "applies": true, 170 "answer": false, 171 "justification": "No quality rubric, risk-of-bias assessment, or structured evaluation of the 13 papers analyzed. Selection criterion was 'most cited papers' without methodological appraisal.", 172 "source": "haiku" 173 }, 174 "publication_bias_discussed": { 175 "applies": true, 176 "answer": false, 177 "justification": "No discussion of publication bias, selection effects, negative results, or how 'most cited' criterion may distort the sample.", 178 "source": "haiku" 179 }, 180 "quantitative_synthesis_present": { 181 "applies": true, 182 "answer": false, 183 "justification": "No meta-analysis, vote counting, effect-size aggregation, or quantitative synthesis. Pure qualitative framework extraction.", 184 "source": "haiku" 185 }, 186 "recommendations_supported_by_evidence": { 187 "applies": true, 188 "answer": false, 189 "justification": "Framework components are grounded in the 13 papers' practices but no evidence is provided that using these components improves study quality. Prescriptive but not evidence-based.", 190 "source": "haiku" 191 } 192 } 193 } 194 }, 195 "claims": [ 196 { 197 "claim": "Empirical evaluation of LLM-based code generation lacks standardization, with studies varying widely in goals, tasks, and metrics.", 198 "evidence": "Section 1 identifies fragmentation: 'Studies often adopt ad hoc experimental setups, resulting in limited reproducibility, poor comparability.' Authors cite Baltes et al. on unique LLM challenges (non-determinism, version drift, transparency).", 199 "supported": "strong" 200 }, 201 { 202 "claim": "A bottom-up framework distilled from existing literature can organize core elements of LLM code generation experiments.", 203 "evidence": "Section 3 documents search (75 papers → 32 → 13 analyzed). Section 5 identifies six framework components (Coding Task, Quality/Metrics, Empirical Research, Environment, LLM Model, Generated Output) recurring across studies.", 204 "supported": "moderate" 205 }, 206 { 207 "claim": "The framework is applicable to diverse empirical setups.", 208 "evidence": "Section 6 maps two representative papers (Ouyang et al., Ren et al.) to framework components, showing how it generalizes. But only 2 validation cases are presented.", 209 "supported": "weak" 210 }, 211 { 212 "claim": "Domain-specific quality attributes (correctness, efficiency, bias, security) are critical to LLM code evaluation.", 213 "evidence": "Section 5.3 cites ISO/IEC 25010 and empirical literature to group quality concerns into Functional, Technical, Resource Efficiency, and Ethical/Social categories with examples from [3, 5, 9, 11, 14, 18, 19, 22].", 214 "supported": "strong" 215 }, 216 { 217 "claim": "The framework will evolve into an automated tool for research protocol generation.", 218 "evidence": "Section 8 outlines future plans: 'automatic design of research protocols' where researchers specify domain and GQM, and the tool recommends questions, metrics, and study design. This is a prospective claim, not validated.", 219 "supported": "weak" 220 } 221 ], 222 "methodology_tags": [ 223 "meta-analysis", 224 "case-study" 225 ], 226 "key_findings": "The paper proposes a six-component framework for standardizing empirical studies on LLM-based code generation (Coding Task, Quality/Metrics, Empirical Research, Environment, LLM Model, Generated Output), derived from a search of 75 papers (32 retained, 13 analyzed). The framework identifies recurring elements (problem sources like LeetCode/GitHub, quality attributes like correctness and efficiency, comparative methods) and gaps (non-determinism, prompt chaining, specification adherence) in the literature. Two validation mappings (Ouyang et al., Ren et al.) demonstrate applicability.", 227 "red_flags": [ 228 { 229 "flag": "Small analytical sample", 230 "detail": "Only 13 of 32 retained papers analyzed for framework construction (11 most-cited + 2 snowballed). Risk of citation bias and non-representative sample." 231 }, 232 { 233 "flag": "Framework grounded in authors' own work", 234 "detail": "Framework explicitly grounded in authors' prior papers [8, 11, 12]. Potential self-selection bias; framework components may over-represent authors' methodological choices." 235 }, 236 { 237 "flag": "Limited validation", 238 "detail": "Only 2 papers used to validate framework applicability (Ouyang et al., Ren et al.). Insufficient evidence that framework generalizes broadly." 239 }, 240 { 241 "flag": "No quality assessment of source papers", 242 "detail": "Source papers selected by citation count, not methodological quality. Framework may enshrine poor practices if high-citation papers have weak designs." 243 }, 244 { 245 "flag": "No inter-rater reliability", 246 "detail": "No evidence that multiple reviewers independently extracted framework components from papers and achieved agreement. Single-rater framework construction." 247 }, 248 { 249 "flag": "Missing limitations section", 250 "detail": "No formal Limitations section. Authors acknowledge in Section 8 that analysis is 'preliminary' but do not list current framework limitations." 251 }, 252 { 253 "flag": "Single-database search", 254 "detail": "ACM Digital Library only. Excludes arXiv, IEEE Xplore, Scopus, Google Scholar. Risk of venue bias (may miss domain-specific venues)." 255 } 256 ], 257 "cited_papers": [ 258 { 259 "title": "Guidelines for Empirical Studies in Software Engineering involving Large Language Models", 260 "relevance": "Wagner et al. proposes guidelines for LLM empirical study design; this framework complements by providing structural components." 261 }, 262 { 263 "title": "A Reference Model for Empirically Comparing LLMs with Humans", 264 "relevance": "Schneider et al. addresses human-vs-LLM comparisons; this framework generalizes beyond human baselines." 265 }, 266 { 267 "title": "Framework for evaluating code generation ability of large language models", 268 "relevance": "Yeo et al. proposes task taxonomy and metrics; this framework emphasizes experimental design structure." 269 }, 270 { 271 "title": "An Empirical Study of the Non-Determinism of ChatGPT in Code Generation", 272 "relevance": "Ouyang et al. identifies non-determinism as underexplored; framework validation case demonstrates stability attribute integration." 273 }, 274 { 275 "title": "From Misuse to Mastery: Enhancing Code Generation with Knowledge-Driven AI Chaining", 276 "relevance": "Ren et al. demonstrates prompt chaining for exception handling; framework validation case shows gaps in capturing advanced prompting strategies." 277 }, 278 { 279 "title": "RMCBench: Benchmarking Large Language Models' Resistance to Malicious Code", 280 "relevance": "Chen et al. addresses security/robustness in code generation; exemplifies Ethical/Social quality attribute." 281 }, 282 { 283 "title": "Security Weaknesses of Copilot-Generated Code in GitHub Projects: An Empirical Study", 284 "relevance": "Fu et al. evaluates security risks in generated code; demonstrates need for security quality metrics." 285 } 286 ], 287 "engagement_factors": { 288 "practical_relevance": { 289 "score": 2, 290 "justification": "Framework is intended to guide empirical study design, but practical applicability limited by preliminary nature and lack of tool/template instantiation." 291 }, 292 "surprise_contrarian": { 293 "score": 1, 294 "justification": "Proposes bottom-up framework approach vs. top-down guidelines, but conclusions (fragmentation exists, standardization needed) are widely acknowledged in the literature." 295 }, 296 "fear_safety": { 297 "score": 1, 298 "justification": "Mentions security/bias as quality attributes but does not raise novel safety concerns. Risk discussion is taxonomic, not alarm-raising." 299 }, 300 "drama_conflict": { 301 "score": 0, 302 "justification": "No controversy, debate, or competing viewpoints presented. Consensual framework design paper." 303 }, 304 "demo_ability": { 305 "score": 1, 306 "justification": "Framework is abstract conceptual structure. No interactive tool, no runnable code, no live demo. Cannot be 'tried now.'" 307 }, 308 "brand_recognition": { 309 "score": 1, 310 "justification": "Authors from Penn State and Waterloo (mid-tier institutions). No Nobel laureates or household-name labs. Venues are arXiv (not yet peer-reviewed) and prior CASCON/MSR (mid-tier)." 311 } 312 }, 313 "hn_data": { 314 "threads": [ 315 { 316 "hn_id": "37862039", 317 "title": "PeaTMOSS: Mining Pre-Trained Models in Open-Source Software", 318 "points": 23, 319 "comments": 1, 320 "url": "https://news.ycombinator.com/item?id=37862039", 321 "created_at": "2023-10-12T19:35:57Z" 322 }, 323 { 324 "hn_id": "42333823", 325 "title": "Show HN: Data Connector – Chat with Your Database and APIs", 326 "points": 17, 327 "comments": 0, 328 "url": "https://news.ycombinator.com/item?id=42333823", 329 "created_at": "2024-12-05T23:00:20Z" 330 }, 331 { 332 "hn_id": "45857764", 333 "title": "Tidally Torn: Why the Most Common Stars May Lack Large, Habitable-Zone Moons", 334 "points": 8, 335 "comments": 0, 336 "url": "https://news.ycombinator.com/item?id=45857764", 337 "created_at": "2025-11-08T16:18:41Z" 338 }, 339 { 340 "hn_id": "46210641", 341 "title": "Is Vibe Coding Safe? Benchmarking Vulnerability of Agent-Generated Code", 342 "points": 4, 343 "comments": 1, 344 "url": "https://news.ycombinator.com/item?id=46210641", 345 "created_at": "2025-12-09T21:05:49Z" 346 }, 347 { 348 "hn_id": "46194269", 349 "title": "Is Vibe Coding Safe? Benchmarking Vulnerability of Agent-Generated Code", 350 "points": 3, 351 "comments": 0, 352 "url": "https://news.ycombinator.com/item?id=46194269", 353 "created_at": "2025-12-08T16:29:33Z" 354 }, 355 { 356 "hn_id": "42535956", 357 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 358 "points": 3, 359 "comments": 0, 360 "url": "https://news.ycombinator.com/item?id=42535956", 361 "created_at": "2024-12-28T23:45:41Z" 362 }, 363 { 364 "hn_id": "45683970", 365 "title": "Parse: LLM Driven Schema Optimization for Reliable Entity Extraction", 366 "points": 2, 367 "comments": 0, 368 "url": "https://news.ycombinator.com/item?id=45683970", 369 "created_at": "2025-10-23T16:42:00Z" 370 }, 371 { 372 "hn_id": "47021638", 373 "title": "To ReAct or not to ReAct?", 374 "points": 1, 375 "comments": 0, 376 "url": "https://news.ycombinator.com/item?id=47021638", 377 "created_at": "2026-02-15T06:57:48Z" 378 }, 379 { 380 "hn_id": "46200850", 381 "title": "Benchmarking Vulnerability of Agent-Generated Code in Real-World Tasks", 382 "points": 1, 383 "comments": 0, 384 "url": "https://news.ycombinator.com/item?id=46200850", 385 "created_at": "2025-12-09T03:13:01Z" 386 }, 387 { 388 "hn_id": "43050120", 389 "title": "Understanding Workers' Internal and External Representations of Complex Data", 390 "points": 1, 391 "comments": 0, 392 "url": "https://news.ycombinator.com/item?id=43050120", 393 "created_at": "2025-02-14T16:31:31Z" 394 } 395 ], 396 "top_points": 23, 397 "total_points": 63, 398 "total_comments": 2 399 } 400 }