scan-v4.json (20308B)
1 { 2 "scan_version": 4, 3 "paper_type": "survey", 4 "paper": { 5 "title": "Designing Empirical Studies on LLM-Based Code Generation: Towards a Reference Framework", 6 "authors": [ 7 "Nathalia Nascimento", 8 "Everton Guimaraes", 9 "Paulo Alencar" 10 ], 11 "year": 2025, 12 "venue": "arXiv", 13 "arxiv_id": "2510.03862", 14 "doi": null 15 }, 16 "checklist": { 17 "claims_and_evidence": { 18 "abstract_claims_supported": { 19 "applies": true, 20 "answer": true, 21 "justification": "The abstract claims the framework 'organizes evaluation around core components' and they 'demonstrate its applicability through representative case mappings.' Both are supported by Figure 1, Table 1, and Section 6.", 22 "source": "opus" 23 }, 24 "causal_claims_justified": { 25 "applies": false, 26 "answer": false, 27 "justification": "The paper makes no causal claims. It proposes a framework and demonstrates applicability through case mappings.", 28 "source": "opus" 29 }, 30 "generalization_bounded": { 31 "applies": true, 32 "answer": false, 33 "justification": "The title says 'Towards a Reference Framework' suggesting generality, but the framework is grounded in only 13 papers from a single database (ACM DL), with validation on only 2 instances. 
The paper does not explicitly bound the generalization to LLM code generation studies found in ACM DL.", 34 "source": "opus" 35 }, 36 "alternative_explanations_discussed": { 37 "applies": true, 38 "answer": false, 39 "justification": "No discussion of alternative framework designs, whether different paper selections would yield different components, or whether the bottom-up approach introduces selection bias.", 40 "source": "opus" 41 }, 42 "proxy_outcome_distinction": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper claims the framework supports 'standardized and comprehensive experimentation' but measures applicability only through two author-performed case mappings, without discussing the gap between mapping coverage and actual standardization utility.", 46 "source": "opus" 47 } 48 }, 49 "limitations_and_scope": { 50 "limitations_section_present": { 51 "applies": true, 52 "answer": false, 53 "justification": "There is no dedicated limitations or threats-to-validity section. 
The paper moves from Section 6 (framework instances) directly to Section 7 (Conclusion) and Section 8 (Future Plans) without discussing limitations.", 54 "source": "opus" 55 }, 56 "threats_to_validity_specific": { 57 "applies": true, 58 "answer": false, 59 "justification": "No threats to validity are discussed anywhere in the paper.", 60 "source": "opus" 61 }, 62 "scope_boundaries_stated": { 63 "applies": true, 64 "answer": false, 65 "justification": "The paper does not explicitly state what the framework does NOT cover or what settings it should NOT be applied to, beyond vague mentions of future extension to other SE tasks in Section 8.", 66 "source": "opus" 67 } 68 }, 69 "conflicts_of_interest": { 70 "funding_disclosed": { 71 "applies": true, 72 "answer": false, 73 "justification": "No funding information or acknowledgments section is present in the paper.", 74 "source": "opus" 75 }, 76 "affiliations_disclosed": { 77 "applies": true, 78 "answer": true, 79 "justification": "Author affiliations are listed: Pennsylvania State University and University of Waterloo.", 80 "source": "opus" 81 }, 82 "funder_independent_of_outcome": { 83 "applies": true, 84 "answer": false, 85 "justification": "No funding disclosure, so independence cannot be assessed.", 86 "source": "opus" 87 }, 88 "financial_interests_declared": { 89 "applies": true, 90 "answer": false, 91 "justification": "No competing interests statement is present in the paper.", 92 "source": "opus" 93 } 94 }, 95 "scope_and_framing": { 96 "key_terms_defined": { 97 "applies": true, 98 "answer": false, 99 "justification": "Key terms like 'quality attributes', 'empirical research', and 'reference framework' are used but not formally defined. 
ISO/IEC 25010 is cited for quality attributes but not explained upfront.", 100 "source": "haiku" 101 }, 102 "intended_contribution_clear": { 103 "applies": true, 104 "answer": true, 105 "justification": "Paper explicitly states in abstract and introduction that it proposes 'a theoretical framework for designing and reporting empirical studies on LLM-based code generation' grounded in bottom-up analysis.", 106 "source": "haiku" 107 }, 108 "engagement_with_prior_work": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section 2 systematically contrasts the proposed framework with 4 related frameworks (Schneider et al., Yeo et al., De Martino et al., Wagner et al.), showing how this work is more modular and bottom-up.", 112 "source": "haiku" 113 } 114 } 115 }, 116 "type_checklist": { 117 "survey": { 118 "search_and_selection": { 119 "search_strategy_reproducible": { 120 "applies": true, 121 "answer": true, 122 "justification": "A complete boolean search string is provided (LLM OR ... AND ... AND ...), and the database (ACM Digital Library) and search fields (title/abstract) are specified, enabling reproduction.", 123 "source": "haiku" 124 }, 125 "inclusion_exclusion_explicit": { 126 "applies": true, 127 "answer": true, 128 "justification": "Inclusion criteria stated: empirical evaluations of LLMs on code generation tasks. Exclusions listed: education-focused, user perception studies, non-code tasks, non-empirical papers.", 129 "source": "haiku" 130 }, 131 "prisma_or_structured_protocol": { 132 "applies": true, 133 "answer": false, 134 "justification": "No mention of PRISMA, Cochrane, or other structured review protocol. 
Methodology described informally without reference to established systematic review standards.", 135 "source": "haiku" 136 }, 137 "search_terms_provided": { 138 "applies": true, 139 "answer": true, 140 "justification": "Exact boolean query provided verbatim in research method section with all operators and wildcards specified.", 141 "source": "haiku" 142 }, 143 "databases_listed": { 144 "applies": true, 145 "answer": false, 146 "justification": "Only ACM Digital Library searched. No mention of IEEE Xplore, arXiv, DBLP, or other software engineering venues, limiting coverage of empirical work.", 147 "source": "haiku" 148 }, 149 "screening_process_documented": { 150 "applies": true, 151 "answer": false, 152 "justification": "Flow from 75 retrieved to 32 after criteria to final 13 papers reported, but intermediate stages (title/abstract screening) missing. Selection of '11 most cited + 2 snowballed' not justified.", 153 "source": "haiku" 154 }, 155 "review_scope_justified": { 156 "applies": true, 157 "answer": false, 158 "justification": "Time period (2023-2025) chosen without justification—could favor recent trends or miss foundational papers. Topic scope is motivated but venue selection (single database) not justified.", 159 "source": "haiku" 160 } 161 }, 162 "synthesis_quality": { 163 "conflicting_findings_acknowledged": { 164 "applies": false, 165 "answer": false, 166 "justification": "Not applicable. Paper does not synthesize empirical findings; it extracts methodological patterns from 13 papers to construct a framework. There are therefore no findings that could conflict.", 167 "source": "haiku" 168 }, 169 "quality_assessment_of_sources": { 170 "applies": true, 171 "answer": false, 172 "justification": "No quality rubric or risk-of-bias assessment applied to the 13 papers. 
Framework treats all papers as equally valid methodological exemplars regardless of their own empirical rigor.", 173 "source": "haiku" 174 }, 175 "publication_bias_discussed": { 176 "applies": false, 177 "answer": false, 178 "justification": "Not applicable for framework design survey. Paper extracts patterns from published work without acknowledging that unpublished/rejected studies might use different methodologies.", 179 "source": "haiku" 180 }, 181 "quantitative_synthesis_present": { 182 "applies": false, 183 "answer": false, 184 "justification": "Not applicable. Framework design is qualitative extraction of design patterns. No meta-analysis or vote-counting expected or provided.", 185 "source": "haiku" 186 }, 187 "recommendations_supported_by_evidence": { 188 "applies": true, 189 "answer": false, 190 "justification": "Framework recommendations grounded in 13 papers but validated with only 2 case studies (Instances 1 and 2). No empirical testing of framework's ability to improve study quality.", 191 "source": "haiku" 192 } 193 } 194 } 195 }, 196 "claims": [ 197 { 198 "claim": "Empirical evaluation of LLM-based code generation lacks standardization", 199 "evidence": "Introduction states studies vary widely in goals, tasks, metrics, limiting comparability and reproducibility. Baltes et al. cited for unique LLM challenges.", 200 "supported": "moderate" 201 }, 202 { 203 "claim": "Framework with 6 components can organize empirical design patterns", 204 "evidence": "Figure 1 presents component hierarchy (Coding Task, Quality, Empirical Research, Environment, LLM Model, Output). Section 5 details each with examples.", 205 "supported": "strong" 206 }, 207 { 208 "claim": "Framework can be instantiated for diverse experimental designs", 209 "evidence": "Two case mappings (Ouyang et al. on non-determinism, Ren et al. on prompt chaining) show framework flexibility. 
However, only 2 validation examples provided.", 210 "supported": "moderate" 211 }, 212 { 213 "claim": "Existing frameworks (Yeo, Wagner, Schneider) address subsets but not comprehensive design", 214 "evidence": "Related Work section distinguishes this as more modular and bottom-up. Characterization depends on fair interpretation of prior work.", 215 "supported": "moderate" 216 }, 217 { 218 "claim": "Quality attributes should include functional, technical, resource efficiency, and ethical dimensions", 219 "evidence": "Section 5.3 groups attributes from ISO/IEC 25010 and literature into 4 categories with examples (correctness, complexity, efficiency, bias/security).", 220 "supported": "strong" 221 }, 222 { 223 "claim": "Non-determinism and prompt chaining are underformalized in empirical design", 224 "evidence": "Instance 1 and 2 analysis identifies these as gaps and 'extension opportunities', but framing is speculative rather than evidence-based.", 225 "supported": "weak" 226 } 227 ], 228 "methodology_tags": [ 229 "survey", 230 "case-study", 231 "qualitative" 232 ], 233 "key_findings": "The paper proposes a 6-component framework (Coding Task, Quality/Metrics, Empirical Research, Environment, LLM Model, Output) for standardizing LLM code generation empirical studies, derived from analysis of 13 papers and prior author experience. Two case studies demonstrate framework instantiation (non-determinism analysis and prompt chaining evaluation), revealing potential extensions. The framework is positioned as preliminary; most concrete advances (interactive tool, automation, extension to other SE tasks) are in future work.", 234 "red_flags": [ 235 { 236 "flag": "Limited validation scope", 237 "detail": "Framework derived from 13 papers but validated on only 2 case studies. No empirical test of whether framework improves future study quality." 
238 }, 239 { 240 "flag": "Selection bias in paper choice", 241 "detail": "Selection of '11 most cited papers' heavily biases toward established/popular work. Long-tail papers may use different methodologies unrepresented in framework." 242 }, 243 { 244 "flag": "No quality assessment of source papers", 245 "detail": "Framework treats all 13 papers as equally valid methodological exemplars without assessing their own empirical rigor or identifying which practices are actually good." 246 }, 247 { 248 "flag": "Single database search", 249 "detail": "Only ACM Digital Library searched. Missing arXiv preprints, IEEE Xplore, and other venues limits comprehensiveness of source coverage." 250 }, 251 { 252 "flag": "Unclear framework derivation process", 253 "detail": "Not stated whether component extraction used systematic method (grounded theory) or intuitive thematic analysis. No inter-rater reliability reported for 3 authors." 254 }, 255 { 256 "flag": "Incomplete screening documentation", 257 "detail": "Flow from 75 retrieved → 32 after criteria → 13 final not fully explained. No counts for title/abstract screening stage or detailed justification for final 13 selection." 258 }, 259 { 260 "flag": "Heavy reliance on future work", 261 "detail": "Framework positioned as preliminary. Most concrete advances (interactive tool, automation, extension to other SE tasks) deferred to future plans, not completed." 
262 } 263 ], 264 "cited_papers": [ 265 { 266 "title": "Guidelines for Empirical Studies in Software Engineering involving Large Language Models", 267 "authors": "Baltes et al.", 268 "year": 2025, 269 "relevance": "Directly related guidelines addressing unique LLM challenges (non-determinism, versioning, transparency) in empirical studies" 270 }, 271 { 272 "title": "A Reference Model for Empirically Comparing LLMs with Humans", 273 "authors": "Schneider et al.", 274 "year": 2025, 275 "relevance": "Related framework for LLM evaluation methodology with focus on human-versus-LLM fairness in experimental design" 276 }, 277 { 278 "title": "Framework for evaluating code generation ability of large language models", 279 "authors": "Yeo et al.", 280 "year": 2024, 281 "relevance": "Prior framework proposing taxonomy of task categories, input-output formats, and evaluation metrics for code generation" 282 }, 283 { 284 "title": "Towards evaluation guidelines for empirical studies involving llms", 285 "authors": "Wagner et al.", 286 "year": 2025, 287 "relevance": "Related guidelines classifying study types (LLMs as annotators, judges, subjects) and proposing best practices for reproducibility" 288 }, 289 { 290 "title": "On the Effectiveness of Large Language Models in Domain-Specific Code Generation", 291 "authors": "Gu et al.", 292 "year": 2024, 293 "relevance": "Empirical study demonstrating domain-specific variations in LLM performance (web, data science, ML), supporting motivation for contextual framework" 294 }, 295 { 296 "title": "An Empirical Study of the Non-Determinism of ChatGPT in Code Generation", 297 "authors": "Ouyang et al.", 298 "year": 2025, 299 "relevance": "Instance 1 validation case: evaluates output stability across repeated generations, identifying non-determinism as framework extension" 300 }, 301 { 302 "title": "From Misuse to Mastery: Enhancing Code Generation with Knowledge-Driven AI Chaining", 303 "authors": "Ren et al.", 304 "year": 2024, 305 
"relevance": "Instance 2 validation case: investigates prompt chaining strategies for exception handling, identifying prompt engineering as framework extension" 306 }, 307 { 308 "title": "A framework for using llms for repository mining studies in empirical software engineering", 309 "authors": "De Martino et al.", 310 "year": 2024, 311 "relevance": "Specialized framework (PRIMES) for LLM-based repository mining with practical guidance on prompt engineering and data extraction" 312 } 313 ], 314 "engagement_factors": { 315 "practical_relevance": { 316 "score": 1, 317 "justification": "Framework could eventually guide future study design but is preliminary, unvalidated, and not yet adopted. Most practical value deferred to future work (interactive tool)." 318 }, 319 "surprise_contrarian": { 320 "score": 0, 321 "justification": "Framework is incremental improvement over existing frameworks (Yeo, Wagner, Schneider). No surprising findings or novel methodological breakthroughs presented." 322 }, 323 "fear_safety": { 324 "score": 0, 325 "justification": "No discussion of AI safety, security risks, alignment concerns, or ethical implications. Pure methodological standardization focus." 326 }, 327 "drama_conflict": { 328 "score": 0, 329 "justification": "Dry methodological paper. No controversy, debate, or conflicting perspectives on how empirical studies should be designed." 330 }, 331 "demo_ability": { 332 "score": 1, 333 "justification": "Can show framework instantiations (like the 2 case studies presented) on paper, but no interactive tool, runnable code, or template available. Promised for future." 334 }, 335 "brand_recognition": { 336 "score": 0, 337 "justification": "Authors from Penn State and University of Waterloo, not top-tier AI/SE labs (CMU, Berkeley, Stanford, OpenAI, Google, Meta). Limited institutional brand recognition." 
338 } 339 }, 340 "hn_data": { 341 "threads": [ 342 { 343 "hn_id": "37862039", 344 "title": "PeaTMOSS: Mining Pre-Trained Models in Open-Source Software", 345 "points": 23, 346 "comments": 1, 347 "url": "https://news.ycombinator.com/item?id=37862039", 348 "created_at": "2023-10-12T19:35:57Z" 349 }, 350 { 351 "hn_id": "42333823", 352 "title": "Show HN: Data Connector – Chat with Your Database and APIs", 353 "points": 17, 354 "comments": 0, 355 "url": "https://news.ycombinator.com/item?id=42333823", 356 "created_at": "2024-12-05T23:00:20Z" 357 }, 358 { 359 "hn_id": "45857764", 360 "title": "Tidally Torn: Why the Most Common Stars May Lack Large, Habitable-Zone Moons", 361 "points": 8, 362 "comments": 0, 363 "url": "https://news.ycombinator.com/item?id=45857764", 364 "created_at": "2025-11-08T16:18:41Z" 365 }, 366 { 367 "hn_id": "46210641", 368 "title": "Is Vibe Coding Safe? Benchmarking Vulnerability of Agent-Generated Code", 369 "points": 4, 370 "comments": 1, 371 "url": "https://news.ycombinator.com/item?id=46210641", 372 "created_at": "2025-12-09T21:05:49Z" 373 }, 374 { 375 "hn_id": "46194269", 376 "title": "Is Vibe Coding Safe? 
Benchmarking Vulnerability of Agent-Generated Code", 377 "points": 3, 378 "comments": 0, 379 "url": "https://news.ycombinator.com/item?id=46194269", 380 "created_at": "2025-12-08T16:29:33Z" 381 }, 382 { 383 "hn_id": "42535956", 384 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 385 "points": 3, 386 "comments": 0, 387 "url": "https://news.ycombinator.com/item?id=42535956", 388 "created_at": "2024-12-28T23:45:41Z" 389 }, 390 { 391 "hn_id": "45683970", 392 "title": "Parse: LLM Driven Schema Optimization for Reliable Entity Extraction", 393 "points": 2, 394 "comments": 0, 395 "url": "https://news.ycombinator.com/item?id=45683970", 396 "created_at": "2025-10-23T16:42:00Z" 397 }, 398 { 399 "hn_id": "47021638", 400 "title": "To ReAct or not to ReAct?", 401 "points": 1, 402 "comments": 0, 403 "url": "https://news.ycombinator.com/item?id=47021638", 404 "created_at": "2026-02-15T06:57:48Z" 405 }, 406 { 407 "hn_id": "46200850", 408 "title": "Benchmarking Vulnerability of Agent-Generated Code in Real-World Tasks", 409 "points": 1, 410 "comments": 0, 411 "url": "https://news.ycombinator.com/item?id=46200850", 412 "created_at": "2025-12-09T03:13:01Z" 413 }, 414 { 415 "hn_id": "43050120", 416 "title": "Understanding Workers' Internal and External Representations of Complex Data", 417 "points": 1, 418 "comments": 0, 419 "url": "https://news.ycombinator.com/item?id=43050120", 420 "created_at": "2025-02-14T16:31:31Z" 421 } 422 ], 423 "top_points": 23, 424 "total_points": 63, 425 "total_comments": 2 426 } 427 }