ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (18629B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "Designing Empirical Studies on LLM-Based Code Generation: Towards a Reference Framework",
      6     "authors": [
      7       "Nathalia Nascimento",
      8       "Everton Guimaraes",
      9       "Paulo Alencar"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv",
     13     "arxiv_id": "2510.03862"
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The abstract's claims of grounding in the authors' prior experience ([8,11,12]) and in a comparative analysis of the literature are supported by Section 3's documented search (75 papers, 32 retained, 13 analyzed).",
     21         "source": "haiku"
     22       },
     23       "causal_claims_justified": {
     24         "applies": false,
     25         "answer": false,
     26         "justification": "This is a framework-design paper, not an empirical study making causal claims about experimental outcomes.",
     27         "source": "haiku"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Framework explicitly scoped to 'LLM-based code generation' studies. Section 8 acknowledges future extension to other SE tasks, defining current boundaries.",
     33         "source": "haiku"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": false,
     37         "answer": false,
     38         "justification": "Framework-design paper with no empirical claims requiring alternative explanations.",
     39         "source": "haiku"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": false,
     43         "answer": false,
     44         "justification": "No empirical claims about measured vs. claimed outcomes; framework paper.",
     45         "source": "haiku"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No dedicated Limitations or Threats-to-Validity section. Section 8 (Future Plans) acknowledges framework needs refinement but doesn't formally assess current limitations.",
     53         "source": "haiku"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": false,
     57         "answer": false,
     58         "justification": "Framework-design paper without empirical threats.",
     59         "source": "haiku"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Framework explicitly scoped to LLM-based code generation (title, abstract, introduction). Future extension to other SE tasks is mentioned, defining current boundaries.",
     65         "source": "haiku"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No funding acknowledgment section or statement present.",
     73         "source": "haiku"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "All three authors' institutional affiliations clearly listed (Penn State, Waterloo).",
     79         "source": "haiku"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "No funding disclosed.",
     85         "source": "haiku"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests statement provided.",
     91         "source": "haiku"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Framework components (Problem Sources, Quality Attributes, Metrics, Environment, etc.) explicitly defined in Section 5. Quality attributes grounded in ISO/IEC 25010.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Abstract and introduction explicitly state: 'we propose a theoretical framework for designing and reporting empirical studies on LLM-based code generation.' Contribution is unambiguous.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 2 systematically contrasts this work with Schneider et al., Yeo et al., De Martino et al., and Wagner et al., showing how this framework differs (e.g., 'our approach provides a structured, bottom-up framework').",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "survey": {
    117       "search_and_selection": {
    118         "search_strategy_reproducible": {
    119           "applies": true,
    120           "answer": true,
    121           "justification": "Exact boolean search string provided: '((LLM OR LLMs...) AND (\"code generation\"...) AND (empirical AND (compar* OR...)))' in ACM Digital Library.",
    122           "source": "haiku"
    123         },
    124         "inclusion_exclusion_explicit": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "Stated explicitly: included 'empirical evaluations of LLMs on code generation tasks'; excluded 'education, user perception, tasks unrelated to code generation, non-empirical position/vision papers.'",
    128           "source": "haiku"
    129         },
    130         "prisma_or_structured_protocol": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "No PRISMA checklist, Cochrane protocol, or structured systematic review methodology cited. Approach is ad hoc.",
    134           "source": "haiku"
    135         },
    136         "search_terms_provided": {
    137           "applies": true,
    138           "answer": true,
    139           "justification": "Full search string provided in Section 3 with all boolean operators and field specifications.",
    140           "source": "haiku"
    141         },
    142         "databases_listed": {
    143           "applies": true,
    144           "answer": true,
    145           "justification": "ACM Digital Library explicitly named. Only one database searched, limiting comprehensiveness.",
    146           "source": "haiku"
    147         },
    148         "screening_process_documented": {
    149           "applies": true,
    150           "answer": true,
    151           "justification": "Screening counts documented: 75 initial → 32 retained → 13 analyzed (11 most-cited + 2 snowballed). Counts provided but filtering methodology is sparse.",
    152           "source": "haiku"
    153         },
    154         "review_scope_justified": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "No justification for 2023-2025 date range, single-database scope, or why ACM-only (ignoring arXiv, IEEE, others in the field). Scope is stated but not reasoned.",
    158           "source": "haiku"
    159         }
    160       },
    161       "synthesis_quality": {
    162         "conflicting_findings_acknowledged": {
    163           "applies": false,
    164           "answer": false,
    165           "justification": "Framework distillation paper, not a synthesis of empirical findings across papers. No discussion of conflicting results or disagreements in the literature.",
    166           "source": "haiku"
    167         },
    168         "quality_assessment_of_sources": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "No quality rubric, risk-of-bias assessment, or structured evaluation of the 13 papers analyzed. Selection criterion was 'most cited papers' without methodological appraisal.",
    172           "source": "haiku"
    173         },
    174         "publication_bias_discussed": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "No discussion of publication bias, selection effects, negative results, or how 'most cited' criterion may distort the sample.",
    178           "source": "haiku"
    179         },
    180         "quantitative_synthesis_present": {
    181           "applies": true,
    182           "answer": false,
    183           "justification": "No meta-analysis, vote counting, effect-size aggregation, or quantitative synthesis. Pure qualitative framework extraction.",
    184           "source": "haiku"
    185         },
    186         "recommendations_supported_by_evidence": {
    187           "applies": true,
    188           "answer": false,
    189           "justification": "Framework components are grounded in the 13 papers' practices but no evidence is provided that using these components improves study quality. Prescriptive but not evidence-based.",
    190           "source": "haiku"
    191         }
    192       }
    193     }
    194   },
    195   "claims": [
    196     {
    197       "claim": "Empirical evaluation of LLM-based code generation lacks standardization, with studies varying widely in goals, tasks, and metrics.",
    198       "evidence": "Section 1 identifies fragmentation: 'Studies often adopt ad hoc experimental setups, resulting in limited reproducibility, poor comparability.' Authors cite Baltes et al. on unique LLM challenges (non-determinism, version drift, transparency).",
    199       "supported": "strong"
    200     },
    201     {
    202       "claim": "A bottom-up framework distilled from existing literature can organize core elements of LLM code generation experiments.",
    203       "evidence": "Section 3 documents search (75 papers → 32 → 13 analyzed). Section 5 identifies six framework components (Coding Task, Quality/Metrics, Empirical Research, Environment, LLM Model, Generated Output) recurring across studies.",
    204       "supported": "moderate"
    205     },
    206     {
    207       "claim": "The framework is applicable to diverse empirical setups.",
    208       "evidence": "Section 6 maps two representative papers (Ouyang et al., Ren et al.) to framework components, showing how it generalizes. But only 2 validation cases are presented.",
    209       "supported": "weak"
    210     },
    211     {
    212       "claim": "Domain-specific quality attributes (correctness, efficiency, bias, security) are critical to LLM code evaluation.",
    213       "evidence": "Section 5.3 cites ISO/IEC 25010 and empirical literature to group quality concerns into Functional, Technical, Resource Efficiency, and Ethical/Social categories with examples from [3, 5, 9, 11, 14, 18, 19, 22].",
    214       "supported": "strong"
    215     },
    216     {
    217       "claim": "The framework will evolve into an automated tool for research protocol generation.",
    218       "evidence": "Section 8 outlines future plans: 'automatic design of research protocols' where researchers specify domain and GQM, and the tool recommends questions, metrics, and study design. This is a prospective claim, not validated.",
    219       "supported": "weak"
    220     }
    221   ],
    222   "methodology_tags": [
    223     "meta-analysis",
    224     "case-study"
    225   ],
    226   "key_findings": "The paper proposes a six-component framework for standardizing empirical studies on LLM-based code generation (Coding Task, Quality/Metrics, Empirical Research, Environment, LLM Model, Generated Output), derived from a search of 75 papers (32 retained, 13 analyzed). The framework identifies recurring elements (problem sources like LeetCode/GitHub, quality attributes like correctness and efficiency, comparative methods) and gaps (non-determinism, prompt chaining, specification adherence) in the literature. Two validation mappings (Ouyang et al., Ren et al.) illustrate applicability, though two cases fall short of demonstrating that the framework generalizes broadly.",
    227   "red_flags": [
    228     {
    229       "flag": "Small analytical sample",
    230       "detail": "Only 13 of 32 retained papers analyzed for framework construction (11 most-cited + 2 snowballed). Risk of citation bias and non-representative sample."
    231     },
    232     {
    233       "flag": "Framework grounded in authors' own work",
    234       "detail": "Framework explicitly grounded in authors' prior papers [8, 11, 12]. Potential self-selection bias; framework components may over-represent authors' methodological choices."
    235     },
    236     {
    237       "flag": "Limited validation",
    238       "detail": "Only 2 papers used to validate framework applicability (Ouyang et al., Ren et al.). Insufficient evidence that framework generalizes broadly."
    239     },
    240     {
    241       "flag": "No quality assessment of source papers",
    242       "detail": "Source papers selected by citation count, not methodological quality. Framework may enshrine poor practices if high-citation papers have weak designs."
    243     },
    244     {
    245       "flag": "No inter-rater reliability",
    246       "detail": "No evidence that multiple reviewers independently extracted framework components from papers and achieved agreement. The extraction process is undocumented, so the framework may reflect a single rater's judgment."
    247     },
    248     {
    249       "flag": "Missing limitations section",
    250       "detail": "No formal Limitations section. Authors acknowledge in Section 8 that analysis is 'preliminary' but do not list current framework limitations."
    251     },
    252     {
    253       "flag": "Single-database search",
    254       "detail": "ACM Digital Library only. Excludes arXiv, IEEE Xplore, Scopus, Google Scholar. Risk of venue bias (may miss domain-specific venues)."
    255     }
    256   ],
    257   "cited_papers": [
    258     {
    259       "title": "Guidelines for Empirical Studies in Software Engineering involving Large Language Models",
    260       "relevance": "Wagner et al. proposes guidelines for LLM empirical study design; this framework complements those guidelines by providing structural components."
    261     },
    262     {
    263       "title": "A Reference Model for Empirically Comparing LLMs with Humans",
    264       "relevance": "Schneider et al. addresses human-vs-LLM comparisons; this framework generalizes beyond human baselines."
    265     },
    266     {
    267       "title": "Framework for evaluating code generation ability of large language models",
    268       "relevance": "Yeo et al. proposes task taxonomy and metrics; this framework emphasizes experimental design structure."
    269     },
    270     {
    271       "title": "An Empirical Study of the Non-Determinism of ChatGPT in Code Generation",
    272       "relevance": "Ouyang et al. identifies non-determinism as underexplored; framework validation case demonstrates stability attribute integration."
    273     },
    274     {
    275       "title": "From Misuse to Mastery: Enhancing Code Generation with Knowledge-Driven AI Chaining",
    276       "relevance": "Ren et al. demonstrates prompt chaining for exception handling; framework validation case shows gaps in capturing advanced prompting strategies."
    277     },
    278     {
    279       "title": "RMCBench: Benchmarking Large Language Models' Resistance to Malicious Code",
    280       "relevance": "Chen et al. addresses security/robustness in code generation; exemplifies Ethical/Social quality attribute."
    281     },
    282     {
    283       "title": "Security Weaknesses of Copilot-Generated Code in GitHub Projects: An Empirical Study",
    284       "relevance": "Fu et al. evaluates security risks in generated code; demonstrates need for security quality metrics."
    285     }
    286   ],
    287   "engagement_factors": {
    288     "practical_relevance": {
    289       "score": 2,
    290       "justification": "Framework is intended to guide empirical study design, but practical applicability limited by preliminary nature and lack of tool/template instantiation."
    291     },
    292     "surprise_contrarian": {
    293       "score": 1,
    294       "justification": "Proposes bottom-up framework approach vs. top-down guidelines, but conclusions (fragmentation exists, standardization needed) are widely acknowledged in the literature."
    295     },
    296     "fear_safety": {
    297       "score": 1,
    298       "justification": "Mentions security/bias as quality attributes but does not raise novel safety concerns. Risk discussion is taxonomic, not alarm-raising."
    299     },
    300     "drama_conflict": {
    301       "score": 0,
    302       "justification": "No controversy, debate, or competing viewpoints presented. Consensual framework design paper."
    303     },
    304     "demo_ability": {
    305       "score": 1,
    306       "justification": "Framework is abstract conceptual structure. No interactive tool, no runnable code, no live demo. Cannot be 'tried now.'"
    307     },
    308     "brand_recognition": {
    309       "score": 1,
    310       "justification": "Authors from Penn State and Waterloo (mid-tier institutions). No Nobel laureates or household-name labs. Venues are arXiv (not yet peer-reviewed) and prior CASCON/MSR (mid-tier)."
    311     }
    312   },
    313   "hn_data": {
    314     "threads": [
    315       {
    316         "hn_id": "37862039",
    317         "title": "PeaTMOSS: Mining Pre-Trained Models in Open-Source Software",
    318         "points": 23,
    319         "comments": 1,
    320         "url": "https://news.ycombinator.com/item?id=37862039",
    321         "created_at": "2023-10-12T19:35:57Z"
    322       },
    323       {
    324         "hn_id": "42333823",
    325         "title": "Show HN: Data Connector – Chat with Your Database and APIs",
    326         "points": 17,
    327         "comments": 0,
    328         "url": "https://news.ycombinator.com/item?id=42333823",
    329         "created_at": "2024-12-05T23:00:20Z"
    330       },
    331       {
    332         "hn_id": "45857764",
    333         "title": "Tidally Torn: Why the Most Common Stars May Lack Large, Habitable-Zone Moons",
    334         "points": 8,
    335         "comments": 0,
    336         "url": "https://news.ycombinator.com/item?id=45857764",
    337         "created_at": "2025-11-08T16:18:41Z"
    338       },
    339       {
    340         "hn_id": "46210641",
    341         "title": "Is Vibe Coding Safe? Benchmarking Vulnerability of Agent-Generated Code",
    342         "points": 4,
    343         "comments": 1,
    344         "url": "https://news.ycombinator.com/item?id=46210641",
    345         "created_at": "2025-12-09T21:05:49Z"
    346       },
    347       {
    348         "hn_id": "46194269",
    349         "title": "Is Vibe Coding Safe? Benchmarking Vulnerability of Agent-Generated Code",
    350         "points": 3,
    351         "comments": 0,
    352         "url": "https://news.ycombinator.com/item?id=46194269",
    353         "created_at": "2025-12-08T16:29:33Z"
    354       },
    355       {
    356         "hn_id": "42535956",
    357         "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    358         "points": 3,
    359         "comments": 0,
    360         "url": "https://news.ycombinator.com/item?id=42535956",
    361         "created_at": "2024-12-28T23:45:41Z"
    362       },
    363       {
    364         "hn_id": "45683970",
    365         "title": "Parse: LLM Driven Schema Optimization for Reliable Entity Extraction",
    366         "points": 2,
    367         "comments": 0,
    368         "url": "https://news.ycombinator.com/item?id=45683970",
    369         "created_at": "2025-10-23T16:42:00Z"
    370       },
    371       {
    372         "hn_id": "47021638",
    373         "title": "To ReAct or not to ReAct?",
    374         "points": 1,
    375         "comments": 0,
    376         "url": "https://news.ycombinator.com/item?id=47021638",
    377         "created_at": "2026-02-15T06:57:48Z"
    378       },
    379       {
    380         "hn_id": "46200850",
    381         "title": "Benchmarking Vulnerability of Agent-Generated Code in Real-World Tasks",
    382         "points": 1,
    383         "comments": 0,
    384         "url": "https://news.ycombinator.com/item?id=46200850",
    385         "created_at": "2025-12-09T03:13:01Z"
    386       },
    387       {
    388         "hn_id": "43050120",
    389         "title": "Understanding Workers' Internal and External Representations of Complex Data",
    390         "points": 1,
    391         "comments": 0,
    392         "url": "https://news.ycombinator.com/item?id=43050120",
    393         "created_at": "2025-02-14T16:31:31Z"
    394       }
    395     ],
    396     "top_points": 23,
    397     "total_points": 63,
    398     "total_comments": 2
    399   }
    400 }

Impressum · Datenschutz