scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20272B)
      1 {
      2   "paper": {
      3     "title": "From Requirements to Code: Understanding Developer Practices in LLM-Assisted Software Engineering",
      4     "authors": ["Jonathan Ullrich", "Matthias Koch", "Andreas Vogelsang"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2507.07548"
      8   },
      9   "scan_version": 2,
     10   "active_modules": [],
     11   "methodology_tags": ["qualitative"],
     12   "key_findings": "Practitioners do not use traditional requirements artifacts (user stories, functional requirements) as direct input for LLMs. Instead, they manually decompose requirements into programming tasks, enriching them with design decisions and architectural constraints before prompting. Three interaction patterns emerged: incremental code generation (pair programming style), manual coding with intelligent auto-completion, and extensive code generation. The study proposes a process model and content model for LLM-assisted implementation grounded in 18 practitioner interviews across 14 companies.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No source code is released. The paper provides an interview guide and codebook via a Zenodo replication package (https://zenodo.org/records/15005613), but no analysis code."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "A replication package is provided at Zenodo (https://zenodo.org/records/15005613) containing the interview guide and codebook with traces from in-vivo codes to themes."
     24       },
     25       "environment_specified": {
     26         "applies": false,
     27         "answer": false,
     28         "justification": "This is a qualitative interview study with no computational experiments requiring environment specification."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "While the interview guide and codebook are provided, there are no step-by-step instructions for reproducing the analysis. The coding process is described narratively but not as a reproducible protocol."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": false,
     39         "answer": false,
     40         "justification": "Qualitative interview study with no quantitative results requiring confidence intervals."
     41       },
     42       "significance_tests": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "No quantitative comparisons are made; findings are qualitative themes derived from interviews."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "Qualitative study; no effect sizes are applicable."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The authors state they stopped acquiring new interviewees once 'our theory was validated in subsequent interviews and no new themes emerged' after P15-P18, indicating theoretical saturation was reached."
     56       },
     57       "variance_reported": {
     58         "applies": false,
     59         "answer": false,
     60         "justification": "Qualitative study with no quantitative experimental runs."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": false,
     66         "answer": false,
     67         "justification": "This is a qualitative theory-building study, not a system evaluation. Baselines are not applicable."
     68       },
     69       "baselines_contemporary": {
     70         "applies": false,
     71         "answer": false,
     72         "justification": "No system evaluation requiring baselines."
     73       },
     74       "ablation_study": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "No system with components to ablate."
     78       },
     79       "multiple_metrics": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "No quantitative evaluation metrics; findings are qualitative."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No system outputs to evaluate. The study itself is interview-based research, not a system evaluation."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "No datasets or test sets involved."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Table I provides per-participant breakdowns by role, company size, application domain, requirements type, and interaction mode. Findings are organized by themes (process model, content model, interaction patterns)."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper discusses cases where direct use of requirements failed: 'If I say GPT write a command interface for a Keithley, then I have in fact taken a requirement from my catalog and I've never actually had anything ready to use come out of it' (P10)."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The core finding is a negative result: requirements as traditionally documented are too abstract for direct LLM input. Multiple participants report failed attempts at using requirements directly."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims about requirements being too abstract, needing decomposition into programming tasks, and enrichment with design decisions are all supported by interview quotes in Section IV."
    115       },
    116       "causal_claims_justified": {
    117         "applies": false,
    118         "answer": false,
    119         "justification": "The paper makes no causal claims. It proposes a descriptive theory of current practices based on qualitative interviews."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Section III.D (External validity) acknowledges 'qualitative research does not aim for statistical generalization' and notes the sample's diversity as a mitigation, not a guarantee of generalizability."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper does not discuss alternative explanations for the observed patterns. For instance, the decomposition pattern might reflect tool limitations at interview time rather than inherent necessity, but this is not explored."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper carefully distinguishes what it measures (self-reported developer practices) from broader claims. It frames findings as 'a theory that describes processes' rather than claiming objective measurement of effectiveness."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "The study does not evaluate any AI models; it interviews practitioners about their practices."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "No LLM prompting is performed as part of the research methodology."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "No computational experiments requiring hyperparameter specification."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding used in the research."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section III.C describes the data analysis: exhaustive in-vivo coding producing 179 quotes, iterative clustering into themes, cross-validation by all authors, use of MaxQDA tool, export to Excel. Translation from German to English using DeepL is noted."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section III.D 'Threats to Validity' provides a substantive discussion covering internal, external, and construct validity."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "The threats section discusses study-specific issues: recording and transcribing interviews to ensure accuracy, explaining study goals to participants to minimize misunderstandings, including diverse roles/industries, and cross-validating coding across all authors."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what the results do NOT show. The threats-to-validity section discusses general methodological concerns but does not clearly delineate specific scope boundaries or excluded populations/settings."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "Raw interview transcripts are not released, likely for participant privacy. Only the codebook and interview guide are in the replication package."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section III.B describes interview conduct: 30-60 minute online interviews via Microsoft Teams, recorded, semi-structured guideline, two-part structure, all transcribed, German interviews translated via DeepL."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section III.B states 'We applied convenience sampling using our network to acquire interview partners' and describes selection criteria (experience with code models in implementation phase, diverse roles and domains)."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section III.C documents the pipeline: interviews → transcription → in-vivo coding (179 quotes) → iterative clustering into themes → cross-validation by authors → naming themes → constructing process and content models."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding information is disclosed. Authors are at Fraunhofer IESE (applied research institute) and University of Duisburg-Essen, but no grants or sponsors are mentioned."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: Fraunhofer IESE, Kaiserslautern and University of Duisburg-Essen. No products are being evaluated that would create a conflict."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding is disclosed, so independence cannot be assessed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is provided."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "The study does not evaluate any pre-trained model on a benchmark; it is an interview study about developer practices."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "No model evaluation on benchmarks."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "No benchmark evaluation performed."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "No pre-registration is mentioned for this interview study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "No IRB or ethics board approval is mentioned despite the study involving human interview participants."
    252       },
    253       "demographics_reported": {
    254         "applies": true,
    255         "answer": true,
    256         "justification": "Table I reports participant roles, company sizes, application domains, requirements types used, and interaction modes for all 18 participants."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": true,
    260         "answer": true,
    261         "justification": "Section III.B states participants 'were selected based on their experience with code models in the implementation phase' and describes deliberate inclusion of diverse roles and domains."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "This is a qualitative interview study, not an experimental study with treatment conditions."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "Not an experimental study; blinding is not applicable to qualitative interviews."
    272       },
    273       "attrition_reported": {
    274         "applies": true,
    275         "answer": true,
    276         "justification": "All 18 scheduled participants completed their interviews; no attrition is evident (all 18 are listed in Table I and referenced in findings)."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "Qualitative interview study with no computational method whose cost would be relevant."
    284       },
    285       "compute_budget_stated": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No computational experiments performed."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "Requirements artifacts as traditionally documented are too abstract for direct input into LLMs for code generation.",
    295       "evidence": "Multiple participants (P04, P10) describe failed attempts at using requirements directly. P10: 'I've never actually had anything ready to use come out of it.' Confirmed by P01, P04-P11, P13, P15, P17 (Section IV.A).",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "Practitioners manually decompose requirements into programming tasks before using them as LLM input.",
    300       "evidence": "12 of 18 participants explicitly confirmed this decomposition process (P01, P04-P11, P13, P15, P17). P06 describes structuring the program, deciding on functions and database connections before involving ChatGPT (Section IV.A).",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "Three distinct interaction patterns exist: incremental code generation, manual coding with intelligent auto-completion, and extensive code generation.",
    305       "evidence": "Section IV.A identifies these patterns with specific participant attributions: incremental (most participants), auto-completion (P02, P03, P05, P18), extensive (P04, P08, P13, P14).",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "Design decisions and architectural constraints must be added to prompts for generated code to be usable in existing codebases.",
    310       "evidence": "Content model (Section IV.B, Fig. 2) supported by P04, P05, P06, P08, P11 describing the need for infrastructure, language/library, interface, and unit test context in prompts.",
    311       "supported": "strong"
    312     },
    313     {
    314       "claim": "The decomposition of requirements into programming tasks is not new to LLM-assisted development but mirrors traditional implementation thinking.",
    315       "evidence": "P04, P05, P06 explicitly argue this in Section IV.A. P04: 'I think this is also the classic thought process of how to implement requirements before coding assistants existed.'",
    316       "supported": "moderate"
    317     }
    318   ],
    319   "red_flags": [
    320     {
    321       "flag": "Convenience sampling bias",
    322       "detail": "Participants were recruited through the authors' professional network (convenience sampling), which may not represent the broader developer population. The authors acknowledge this in the threats-to-validity section, but the sample skews toward practitioners connected to a German applied research institute."
    323     },
    324     {
    325       "flag": "Small sample for theory construction",
    326       "detail": "18 interviews across 14 companies. While theoretical saturation is claimed, the sample may not capture practices in large-scale agentic AI development or non-European contexts."
    327     },
    328     {
    329       "flag": "No ethics approval mentioned",
    330       "detail": "The study interviews human participants about their professional practices but does not mention IRB or ethics board approval, which is standard for human subjects research."
    331     }
    332   ],
    333   "cited_papers": [
    334     {
    335       "title": "Evaluating large language models trained on code",
    336       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    337       "year": 2021,
    338       "arxiv_id": "2107.03374",
    339       "relevance": "Codex paper — foundational LLM-for-code model evaluated on HumanEval benchmark."
    340     },
    341     {
    342       "title": "Code llama: Open foundation models for code",
    343       "authors": ["B. Roziere", "J. Gehring", "F. Gloeckle"],
    344       "year": 2023,
    345       "arxiv_id": "2308.12950",
    346       "relevance": "Open-source code LLM used in code generation research and practice."
    347     },
    348     {
    349       "title": "DeepSeek-Coder: When the large language model meets programming",
    350       "authors": ["D. Guo", "Q. Zhu", "D. Yang"],
    351       "year": 2024,
    352       "arxiv_id": "2401.14196",
    353       "relevance": "Major open-source code model evaluated for code intelligence tasks."
    354     },
    355     {
    356       "title": "Navigating the complexity of generative AI adoption in software engineering",
    357       "authors": ["D. Russo"],
    358       "year": 2024,
    359       "relevance": "Studies GenAI adoption in SE workflows; directly relevant to understanding LLM integration challenges."
    360     },
    361     {
    362       "title": "Grounded copilot: How programmers interact with code-generating models",
    363       "authors": ["S. Barke", "M. B. James", "N. Polikarpova"],
    364       "year": 2023,
    365       "relevance": "Identifies developer interaction patterns with Copilot; this paper extends those patterns to the requirements context."
    366     },
    367     {
    368       "title": "Expectation vs. experience: Evaluating the usability of code generation tools powered by large language models",
    369       "authors": ["P. Vaithilingam", "T. Zhang", "E. L. Glassman"],
    370       "year": 2022,
    371       "relevance": "User study evaluating Copilot usability; found it doesn't reduce task completion time but serves as a useful starting point."
    372     },
    373     {
    374       "title": "Productivity assessment of neural code completion",
    375       "authors": ["A. Ziegler", "E. Kalliamvakou", "X. A. Li"],
    376       "year": 2022,
    377       "relevance": "GitHub Copilot productivity assessment — key paper in LLM-assisted developer productivity research."
    378     },
    379     {
    380       "title": "ClarifyGPT: A framework for enhancing LLM-based code generation via requirements clarification",
    381       "authors": ["F. Mu", "L. Shi", "S. Wang"],
    382       "year": 2024,
    383       "relevance": "Attempts to improve LLM code generation through requirements clarification; critiqued in this paper for using programming tasks rather than real requirements."
    384     },
    385     {
    386       "title": "Swe-bench: Can language models resolve real-world github issues?",
    387       "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig"],
    388       "year": 2023,
    389       "arxiv_id": "2310.06770",
    390       "relevance": "Major benchmark for evaluating LLMs on real-world SE tasks."
    391     },
    392     {
    393       "title": "Large language models for software engineering: A systematic literature review",
    394       "authors": ["X. Hou", "Y. Zhao", "Y. Liu"],
    395       "year": 2024,
    396       "relevance": "Comprehensive SLR finding requirements engineering and software design highly underrepresented in LLM-for-SE studies."
    397     }
    398   ]
    399 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs