scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22725B)
      1 {
      2   "paper": {
      3     "title": "Towards Advancing Code Generation with Large Language Models: A Research Roadmap",
      4     "authors": [
      5       "Haolin Jin",
      6       "Huaming Chen",
      7       "Qinghua Lu",
      8       "Liming Zhu"
      9     ],
     10     "year": 2025,
     11     "venue": "ACM (conference TBD — placeholder in paper)",
     12     "arxiv_id": "2501.11354"
     13   },
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No code repository or archive is mentioned anywhere in the paper. The paper is a vision/roadmap paper with no implementation, but analysis scripts or supplementary materials could have been released."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No dataset was collected or released. The paper does not collect or use any data of its own; it discusses existing benchmarks (HumanEval, MBPP, APPS) only in passing."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specification is provided. The paper has no experiments and no computational artifacts."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No reproduction instructions are provided. The paper contains no experiments or implementations to reproduce."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "This is a vision/roadmap paper with no experiments or quantitative results. Statistical uncertainty measures are not applicable."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No comparative experiments are conducted in this paper. No claims of difference between systems are made based on measured data."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No empirical results are reported; the paper does not measure any effects."
     52       },
     53       "sample_size_justified": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No data collection or sampling occurs in this paper."
     57       },
     58       "variance_reported": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "No experimental runs are performed; variance reporting is not applicable."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "This is a vision paper proposing a conceptual framework, not an empirical evaluation. No baseline comparisons are applicable."
     69       },
     70       "baselines_contemporary": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "No baselines are used in the paper."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No experiments are run; ablation studies are not applicable to this vision paper."
     79       },
     80       "multiple_metrics": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No empirical evaluation is conducted in this paper."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No system outputs are evaluated; there are no results from any implemented system to assess."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "No data splits are used; this is a vision paper without experiments."
     94       },
     95       "per_category_breakdown": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "No empirical evaluation is conducted."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "Section 3 discusses high-level challenges in LLM code generation (prompt sensitivity, usability, security, evaluation limitations), but these are thematic research directions, not specific failure case analyses. The paper provides no error analysis, no qualitative examples of concrete failures, and no discussion of where a specific approach breaks down with evidence. The challenges are stated at a conceptual level without showing actual failure instances."
    104       },
    105       "negative_results_reported": {
    106         "applies": false,
    107         "answer": false,
    108         "justification": "No experiments are conducted, so there are no experimental results (positive or negative) to report."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims the paper presents a 'six-layer vision framework' and 'actionable recommendations' — both are present in the paper. The abstract does not make unsupported empirical claims; it accurately describes the content as a vision and analysis paper."
    116       },
    117       "causal_claims_justified": {
    118         "applies": false,
    119         "answer": false,
    120         "justification": "The paper makes no causal claims based on new empirical evidence. It discusses existing literature findings at a high level but does not itself make new causal inferences."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper makes broad generalizations about 'LLM-based code generation' without bounding them to specific models, tasks, or settings. For instance, Section 3.1.2 states that 'developers using such plugins do not necessarily see large efficiency gains', citing [41], yet the title and framing suggest these conclusions apply broadly to all LLM-based code generation."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper discusses challenges and suggests solutions, but does not substantively consider alternative explanations for the phenomena it describes. For example, the discussion of 'usability' challenges does not consider whether the studies cited have confounds or selection bias."
    131       }
    132     },
    133     "setup_transparency": {
    134       "model_versions_specified": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "The paper does not run any model evaluations of its own; it discusses GPT-3.5 and GPT-4 only as examples in the literature review, without specifying versions."
    138       },
    139       "prompts_provided": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "The paper does not use prompting as part of any experiment. It discusses prompt strategies conceptually."
    143       },
    144       "hyperparameters_reported": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No experiments are run; no hyperparameters are applicable."
    148       },
    149       "scaffolding_described": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "The paper describes scaffolding architectures conceptually (the six-layer framework), but does not implement or evaluate any scaffolding. No third-party scaffolding is evaluated either."
    153       },
    154       "data_preprocessing_documented": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No data is collected or processed in this paper."
    158       }
    159     },
    160     "limitations_and_scope": {
    161       "limitations_section_present": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "There is no dedicated limitations or threats-to-validity section. The paper ends with a brief conclusion (Section 4) that does not discuss limitations of the vision or roadmap itself."
    165       },
    166       "threats_to_validity_specific": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No threats-to-validity discussion is present. The paper does not reflect on whether its proposed framework or roadmap might be incomplete, biased by the literature it surveys, or otherwise limited."
    170       },
    171       "scope_boundaries_stated": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "The paper does not state explicit scope boundaries. It discusses LLM-based code generation broadly without clarifying what types of systems, tasks, or domains are out of scope."
    175       }
    176     },
    177     "data_integrity": {
    178       "raw_data_available": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "No underlying data is collected or released. The paper references existing literature but does not provide a corpus of papers reviewed or any systematic data about the literature surveyed."
    182       },
    183       "data_collection_described": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The paper does not describe how it selected the papers it discusses. No search strategy, inclusion/exclusion criteria, or literature search methodology is presented."
    187       },
    188       "recruitment_methods_described": {
    189         "applies": false,
    190         "answer": false,
    191         "justification": "No human participants are involved; not a human subjects study."
    192       },
    193       "data_pipeline_documented": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "There is no documented pipeline for how papers were identified, screened, and included. The review appears informal with no systematic methodology described."
    197       }
    198     },
    199     "conflicts_of_interest": {
    200       "funding_disclosed": {
    201         "applies": true,
    202         "answer": false,
    203         "justification": "No acknowledgments section or funding disclosure is present in the paper. The paper lists author affiliations (University of Sydney, CSIRO's Data61) but does not disclose any funding sources."
    204       },
    205       "affiliations_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Author affiliations are clearly listed: Haolin Jin and Huaming Chen at University of Sydney; Qinghua Lu and Liming Zhu at CSIRO's Data61, Australia."
    209       },
    210       "funder_independent_of_outcome": {
    211         "applies": false,
    212         "answer": false,
    213         "justification": "No funding is disclosed, so independence of funder cannot be assessed."
    214       },
    215       "financial_interests_declared": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No competing interests or financial disclosure statement is present in the paper."
    219       }
    220     },
    221     "contamination": {
    222       "training_cutoff_stated": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "The paper does not evaluate any pre-trained model's capabilities on a benchmark. It discusses models and benchmarks conceptually only."
    226       },
    227       "train_test_overlap_discussed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No benchmark evaluation is performed; train/test overlap is not applicable."
    231       },
    232       "benchmark_contamination_addressed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No model is evaluated on any benchmark in this paper."
    236       }
    237     },
    238     "human_studies": {
    239       "pre_registered": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved in this paper."
    243       },
    244       "irb_or_ethics_approval": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this paper."
    248       },
    249       "demographics_reported": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this paper."
    253       },
    254       "inclusion_exclusion_criteria": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this paper."
    258       },
    259       "randomization_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this paper."
    263       },
    264       "blinding_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this paper."
    268       },
    269       "attrition_reported": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants are involved in this paper."
    273       }
    274     },
    275     "cost_and_practicality": {
    276       "inference_cost_reported": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "This is a vision/roadmap paper with no implemented system. Cost reporting is not applicable."
    280       },
    281       "compute_budget_stated": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "This is a vision/roadmap paper with no experiments requiring computational resources."
    285       }
    286     }
    287   },
    288   "claims": [
    289     {
    290       "claim": "LLM-based code generation still faces numerous technical and evaluation challenges, particularly when embedded in real-world development.",
    291       "evidence": "This is asserted in the abstract and elaborated in Section 3, citing multiple prior works on prompt sensitivity [30], usability [41], security [31], and benchmark limitations [25]. No original empirical data is provided.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Most LLM-based code generation studies include planning and self-reasoning steps, frequently employing role assignment and task decomposition.",
    296       "evidence": "Section 2.1 asserts this based on a review of existing frameworks (MetaGPT, ChatDev cited as [13, 32]). The claim is not supported by a systematic literature review with counts or selection criteria.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "Developers using LLM-based IDE plugins do not necessarily see large efficiency gains compared to those who do not.",
    301       "evidence": "Section 3.1.2 cites [41] (Xu et al., 2021) for this claim. The paper does not verify or reproduce this finding; it simply repeats a finding from prior work.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "The proposed six-layer architecture captures the core components of current LLM-based code generation research.",
    306       "evidence": "Section 2 presents the framework and describes how existing works map to it, but no systematic analysis of the coverage of this framework across the literature is provided.",
    307       "supported": "weak"
    308     },
    309     {
    310       "claim": "Existing benchmarks like HumanEval and MBPP focus on function-level tasks that do not reflect real-world software development complexity.",
    311       "evidence": "Section 3.2 asserts this citing HumanEval+ [25] as a source, noting that benchmarks 'focus on function-level or single-file tasks.' This is a widely known limitation acknowledged in the field.",
    312       "supported": "moderate"
    313     }
    314   ],
    315   "methodology_tags": [
    316     "qualitative"
    317   ],
    318   "key_findings": "This paper presents a vision/roadmap for LLM-based code generation, proposing a six-layer conceptual architecture (Requirements, Model Invocation, Planning, Synthesis, Execution, and Refinement layers) and a four-phase workflow (Input, Orchestration, Development, Validation). It identifies three main technical challenges: prompt sensitivity causing non-determinism, usability and consumption concerns in multi-agent frameworks, and code security risks. It also identifies evaluation challenges including the over-reliance on function-level benchmarks and simple pass/fail metrics. The paper makes no original empirical contributions and offers no new experimental evidence.",
    319   "red_flags": [
    320     {
    321       "flag": "No systematic literature methodology",
    322       "detail": "The paper presents itself as a survey/roadmap but does not describe any systematic search strategy, inclusion/exclusion criteria, or paper selection pipeline. It is unclear how the cited papers were chosen, making the coverage unverifiable and potentially cherry-picked."
    323     },
    324     {
    325       "flag": "Unsupported generalizations",
    326       "detail": "The paper makes broad generalizations about the state of LLM-based code generation (e.g., 'most LLM-based code generation studies include' planning steps) without counting studies or providing systematic evidence. These claims are asserted rather than demonstrated."
    327     },
    328     {
    329       "flag": "No limitations or scope boundaries",
    330       "detail": "The paper provides no limitations section and does not acknowledge potential biases in its literature coverage or the scope of its proposed framework. A 10-page roadmap paper that does not acknowledge what it leaves uncovered risks misleading readers about the comprehensiveness of its analysis."
    331     },
    332     {
    333       "flag": "No funding disclosure",
    334       "detail": "Although two of the authors (Qinghua Lu and Liming Zhu) are affiliated with CSIRO's Data61 (a government research agency), the paper discloses no funding source. Authors from publicly funded institutions should disclose grant support."
    335     },
    336     {
    337       "flag": "Informal survey laundering weak signals",
    338       "detail": "The paper aggregates findings from multiple prior works without assessing their methodological quality. Weak or poorly-controlled prior results are cited with the same weight as rigorous ones, potentially laundering low-quality findings into the survey's recommendations."
    339     }
    340   ],
    341   "cited_papers": [
    342     {
    343       "title": "Program synthesis with large language models",
    344       "authors": [
    345         "Jacob Austin",
    346         "Augustus Odena",
    347         "Maxwell Nye",
    348         "Maarten Bosma"
    349       ],
    350       "year": 2021,
    351       "arxiv_id": "2108.07732",
    352       "relevance": "Introduces MBPP benchmark and foundational work on LLM-based program synthesis, directly relevant to evaluating LLM code generation capabilities."
    353     },
    354     {
    355       "title": "Evaluating large language models trained on code",
    356       "authors": [
    357         "Mark Chen",
    358         "Jerry Tworek",
    359         "Heewoo Jun"
    360       ],
    361       "year": 2021,
    362       "arxiv_id": "2107.03374",
    363       "relevance": "Introduces HumanEval benchmark and Codex model, foundational for evaluating LLM code generation."
    364     },
    365     {
    366       "title": "Is your code generated by chatgpt really correct? rigorous evaluation of large language models for code generation",
    367       "authors": [
    368         "Jiawei Liu",
    369         "Chunqiu Steven Xia",
    370         "Yuyao Wang",
    371         "Lingming Zhang"
    372       ],
    373       "year": 2024,
    374       "relevance": "Introduces EvalPlus/HumanEval+, augmenting existing benchmarks with additional test cases to more rigorously evaluate LLM code generation."
    375     },
    376     {
    377       "title": "Metagpt: Meta programming for multi-agent collaborative framework",
    378       "authors": [
    379         "Sirui Hong",
    380         "Xiawu Zheng",
    381         "Jonathan Chen"
    382       ],
    383       "year": 2023,
    384       "arxiv_id": "2308.00352",
    385       "relevance": "Prominent multi-agent LLM framework for software development; directly relevant to agentic code generation methodology."
    386     },
    387     {
    388       "title": "Chatdev: Communicative agents for software development",
    389       "authors": [
    390         "Chen Qian",
    391         "Wei Liu",
    392         "Hongzhang Liu"
    393       ],
    394       "year": 2024,
    395       "relevance": "Multi-agent LLM framework for software development, a key example of agentic code generation that this roadmap discusses."
    396     },
    397     {
    398       "title": "From llms to llm-based agents for software engineering: A survey of current, challenges and future",
    399       "authors": [
    400         "Haolin Jin",
    401         "Linghan Huang",
    402         "Haipeng Cai",
    403         "Jun Yan",
    404         "Bo Li",
    405         "Huaming Chen"
    406       ],
    407       "year": 2024,
    408       "arxiv_id": "2408.02479",
    409       "relevance": "Survey of LLM-based agents for software engineering by the same first author (Haolin Jin); directly relevant to the scope of this survey project."
    410     },
    411     {
    412       "title": "A Survey on Large Language Models for Code Generation",
    413       "authors": [
    414         "Juyong Jiang",
    415         "Fan Wang",
    416         "Jiasi Shen",
    417         "Sungju Kim",
    418         "Sunghun Kim"
    419       ],
    420       "year": 2024,
    421       "arxiv_id": "2406.00515",
    422       "relevance": "Survey of LLMs for code generation, directly relevant to assessing the methodology and coverage of LLM coding research."
    423     },
    424     {
    425       "title": "An empirical study of the non-determinism of chatgpt in code generation",
    426       "authors": [
    427         "Shuyin Ouyang",
    428         "Jie M Zhang",
    429         "Mark Harman",
    430         "Meng Wang"
    431       ],
    432       "year": 2024,
    433       "relevance": "Empirical study of prompt sensitivity and non-determinism in LLM code generation, relevant to reproducibility concerns."
    434     },
    435     {
    436       "title": "The rise and potential of large language model based agents: A survey",
    437       "authors": [
    438         "Zhiheng Xi",
    439         "Wenxiang Chen",
    440         "Xin Guo"
    441       ],
    442       "year": 2023,
    443       "arxiv_id": "2309.07864",
    444       "relevance": "Comprehensive survey of LLM-based agents, directly relevant to the agentic AI scope of this research project."
    445     },
    446     {
    447       "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions",
    448       "authors": [
    449         "Hammond Pearce",
    450         "Baleegh Ahmad",
    451         "Benjamin Tan",
    452         "Brendan Dolan-Gavitt",
    453         "Ramesh Karri"
    454       ],
    455       "year": 2022,
    456       "doi": "10.1109/SP46214.2022.9833571",
    457       "relevance": "Evaluates security vulnerabilities in Copilot-generated code, relevant to the code security challenges discussed in this paper."
    458     },
    459     {
    460       "title": "ClarifyGPT: Empowering LLM-based Code Generation with Intention Clarification",
    461       "authors": [
    462         "Fangwen Mu",
    463         "Lin Shi",
    464         "Song Wang"
    465       ],
    466       "year": 2023,
    467       "arxiv_id": "2310.10996",
    468       "relevance": "Proposes a method for LLMs to clarify ambiguous requirements before code generation, relevant to prompt engineering for code generation."
    469     },
    470     {
    471       "title": "Teaching large language models to self-debug",
    472       "authors": [
    473         "Xinyun Chen",
    474         "Maxwell Lin",
    475         "Nathanael Schärli",
    476         "Denny Zhou"
    477       ],
    478       "year": 2023,
    479       "arxiv_id": "2304.05128",
    480       "relevance": "Introduces self-debugging for LLMs in code generation, relevant to the refinement and debugging layers discussed in this roadmap."
    481     },
    482     {
    483       "title": "In-IDE Code Generation from Natural Language: Promise and Challenges",
    484       "authors": [
    485         "Frank F. Xu",
    486         "Bogdan Vasilescu",
    487         "Graham Neubig"
    488       ],
    489       "year": 2021,
    490       "arxiv_id": "2101.11149",
    491       "relevance": "Evaluates real-world usability of LLM-based code generation in IDEs; source of the claim that developers may not see large efficiency gains."
    492     }
    493   ]
    494 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs