ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (21263B)


      1 {
      2   "paper": {
      3     "title": "A Review of Research on AI-Assisted Code Generation and AI-Driven Code Review",
      4     "authors": ["Yuzhi Wang"],
      5     "year": 2025,
      6     "venue": "Unknown",
      7     "doi": "10.54097/d6775287"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["survey_methodology"],
     11   "methodology_tags": ["meta-analysis"],
     12   "key_findings": "This narrative review covers AI-assisted code generation (Code Llama, Copilot) and AI-driven code review. It reports that only about 46.3% of Copilot-generated code is logically correct (citing Yetistiren et al. 2023) and that Copilot's code review often fails to detect critical security vulnerabilities such as SQL injection and XSS (citing Amro & Alalfi 2025). The paper concludes that LLMs excel at code style and low-severity defects but cannot replace dedicated security tools or manual review.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No code or analysis scripts released. A survey can release its analysis pipeline or corpus; this paper provides none."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No dataset, search corpus, or extracted data is released. The paper does not provide its paper selection corpus or any supplementary data."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment or tooling specifications provided. A survey could document the tools used for literature search; none are mentioned."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No reproduction instructions. There is no description of how to replicate the literature search or the review process."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": false,
     39         "answer": false,
     40         "justification": "This is a narrative literature review with no original experiments or statistical analysis."
     41       },
     42       "significance_tests": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "No original experiments; the paper only summarizes findings from cited works."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "No original experiments or statistical aggregation performed."
     51       },
     52       "sample_size_justified": {
     53         "applies": false,
     54         "answer": false,
     55         "justification": "No original experiments conducted. No meta-analysis with sample size considerations."
     56       },
     57       "variance_reported": {
     58         "applies": false,
     59         "answer": false,
     60         "justification": "No original experiments or multi-run analysis. Pure narrative review."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The survey does not compare itself against prior surveys or reviews in the same space. It does not reference or position itself relative to existing review papers on AI code generation or code review."
     68       },
     69       "baselines_contemporary": {
     70         "applies": false,
     71         "answer": false,
     72         "justification": "No experimental baselines; this is a survey paper with no original evaluation."
     73       },
     74       "ablation_study": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "No system with components to ablate; this is a literature review."
     78       },
     79       "multiple_metrics": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "No original experiments requiring metrics."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No original experiments or system outputs to evaluate."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "No experiments requiring test sets."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The paper provides Table 1 comparing Code Llama vs Copilot across multiple aspects (core positioning, context capability, instruction following, quality focus) and Table 2 comparing AI-assisted code generation vs AI-driven code review across comparison metrics."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper discusses failure cases: Copilot's inability to detect SQL injection, XSS, and insecure deserialization (Section 2.2.1.2), and the 46.3% logical correctness rate for generated code (Section 4)."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports negative findings: Copilot's 46.3% correctness rate, failure to detect critical security vulnerabilities, and limitations of LLM-based code review in deep security knowledge (Sections 2.2.1.2, 4)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims (efficiency gains coexist with quality challenges, Copilot/Code Llama accelerate development, AI handles low-severity defects) are discussed in the body with citations to supporting papers. Claims are general but match the body content."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper makes causal-sounding claims such as 'AI has evolved from an auxiliary tool to a key technology that deeply participates in code optimization' and that tools 'significantly accelerated the development process.' These are asserted without rigorous causal evidence from the review; the paper simply relays claims from cited works without critically assessing the causal evidence."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title claims 'A Review of Research on AI-Assisted Code Generation and AI-Driven Code Review' but the paper only discusses Code Llama and Copilot in depth, with only 18 total references. The scope is not bounded to these two tools; conclusions are presented as general findings about 'AI' and 'LLM' in software development."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": false,
    128         "answer": false,
    129         "justification": "This is a pure survey/review paper presenting no original empirical results. NA per schema guidance."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": false,
    133         "answer": false,
    134         "justification": "This is a survey paper with no original measurements. NA per schema guidance for papers with no measurements."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "No original experiments using models. The paper reviews other work."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "No prompting used; this is a literature review."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "No original experiments requiring hyperparameter specification."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding used; this is a survey paper."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "No paper selection pipeline is documented. There is no description of which databases were searched, what search queries were used, what inclusion/exclusion criteria were applied, or how many papers were screened at each stage. The paper appears to have selected references ad hoc."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated limitations or threats-to-validity section. The Discussion (Section 3) and Conclusion (Section 4) mention challenges of the reviewed tools but do not discuss limitations of the review itself."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats to the validity of the review are discussed. The paper does not acknowledge potential biases in its paper selection, coverage gaps, or methodological limitations."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No explicit scope boundaries stated. The paper does not state what it does NOT cover, what tools/models are excluded, or what time period is bounded. The reader cannot determine what falls outside the review's scope."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw data, paper corpus, or extraction tables are made available for verification."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No description of how the 18 cited papers were found or selected. No search strategy, databases, or time period documented."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "For a survey, this concerns how source papers were identified. No recruitment/search method is described — the paper selection appears entirely ad hoc."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "No pipeline from literature search to final paper corpus is documented. There are no PRISMA-style counts, no screening stages, and no filtering criteria."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section mentioning grants or sponsors."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "The author's affiliation is stated: 'Beijing University of Technology, Beijing, China.' The author does not appear affiliated with any company whose products are reviewed."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding is disclosed, so independence cannot be assessed. Absence of disclosure is not evidence of absence of funding."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "This is a survey paper that does not evaluate any pre-trained model on benchmarks."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "This is a survey paper with no model evaluation on benchmarks."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "This is a survey paper with no benchmark evaluation."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this literature review."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this literature review."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this literature review."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this literature review."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this literature review."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this literature review."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this literature review."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "Survey paper with no original method or experiments."
    284       },
    285       "compute_budget_stated": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "Survey paper with no original computation."
    289       }
    290     },
    291     "survey_methodology": {
    292       "prisma_or_structured_protocol": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No PRISMA flow diagram, no structured review protocol, no reproducible search queries, and no systematic methodology for paper selection. The review appears to be an ad-hoc narrative collection of 18 references."
    296       },
    297       "quality_assessment_of_sources": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No quality assessment rubric or risk-of-bias evaluation is applied to the cited papers. All sources are treated equally regardless of their methodological rigor — findings from preprints and peer-reviewed work are presented with equal weight."
    301       },
    302       "publication_bias_discussed": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No discussion of publication bias. The paper does not consider whether its sources are biased toward positive results or whether negative findings about AI tools are underrepresented."
    306       }
    307     }
    308   },
    309   "claims": [
    310     {
    311       "claim": "Models such as Code Llama and Copilot have significantly accelerated the development process.",
    312       "evidence": "Stated in the abstract and supported by citations to Roziere et al. (2023) on Code Llama and Yetistiren et al. (2022) on Copilot, but no quantitative evidence of acceleration is presented in this review (Section 1, Section 2.1).",
    313       "supported": "weak"
    314     },
    315     {
    316       "claim": "The proportion of the code generated by Copilot that is logically correct is only approximately 46.3%.",
    317       "evidence": "Cites Yetistiren et al. (2023) evaluation results (Section 4, Table 2). The specific figure is traceable to the cited empirical study.",
    318       "supported": "moderate"
    319     },
    320     {
    321       "claim": "Copilot often failed to detect critical vulnerabilities such as SQL Injection, XSS, and insecure deserialization.",
    322       "evidence": "Cites Amro & Alalfi (2025) systematic evaluation (Section 2.2.1.2). The claim is traceable to the cited study.",
    323       "supported": "moderate"
    324     },
    325     {
    326       "claim": "AI can effectively handle code standards and low-severity defects in code review.",
    327       "evidence": "Discussed in Section 2.2.1.1 citing Copilot's ability to suggest style fixes and identify spelling/grammatical errors, but 'effectively' is not quantified and relies on a single tool's described behavior.",
    328       "supported": "weak"
    329     },
    330     {
    331       "claim": "The future of AI-driven development requires transformation from auxiliary tools to intelligent Agents with self-repair capabilities.",
    332       "evidence": "Stated in the abstract and conclusion (Section 4) as a forward-looking recommendation. No empirical evidence is provided for this claim; it is speculative.",
    333       "supported": "unsupported"
    334     }
    335   ],
    336   "red_flags": [
    337     {
    338       "flag": "Extremely shallow survey coverage",
    339       "detail": "The paper has only 18 references total and discusses only two tools (Code Llama and Copilot) in depth. For a paper claiming to review 'AI-Assisted Code Generation and AI-Driven Code Review,' this coverage is inadequate — major tools like StarCoder, CodeWhisperer, DeepSeek Coder, and many others are mentioned only in passing or omitted."
    340     },
    341     {
    342       "flag": "No systematic search methodology",
    343       "detail": "There is no description of how papers were found or selected. No databases searched, no search terms, no inclusion/exclusion criteria, no PRISMA diagram. The paper selection appears entirely ad hoc, making the review non-reproducible."
    344     },
    345     {
    346       "flag": "No quality assessment of source papers",
    347       "detail": "The survey treats all cited papers equally without assessing their methodological quality. Findings from preprints and peer-reviewed work are presented side by side with no assessment of evidence strength, so readers cannot tell well-supported findings from weak ones."
    348     },
    349     {
    350       "flag": "Claims far exceed evidence base",
    351       "detail": "The paper makes sweeping conclusions about the state of AI in software development ('a truly efficient and secure human-machine collaboration paradigm') based on a handful of papers about two tools. The generalization is not warranted by the narrow evidence reviewed."
    352     },
    353     {
    354       "flag": "No limitations discussion for the review itself",
    355       "detail": "The paper discusses limitations of the tools reviewed but never acknowledges limitations of the review methodology, coverage gaps, or potential biases in paper selection."
    356     }
    357   ],
    358   "cited_papers": [
    359     {
    360       "title": "Evaluating large language models trained on code",
    361       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    362       "year": 2021,
    363       "arxiv_id": "2107.03374",
    364       "relevance": "Foundational Codex evaluation paper — directly relevant to AI code generation capabilities and benchmark methodology."
    365     },
    366     {
    367       "title": "Code llama: Open foundation models for code",
    368       "authors": ["Baptiste Roziere", "Jonas Gehring", "Fabian Gloeckle"],
    369       "year": 2023,
    370       "arxiv_id": "2308.12950",
    371       "relevance": "Primary paper on Code Llama model family — core to the survey's code generation coverage."
    372     },
    373     {
    374       "title": "Assessing the quality of GitHub copilot's code generation",
    375       "authors": ["Burak Yetistiren", "Isik Ozsoy", "Eray Tuzun"],
    376       "year": 2022,
    377       "relevance": "Empirical evaluation of Copilot code quality — cited in this review for Copilot's code generation capabilities."
    378     },
    379     {
    380       "title": "Evaluating the code quality of ai-assisted code generation tools: An empirical study on github copilot, amazon codewhisperer, and chatgpt",
    381       "authors": ["Burak Yetistiren", "Isik Ozsoy", "Merve Ayerdem", "Eray Tuzun"],
    382       "year": 2023,
    383       "arxiv_id": "2304.10778",
    384       "relevance": "Comparative evaluation of multiple AI code generation tools with quality metrics — source of the 46.3% Copilot correctness figure cited in this review."
    385     },
    386     {
    387       "title": "Large language models for code analysis: Do LLMs really do their job?",
    388       "authors": ["Chongzhou Fang", "Ning Miao", "Shaurya Srivastav"],
    389       "year": 2024,
    390       "relevance": "Systematic evaluation of LLM capabilities for code analysis, published at USENIX Security 2024."
    391     },
    392     {
    393       "title": "GitHub's Copilot Code Review: Can AI Spot Security Flaws Before You Commit?",
    394       "authors": ["Ahmad Amro", "Manar H. Alalfi"],
    395       "year": 2025,
    396       "arxiv_id": "2509.13650",
    397       "relevance": "Evaluates Copilot's code review for security vulnerability detection — key finding about SQL injection/XSS detection failures."
    398     },
    399     {
    400       "title": "AI-powered code review with llms: Early results",
    401       "authors": ["Zahra Rasheed", "Muhammad Asif Sami", "Muhammad Waseem"],
    402       "year": 2024,
    403       "arxiv_id": "2404.18496",
    404       "relevance": "Early empirical results on LLM-powered code review quality and capabilities."
    405     },
    406     {
    407       "title": "Exploring and evaluating hallucinations in llm-powered code generation",
    408       "authors": ["Fang Liu", "Yang Liu", "Lin Shi"],
    409       "year": 2024,
    410       "arxiv_id": "2404.00971",
    411       "relevance": "Directly relevant to code generation reliability — examines hallucination patterns in LLM-generated code."
    412     },
    413     {
    414       "title": "Starcoder: may the source be with you!",
    415       "authors": ["Raymond Li", "Loubna Ben Allal", "Yangtian Zi"],
    416       "year": 2023,
    417       "arxiv_id": "2305.06161",
    418       "relevance": "Major open-source code LLM relevant to AI code generation benchmarking."
    419     },
    420     {
    421       "title": "A survey on code generation with llm-based agents",
    422       "authors": ["Yifan Dong", "Xiao Jiang", "Jin Qian"],
    423       "year": 2025,
    424       "arxiv_id": "2508.00083",
    425       "relevance": "Comprehensive survey on LLM-based agent approaches to code generation — directly relevant to agentic coding research."
    426     }
    427   ]
    428 }

Impressum · Datenschutz