ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24165B)


      1 {
      2   "paper": {
      3     "title": "Retrieval-Augmented Code Generation: A Survey with Focus on Repository-Level Approaches",
      4     "authors": [
      5       "Yicheng Tao",
      6       "Yao Qin",
      7       "Yepang Liu"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv",
     11     "arxiv_id": "2510.04905",
     12     "doi": "10.48550/arXiv.2510.04905"
     13   },
     14   "scan_version": 2,
     15   "active_modules": ["survey_methodology"],
     16   "checklist": {
     17     "artifacts": {
     18       "code_released": {
     19         "applies": true,
     20         "answer": false,
     21         "justification": "No repository URL, code archive, or supplementary materials are mentioned. The survey could have released its paper corpus, analysis scripts, or taxonomy data but did not."
     22       },
     23       "data_released": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "The list of 110 surveyed papers and their annotations/categorizations are not released as a downloadable dataset. No supplementary data files are provided."
     27       },
     28       "environment_specified": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No environment specifications are provided. As a survey, this could have included tools used for analysis or bibliometric processing, but none are mentioned."
     32       },
     33       "reproduction_instructions": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No reproduction instructions are provided. The exact search keywords are not listed ('we derived and refined a set of search terms' without specifying them), making the search unreproducible."
     37       }
     38     },
     39     "statistical_methodology": {
     40       "confidence_intervals_or_error_bars": {
     41         "applies": false,
     42         "answer": false,
     43         "justification": "This is a survey paper with no experiments. Statistical uncertainty measures are not applicable."
     44       },
     45       "significance_tests": {
     46         "applies": false,
     47         "answer": false,
     48         "justification": "This is a survey paper with no experiments or statistical comparisons."
     49       },
     50       "effect_sizes_reported": {
     51         "applies": false,
     52         "answer": false,
     53         "justification": "This is a survey paper with no experiments. No effect sizes are applicable."
     54       },
     55       "sample_size_justified": {
     56         "applies": false,
     57         "answer": false,
     58         "justification": "This is a survey paper with no experiments requiring sample size justification."
     59       },
     60       "variance_reported": {
     61         "applies": false,
     62         "answer": false,
     63         "justification": "This is a survey paper with no experiments requiring variance reporting."
     64       }
     65     },
     66     "evaluation_design": {
     67       "baselines_included": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The paper positions itself against prior surveys [22-28] in Section 1, noting that 'existing surveys mainly focus on snippet-level generation or general software engineering tasks, with limited discussion of retrieval-based or repository-level methods.' This comparison justifies the survey's contribution."
     71       },
     72       "baselines_contemporary": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The prior surveys cited for comparison (references 22-28) include recent works from 2024-2025, representing the current state of survey literature in this area."
     76       },
     77       "ablation_study": {
     78         "applies": false,
     79         "answer": false,
     80         "justification": "Survey paper with no system components to ablate."
     81       },
     82       "multiple_metrics": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "Survey paper with no experiments requiring metrics."
     86       },
     87       "human_evaluation": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "Survey paper with no system outputs to evaluate."
     91       },
     92       "held_out_test_set": {
     93         "applies": false,
     94         "answer": false,
     95         "justification": "Survey paper with no experiments requiring train/test splits."
     96       },
     97       "per_category_breakdown": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The survey provides detailed breakdowns by multiple dimensions: retrieval strategy (graph vs non-graph, Section 4.1), training paradigm (Section 4.2), agent architecture level (Level 0/1/2, Section 4.3), downstream tasks (Section 4.4), programming language (Section 4.5, Figure 9), backbone models (Section 4.6), and venue distribution (Figure 3)."
    101       },
    102       "failure_cases_discussed": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 5.1 'Limitations of Existing Approaches' discusses where current RACG approaches fail: context window constraints, graph-based RAG complexity and noise, insufficient dataset scale, and limited deployment readiness."
    106       },
    107       "negative_results_reported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 4.1.4 discusses cases where RAG does not help: 'when repositories are relatively small and well-structured, LC models can match or even outperform RAG' (Peng et al.). The paper also notes 'RAG is not universally superior.'"
    111       }
    112     },
    113     "claims_and_evidence": {
    114       "abstract_claims_supported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The abstract claims a 'comprehensive review,' taxonomy by 'generation strategies, retrieval modalities, model architectures, training paradigms, and evaluation protocols,' and a 'unified analytical framework.' All of these are delivered in the paper body through Sections 4-5 and the taxonomy figures."
    118       },
    119       "causal_claims_justified": {
    120         "applies": false,
    121         "answer": false,
    122         "justification": "The survey makes no original causal claims. Interpretive statements about trends (e.g., 'Python dominates... likely due to its popularity') are appropriately hedged with 'likely' or 'may be due to,' and causal claims about RAG effectiveness are attributed to cited work."
    123       },
    124       "generalization_bounded": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Section 3.2 bounds the search to January 2023 – August 2025, specific venues, and specific platforms. Inclusion/exclusion criteria (Section 3.3) further bound the scope. The title itself scopes to 'Repository-Level Approaches.'"
    128       },
    129       "alternative_explanations_discussed": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "This is a pure survey/taxonomy paper that presents no original empirical results requiring alternative explanations."
    133       },
    134       "proxy_outcome_distinction": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "This is a survey paper with no measurements or proxies of its own."
    138       }
    139     },
    140     "setup_transparency": {
    141       "model_versions_specified": {
    142         "applies": false,
    143         "answer": false,
    144         "justification": "No models are used by the survey itself."
    145       },
    146       "prompts_provided": {
    147         "applies": false,
    148         "answer": false,
    149         "justification": "No prompting is used in this survey."
    150       },
    151       "hyperparameters_reported": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No experiments with hyperparameters."
    155       },
    156       "scaffolding_described": {
    157         "applies": false,
    158         "answer": false,
    159         "justification": "No agentic scaffolding used in this survey."
    160       },
    161       "data_preprocessing_documented": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 3 documents the data collection pipeline: manual venue review → keyword-based automated search across ACM DL, IEEE Xplore, arXiv, OpenReview, ACL Anthology → 579 candidates → screening with inclusion/exclusion criteria (Section 3.3) → quality assessment (Section 3.4) → backward snowballing (Section 3.5) → 110 papers. Criteria at each stage are stated."
    165       }
    166     },
    167     "limitations_and_scope": {
    168       "limitations_section_present": {
    169         "applies": true,
    170         "answer": false,
    171         "justification": "Section 5.1 discusses limitations of the RACG approaches surveyed (context windows, graph complexity, etc.), but there is no discussion of the survey's own methodological limitations — e.g., search coverage gaps, keyword selection bias, or limitations of the manual annotation process."
    172       },
    173       "threats_to_validity_specific": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No threats to validity of the survey itself are discussed. There is no mention of potential biases in paper selection, limitations of the search strategy, or risks of the manual categorization process."
    177       },
    178       "scope_boundaries_stated": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The scope is bounded by time period (Jan 2023 – Aug 2025), venues (listed in Section 3.2), platforms (five bibliographic databases), and explicit exclusion criteria (Section 3.3: papers without LMs, papers using RAG only as minor component, vulnerability/clone detection, etc.)."
    182       }
    183     },
    184     "data_integrity": {
    185       "raw_data_available": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "The list of 110 papers, their categorizations, and annotations are not available for download. No supplementary materials are provided for independent verification."
    189       },
    190       "data_collection_described": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "Section 3.2 describes the two-stage data collection: manual review of leading venues followed by automated keyword searches across five platforms, with the search period (Jan 2023 – Aug 2025) and platforms explicitly stated."
    194       },
    195       "recruitment_methods_described": {
    196         "applies": false,
    197         "answer": false,
    198         "justification": "No human participants were recruited. The data source is a literature search rather than a standard benchmark; its search methodology is documented in Section 3."
    199       },
    200       "data_pipeline_documented": {
    201         "applies": true,
    202         "answer": false,
    203         "justification": "The pipeline stages are described (manual review → keyword search → 579 candidates → screening → quality assessment → snowballing → 110 papers), but intermediate counts are missing. The number of papers excluded at screening and at quality assessment, and the number added by snowballing, are not stated — only the starting (579) and ending (110) counts are given."
    204       }
    205     },
    206     "conflicts_of_interest": {
    207       "funding_disclosed": {
    208         "applies": true,
    209         "answer": false,
    210         "justification": "No funding or acknowledgments section is present in the paper."
    211       },
    212       "affiliations_disclosed": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "Author affiliations are clearly listed: Carnegie Mellon University, Chinese University of Hong Kong, and Southern University of Science and Technology."
    216       },
    217       "funder_independent_of_outcome": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No funding information is disclosed, so independence cannot be assessed."
    221       },
    222       "financial_interests_declared": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No competing interests or financial interests statement is present in the paper."
    226       }
    227     },
    228     "contamination": {
    229       "training_cutoff_stated": {
    230         "applies": false,
    231         "answer": false,
    232         "justification": "Survey paper that does not evaluate any pre-trained model on a benchmark."
    233       },
    234       "train_test_overlap_discussed": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "Survey paper that does not evaluate any pre-trained model on a benchmark."
    238       },
    239       "benchmark_contamination_addressed": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "Survey paper that does not evaluate any pre-trained model on a benchmark."
    243       }
    244     },
    245     "human_studies": {
    246       "pre_registered": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants in this survey."
    250       },
    251       "irb_or_ethics_approval": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this survey."
    255       },
    256       "demographics_reported": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this survey."
    260       },
    261       "inclusion_exclusion_criteria": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this survey."
    265       },
    266       "randomization_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this survey."
    270       },
    271       "blinding_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this survey."
    275       },
    276       "attrition_reported": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this survey."
    280       }
    281     },
    282     "cost_and_practicality": {
    283       "inference_cost_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "Survey paper with no computational method of its own."
    287       },
    288       "compute_budget_stated": {
    289         "applies": false,
    290         "answer": false,
    291         "justification": "Survey paper with no computational experiments."
    292       }
    293     },
    294     "survey_methodology": {
    295       "prisma_or_structured_protocol": {
    296         "applies": true,
    297         "answer": true,
    298         "justification": "Section 3 follows a structured review approach 'inspired by established guidelines in software engineering literature [30, 27, 22]' with defined research questions (Section 3.1), systematic data collection (Section 3.2), inclusion/exclusion criteria (Section 3.3), quality assessment (Section 3.4), and snowballing (Section 3.5). However, no PRISMA flow diagram is provided and exact search keywords are not listed."
    299       },
    300       "quality_assessment_of_sources": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "Section 3.4 describes quality assessment as a screening criterion (relevance, methodological clarity, retrieval-generation integration, reproducibility), but once papers pass screening, all 110 are treated equally. There is no quality scoring rubric, risk-of-bias assessment, or structured evaluation of included studies' methodological rigor."
    304       },
    305       "publication_bias_discussed": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "Publication bias is not discussed. The paper notes arXiv dominates (37.3%) but does not consider whether included papers skew toward positive results, whether negative findings are underrepresented, or whether the venue distribution introduces systematic bias."
    309       }
    310     }
    311   },
    312   "claims": [
    313     {
    314       "claim": "This is the first comprehensive survey on Retrieval-Augmented Code Generation (RACG) emphasizing repository-level reasoning challenges.",
    315       "evidence": "The paper positions against prior surveys [22-28] in Section 1, noting existing surveys 'mainly focus on snippet-level generation or general software engineering tasks, with limited discussion of retrieval-based or repository-level methods.'",
    316       "supported": "moderate"
    317     },
    318     {
    319       "claim": "Python dominates RACG research with 79 occurrences, far ahead of Java (38) and other languages.",
    320       "evidence": "Figure 9 (Section 4.5) shows the programming language distribution analyzed from explicit mentions in papers.",
    321       "supported": "strong"
    322     },
    323     {
    324       "claim": "75.2% of RACG systems are non-agent (Level 0), with only 15.8% being fully autonomous agents (Level 2).",
    325       "evidence": "Figure 7 shows the distribution pie chart; Section 4.3 provides the three-tier classification framework with categorization of individual systems.",
    326       "supported": "strong"
    327     },
    328     {
    329       "claim": "RAG effectiveness is context-dependent: RAG excels in large/complex repositories but long-context LLMs can match or surpass RAG in smaller, structured settings.",
    330       "evidence": "Section 4.1.4 cites Peng et al. [112] showing LC models matching RAG in small repos, and multiple works (Chen et al. [114], Yang et al. [115]) demonstrating RAG advantages in specific contexts.",
    331       "supported": "moderate"
    332     },
    333     {
    334       "claim": "Small open-source models (<10B) and proprietary models are significantly more popular than large open-source models (>10B) as base generation models in RACG.",
    335       "evidence": "Figure 10 (Section 4.6) shows the distribution of base generation models adopted in recommended configurations across the surveyed papers.",
    336       "supported": "strong"
    337     },
    338     {
    339       "claim": "Contain and Invoke are foundational edge types in graph-based RAG, forming part of a de facto standard three-tier abstraction (Module-Class-Function).",
    340       "evidence": "Table 2 in Section 4.1.2 compares 25 graph-based RAG methods, showing Contain appears in nearly all and Invoke in most, with Module-Class-Function nodes co-occurring frequently.",
    341       "supported": "strong"
    342     },
    343     {
    344       "claim": "BM25 retrieval achieves competitive or best performance for code matching in multiple studies.",
    345       "evidence": "Section 4.1.4 cites Chen et al. [114] finding BM25 retrieves best for code matching, Yang et al. [115] finding BM25 balances simplicity and effectiveness, and Wang et al. [123] confirming BM25's balance of effectiveness and efficiency.",
    346       "supported": "moderate"
    347     }
    348   ],
    349   "methodology_tags": ["meta-analysis"],
    350   "key_findings": "This survey reviews 110 papers on Retrieval-Augmented Code Generation, providing a taxonomy covering non-graph-based and graph-based retrieval strategies, training paradigms, and a three-tier agent architecture classification (Level 0-2). The analysis reveals Python dominance (79 occurrences), the prevalence of non-agent systems (75.2%), and that RAG effectiveness is context-dependent rather than universally superior. Key gaps identified include limited multilingual support, insufficient benchmark diversity, and the underexploration of data/control flow edges in graph-based approaches.",
    351   "red_flags": [
    352     {
    353       "flag": "No artifacts released",
    354       "detail": "The survey provides no downloadable corpus of the 110 papers, their categorizations, or analysis data. The exact search keywords used for automated search are not disclosed, making the search unreproducible."
    355     },
    356     {
    357       "flag": "No quality assessment of included studies",
    358       "detail": "All 110 papers are treated equally once they pass an initial screening. There is no quality scoring or risk-of-bias assessment of included studies, meaning findings from rigorous controlled experiments are weighted the same as preliminary arXiv preprints."
    359     },
    360     {
    361       "flag": "No discussion of publication bias",
    362       "detail": "The survey does not consider whether included papers skew toward positive RAG results, whether negative findings are underrepresented, or whether the 37.3% arXiv dominance introduces systematic quality or reporting bias."
    363     },
    364     {
    365       "flag": "Missing survey self-limitations",
    366       "detail": "Section 5.1 discusses limitations of the surveyed approaches, but the paper never discusses limitations of the survey itself — potential search coverage gaps, keyword selection bias, categorization subjectivity, or the impact of the 579→110 filtering on representativeness."
    367     },
    368     {
    369       "flag": "Opaque filtering pipeline",
    370       "detail": "The paper reports going from 579 candidate papers to 110 retained but provides no intermediate counts: neither how many were excluded at inclusion/exclusion screening and at quality assessment, nor how many were added through snowballing."
    371     }
    372   ],
    373   "cited_papers": [
    374     {
    375       "title": "SWE-bench: Can language models resolve real-world github issues?",
    376       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R Narasimhan"],
    377       "year": 2024,
    378       "relevance": "Major benchmark for evaluating LLM-based automated software engineering at the repository level."
    379     },
    380     {
    381       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    382       "authors": ["John Yang", "Carlos E Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"],
    383       "year": 2024,
    384       "relevance": "Foundational Level 2 agentic framework for automated code repair with custom Agent-Computer Interface."
    385     },
    386     {
    387       "title": "RepoCoder: Repository-level code completion through iterative retrieval and generation",
    388       "authors": ["Fengji Zhang", "Bei Chen", "Yue Zhang", "Jacky Keung", "Jin Liu", "Daoguang Zan", "Yi Mao", "Jian-Guang Lou", "Weizhu Chen"],
    389       "year": 2023,
    390       "relevance": "Foundational RACG work introducing iterative retrieval and the RepoEval benchmark for repository-level code completion."
    391     },
    392     {
    393       "title": "OpenHands: An open platform for AI software developers as generalist agents",
    394       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    395       "year": 2025,
    396       "relevance": "General-purpose agentic platform for automated software development, demonstrating scalable collaborative agent systems."
    397     },
    398     {
    399       "title": "Evaluating large language models trained on code",
    400       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    401       "year": 2021,
    402       "arxiv_id": "2107.03374",
    403       "relevance": "Introduces HumanEval benchmark, widely used for evaluating code generation capabilities of LLMs."
    404     },
    405     {
    406       "title": "AutoCodeRover: Autonomous program improvement",
    407       "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
    408       "year": 2024,
    409       "relevance": "Level 2 autonomous agent integrating AST traversal, iterative retrieval, and spectrum-based fault localization for program repair."
    410     },
    411     {
    412       "title": "CodexGraph: Bridging large language models and code repositories via code graph databases",
    413       "authors": ["Xiangyan Liu", "Bo Lan", "Zhiyuan Hu"],
    414       "year": 2025,
    415       "relevance": "Bridges LLMs with code graph databases using dual-agent architecture for repository-level reasoning."
    416     },
    417     {
    418       "title": "Demystifying llm-based software engineering agents",
    419       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"],
    420       "year": 2025,
    421       "relevance": "Introduces the Agentless approach, demonstrating that a simple locate-repair-verify pipeline can match complex agents on SWE-bench."
    422     },
    423     {
    424       "title": "Qwen2.5-coder technical report",
    425       "authors": ["Binyuan Hui", "Jian Yang", "Zeyu Cui"],
    426       "year": 2024,
    427       "arxiv_id": "2409.12186",
    428       "relevance": "Major open-source code LLM family supporting 92 programming languages, widely used as backbone in RACG systems."
    429     },
    430     {
    431       "title": "A survey on large language models for code generation",
    432       "authors": ["Juyong Jiang", "Fan Wang", "Jiasi Shen", "Sungju Kim", "Sunghun Kim"],
    433       "year": 2025,
    434       "relevance": "Prior survey on LLM code generation that this paper positions against as lacking RACG/repository-level focus."
    435     },
    436     {
    437       "title": "CodeRAG-bench: Can retrieval augment code generation?",
    438       "authors": ["Zora Zhiruo Wang", "Akari Asai", "Xinyan Velocity Yu"],
    439       "year": 2025,
    440       "relevance": "Benchmark specifically for evaluating retrieval-augmented code generation, directly relevant to RACG evaluation practices."
    441     },
    442     {
    443       "title": "RLCoder: Reinforcement Learning for Repository-Level Code Completion",
    444       "authors": ["Yanlin Wang", "Yanli Wang", "Daya Guo"],
    445       "year": 2025,
    446       "relevance": "Applies reinforcement learning to train retrievers for repository-level code completion, demonstrating RL's value in RACG."
    447     }
    448   ]
    449 }

Impressum · Datenschutz