scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24756B)
      1 {
      2   "paper": {
      3     "title": "CAST: Enhancing Code Retrieval-Augmented Generation with Structural Chunking via Abstract Syntax Tree",
      4     "authors": ["Yilin Zhang", "Xinran Zhao", "Zora Zhiruo Wang", "Chenyang Yang", "Jiayi Wei", "Tongshuang Wu"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2506.15655"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The paper provides a GitHub link: https://github.com/yilinjz/astchunk (mentioned in the abstract footnote and in Section A.6). The code is stated as open source under the MIT License (Table 9)."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses publicly available benchmarks: RepoEval, CrossCodeEval, and SWE-bench Lite, all with public links and open licenses listed in Table 9."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "Appendix A.1 mentions hardware (8 Nvidia A100 40G GPUs, 8 Nvidia A6000 40G GPUs, CUDA 12) and vLLM for inference, but there is no requirements.txt, Dockerfile, or detailed library version listing to recreate the environment."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "While the algorithm is provided in pseudocode (Algorithm 1) and the code repository is linked, the paper does not include step-by-step reproduction instructions (e.g., specific commands to run, a README with reproduction steps for the main experiments)."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results in Tables 1-8 are reported as point estimates without confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims CAST improves over fixed-size chunking across multiple settings but provides no statistical significance tests. Comparisons are made purely by comparing point estimates."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports absolute point improvements with baseline context throughout, e.g., 'boosting Recall@5 by 4.3 points on RepoEval' and 'Pass@1 by 2.67 points on SWE-bench generation' (Abstract, Section 3.2). Tables show both CAST and baseline numbers, allowing readers to assess effect magnitude."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is given for why specific benchmarks or their sizes were chosen. SWE-bench Lite is a 300-problem subset, but there is no discussion of whether this is sufficient for the claims made."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or spread across experimental runs is reported. Results appear to be from single runs (for Pass@1 with t=0.2) with no mention of multiple seeds or repetitions."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "CAST is consistently compared against fixed-size line-based chunking as the baseline across all experiments (Tables 1-8)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The only baseline is fixed-size line-based chunking, which is the simplest approach. No comparison is made against other structure-aware or semantic chunking methods. The paper mentions CodeGRAG (Du et al., 2025) as related work but does not compare against it."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section 4 presents ablation studies: split-only vs. split-then-merge (Table 2), context length selection (Table 3), and maximum chunk size sensitivity (Table 4)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper uses multiple metrics: nDCG@5, Precision@5, Recall@5 for retrieval; Pass@1, Pass@8, exact match (EM), and edit similarity (ES) for generation (Section 3.1)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation is included. All evaluation is automated via metrics on benchmark datasets. Human evaluation of chunk quality or generated code quality could be relevant to validating whether AST-based chunks are indeed more 'semantically coherent' as claimed."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper uses established benchmark test sets (RepoEval, CrossCodeEval, SWE-bench Lite) with standard splits. No tuning on test data is mentioned."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 8 provides per-language breakdowns (Python, Java, C#, TypeScript) on CrossCodeEval. Results are also broken down by retriever model and generator model across all tables."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "The paper does not discuss specific failure cases or show examples where CAST performs worse than the baseline. The only negative result is in the ablation (BGE-large on SWE-bench), but this is not analyzed."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Table 3 shows that increasing context length from 4000 to 8000 can hurt performance for CodeSage+StarCoder2 (73.2 → 69.2). Table 7 shows cases where CAST with BGE-large performs worse than fixed-size (13.3 vs 14.6 for Claude). These negative results are present in the data though not deeply discussed."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims 'boosting Recall@5 by 4.3 points on RepoEval' and 'Pass@1 by 2.67 points on SWE-bench generation' are supported by Table 1 (e.g., CodeSage Recall@5: 83.9 vs 82.1 = 1.8; GIST Recall@5: 75.0 vs 70.7 = 4.3; and SWE-bench Pass@1 with BGE+Claude: 16.3 vs 13.7 = 2.6). The claims are reasonably supported."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper makes causal claims that AST-based chunking improves performance. The ablation studies (Section 4) provide controlled single-variable manipulations: split-only vs split-then-merge, varying chunk sizes, and varying context lengths. These constitute adequate ablation designs for the causal claims."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title claims 'Enhancing Code Retrieval-Augmented Generation' broadly, but results are limited to three specific benchmarks (RepoEval, CrossCodeEval, SWE-bench Lite) with specific retrievers and generators. The abstract and conclusion make broad claims about 'scaling retrieval-enhanced code intelligence' without bounding to the tested settings."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for the observed improvements. For instance, the score mapping technique (Appendix A.2) could introduce systematic bias, and the distributional differences between AST-based and fixed-size chunks could confound comparisons. Neither is discussed."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper specifies 'claude-3.7-sonnet' and 'gemini-2.5-pro-0325' (Section 3.1). While Gemini has a snapshot date, Claude 3.7 Sonnet does not have a snapshot date or API version. Open models are specified by name and size (StarCoder2-7B, CodeLlama-7B-Python) but without version hashes."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper does not provide the actual prompts used to query the generation models. It describes the pipeline at a high level but the specific prompt text or templates used for code generation are not included."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Appendix A.1 reports temperature (t=0.2 for Pass@1, t=0.8 for Pass@8), top-p (0.95), number of samples (1 for Pass@1, 8 for Pass@8), max_context_length (4000 for RepoEval/SWE-bench, 10000 for CrossCodeEval), and max_chunk_size (2000)."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not use agentic scaffolding. CAST is a preprocessing (chunking) step in a standard RAG pipeline, not an agentic system."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The AST-based chunking algorithm is documented in detail (Section 2, Algorithm 1). The score mapping technique for metric comparability is described in Appendix A.2. The chunk size metric (non-whitespace characters rather than lines) is explained in Section 2."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "There is a dedicated 'Limitations' section after the conclusion, discussing contextual awareness, multi-view of code, and inner execution dynamics."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "The Limitations section discusses future directions (multi-level information, multiple code views, execution traces) rather than specific threats to validity of the current study. It does not address whether the improvements could be due to confounds, the fairness of the score mapping technique, or the generalizability concerns."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The Limitations section discusses future directions but does not explicitly state what the results do NOT show. It does not bound the claims to the specific benchmarks, languages, or models tested."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "While the benchmarks are publicly available, the paper does not release the raw experimental outputs (e.g., generated code, retrieval results, per-instance scores) for independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The data comes from established public benchmarks (RepoEval, CrossCodeEval, SWE-bench Lite), all described with citations in Section 3.1. The selection of SWE-bench Lite (300-problem subset where each issue is solvable by editing a single file) is documented."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are involved. Data sources are standard public benchmarks."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline is documented: source code is parsed into AST, chunked via the split-then-merge algorithm (Algorithm 1), indexed, retrieved with various retrievers, and fed to generators. The score mapping technique for metric alignment is also documented (Appendix A.2)."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The Acknowledgments section lists funding: ONR Award N000142312840, OpenAI Research Credit program, Amazon AI Research Gift Fund, and Gemma Academic Program GCP Credit Award."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: Carnegie Mellon University (5 authors) and Augment Code (1 author, Jiayi Wei). The affiliation with Augment Code, a code AI company, is disclosed."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "OpenAI, Amazon, and Google (Gemma program) are all companies with products in the code generation and RAG space. While the funding appears to be unrestricted grants/credits, these companies have financial interest in advancing code RAG technology. Additionally, one author is affiliated with Augment Code, a code AI company."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is included. One author (Jiayi Wei) is affiliated with Augment Code, a code AI company that could benefit from improved code RAG techniques, but no financial interest disclosure is provided."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper evaluates pre-trained models (StarCoder2-7B, CodeLlama-7B, Claude 3.7 Sonnet, Gemini 2.5 Pro) on benchmarks but does not state the training data cutoff dates for any of these models."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether the benchmark data (RepoEval, CrossCodeEval, SWE-bench) could have been in the training data of the models used. This is particularly relevant for Claude and Gemini, which are large closed-source models."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "RepoEval and SWE-bench were published in 2023/2024, before the likely training cutoffs of Claude 3.7 and Gemini 2.5. The paper does not discuss this contamination risk. However, since CAST is compared against fixed-size chunking using the same models, contamination would affect both conditions equally, partially mitigating (but not eliminating) the concern."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants are involved in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "The paper does not report API costs for Claude 3.7 Sonnet or Gemini 2.5 Pro, nor wall-clock time for chunking or generation. Given that CAST involves AST parsing as an additional step, the computational overhead vs. fixed-size chunking is not quantified."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "While the hardware is mentioned (8 A100 GPUs, 8 A6000 GPUs), the total compute budget (GPU hours, API costs, total experiment time) is not stated."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CAST boosts Recall@5 by 4.3 points on RepoEval retrieval.",
    286       "evidence": "Table 1 shows GIST-base retriever achieves Recall@5 of 75.0 with CAST vs 70.7 with fixed-size chunking, a 4.3 point gain.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "CAST improves Pass@1 by 2.67 points on SWE-bench generation.",
    291       "evidence": "Table 1 shows BGE+Claude achieves Pass@1 of 16.3 with CAST vs 13.7 with fixed-size chunking (2.6 points) and CodeSage+Claude at 16.7 vs 14.0 (2.7 points).",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "The merging step in CAST is necessary for good performance.",
    296       "evidence": "Table 2 shows split-only approach significantly degrades performance across all retrievers. For BGE-base, nDCG drops from 71.1 to 53.5 and Pass@1 from 51.7 to 48.3.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "CAST provides cross-language consistency with up to 4.3 points gain on CrossCodeEval.",
    301       "evidence": "Table 1 shows CodeSage retriever achieves Identifier Match EM of 39.9 with CAST vs 36.3 with fixed-size, a 3.6 point gain. Table 8 shows per-language breakdowns with consistent gains, with largest gains on TypeScript.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Higher precision in retrieval tends to convert into better generation performance.",
    306       "evidence": "Section 3.2 discusses this correlation across Tables 1-8, noting recall and nDCG correlate weakly with downstream quality.",
    307       "supported": "weak"
    308     },
    309     {
    310       "claim": "StarCoder2-7B sees an average of 5.5 points gain on RepoEval.",
    311       "evidence": "Table 6 shows Pass@1 improvements across retrievers for StarCoder2: BGE-base (51.7 vs 47.5 = 4.2), BGE-large (48.8 vs 45.8 = 3.0), GIST-base (57.9 vs 51.2 = 6.7), GIST-large (61.7 vs 59.2 = 2.5), CodeSage (73.2 vs 67.6 = 5.6), Jina (80.7 vs 75.1 = 5.6). Average is approximately 4.6 points, not 5.5.",
    312       "supported": "weak"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval"],
    316   "key_findings": "CAST proposes AST-based chunking for code RAG pipelines, using a recursive split-then-merge algorithm that respects syntactic boundaries. The method consistently improves retrieval metrics (1-4 points in Precision and Recall) and generation metrics (2-6 points in Pass@1) across three benchmarks (RepoEval, CrossCodeEval, SWE-bench Lite) compared to fixed-size line-based chunking. Ablation studies show the merging step is critical, and performance is sensitive to chunk size and context length parameters. The approach is language-agnostic, showing gains across Python, Java, C#, and TypeScript.",
    317   "red_flags": [
    318     {
    319       "flag": "Single weak baseline",
    320       "detail": "The only baseline is naive fixed-size line-based chunking. No comparison against other structure-aware chunking methods (e.g., CodeGRAG, which is mentioned in related work) or semantic chunking approaches. This makes it impossible to assess whether CAST specifically is good or whether any structure-aware approach would yield similar gains."
    321     },
    322     {
    323       "flag": "No uncertainty quantification",
    324       "detail": "All results are reported as point estimates without confidence intervals, error bars, or variance across runs. For Pass@1 with temperature 0.2, a single sample is used, making results sensitive to randomness. No significance tests are performed despite comparative claims."
    325     },
    326     {
    327       "flag": "Score mapping technique may introduce bias",
    328       "detail": "Appendix A.2 describes a score mapping technique to make retrieval scores comparable across different chunk distributions. This non-standard technique could systematically favor one method, but its impact is not analyzed or validated."
    329     },
    330     {
    331       "flag": "Claimed average gain does not match data",
    332       "detail": "Section 1 claims 'StarCoder2-7B sees an average of 5.5 points gain on RepoEval' but computing from Table 6 yields approximately 4.6 points average gain across retrievers."
    333     },
    334     {
    335       "flag": "Potential funder conflict not acknowledged",
    336       "detail": "One author is affiliated with Augment Code (a code AI company), and funding comes from OpenAI, Amazon, and Google — all with commercial interests in code generation technology. No competing interests statement is provided."
    337     }
    338   ],
    339   "cited_papers": [
    340     {
    341       "title": "SWE-bench: Can language models resolve real-world github issues?",
    342       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R Narasimhan"],
    343       "year": 2024,
    344       "relevance": "Key benchmark for evaluating LLMs on real-world software engineering tasks, used as one of three main evaluation datasets."
    345     },
    346     {
    347       "title": "CodeRAG-Bench: Can retrieval augment code generation?",
    348       "authors": ["Zora Zhiruo Wang", "Akari Asai", "Xinyan Yu", "Frank F. Xu", "Yiqing Xie", "Graham Neubig", "Daniel Fried"],
    349       "year": 2024,
    350       "arxiv_id": "2406.14497",
    351       "relevance": "Benchmark framework for evaluating retrieval-augmented code generation, directly relevant to this survey's scope on code generation methodology."
    352     },
    353     {
    354       "title": "Agentless: Demystifying llm-based software engineering agents",
    355       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"],
    356       "year": 2024,
    357       "arxiv_id": "2407.01489",
    358       "relevance": "Proposes a non-agentic approach to LLM-based software engineering, directly relevant to comparing agentic vs non-agentic approaches."
    359     },
    360     {
    361       "title": "Evaluating large language models trained on code",
    362       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    363       "year": 2021,
    364       "arxiv_id": "2107.03374",
    365       "relevance": "Introduces HumanEval and the Pass@k metric for evaluating code generation, foundational to the evaluation methodology used in this and many other papers."
    366     },
    367     {
    368       "title": "Starcoder 2 and the stack v2: The next generation",
    369       "authors": ["Anton Lozhkov", "Raymond Li", "Loubna Ben Allal"],
    370       "year": 2024,
    371       "arxiv_id": "2402.19173",
    372       "relevance": "Open-source code LLM used as a primary generation model in the experiments, relevant to understanding code LLM capabilities."
    373     },
    374     {
    375       "title": "Code llama: Open foundation models for code",
    376       "authors": ["Baptiste Roziere", "Jonas Gehring", "Fabian Gloeckle"],
    377       "year": 2023,
    378       "arxiv_id": "2308.12950",
    379       "relevance": "Open-source code LLM used as a generation model in the experiments, relevant to the survey's coverage of code generation models."
    380     },
    381     {
    382       "title": "An empirical study on llm-based agents for automated bug fixing",
    383       "authors": ["Xiangxin Meng", "Zexiong Ma", "Pengfei Gao", "Chao Peng"],
    384       "year": 2024,
    385       "arxiv_id": "2411.10213",
    386       "relevance": "Empirical study on LLM-based agents for software engineering tasks, directly relevant to the survey scope."
    387     },
    388     {
    389       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    390       "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"],
    391       "year": 2020,
    392       "relevance": "Foundational RAG paper that established the paradigm used in code generation, relevant to understanding the RAG methodology evaluated in surveyed papers."
    393     },
    394     {
    395       "title": "CodeGRAG: Bridging the gap between natural language and programming language via graphical retrieval augmented generation",
    396       "authors": ["Kounianhua Du", "Jizheng Chen", "Renting Rui"],
    397       "year": 2025,
    398       "arxiv_id": "2405.02355",
    399       "relevance": "Uses graph structure of code for RAG, directly relevant alternative approach to structure-aware code retrieval."
    400     },
    401     {
    402       "title": "RepoCoder: Repository-level code completion through iterative retrieval and generation",
    403       "authors": ["Fengji Zhang", "Bei Chen", "Yue Zhang", "Jacky Keung"],
    404       "year": 2023,
    405       "relevance": "Repository-level code completion using retrieval, relevant to evaluating RAG-based code generation approaches."
    406     }
    407   ]
    408 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs