scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26723B)
      1 {
      2   "paper": {
      3     "title": "CODERAG-BENCH: Can Retrieval Augment Code Generation?",
      4     "authors": [
      5       "Zora Zhiruo Wang",
      6       "Akari Asai",
      7       "Xinyan Velocity Yu",
      8       "Frank F. Xu",
      9       "Yiqing Xie",
     10       "Graham Neubig",
     11       "Daniel Fried"
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv",
     15     "arxiv_id": "2406.14497"
     16   },
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The paper provides a project website (https://code-rag-bench.github.io/) and mentions releasing a codebase for reproducible evaluations. The abstract and introduction explicitly state the benchmark provides a 'consistent interface for retrieval, augmented generation, and evaluation.'"
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The benchmark aggregates publicly available datasets (HumanEval, MBPP, LiveCodeBench, DS-1000, ODEX, RepoEval, SWE-bench-Lite, CodeSearchNet) and creates a diverse retrieval datastore from five public sources. The project website is provided for access."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The paper mentions using specific tools (pyserini, sentence-transformers) and hardware (A100 GPU with 80GB), but does not provide a requirements.txt, Dockerfile, or detailed environment specification listing library versions."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "While Appendix C discusses result reproduction differences and the paper describes experimental setups, the paper itself does not include step-by-step reproduction instructions or commands. Reproduction details may exist on the project website, but the paper text does not contain them."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "All results are reported as single point estimates (pass@1, NDCG@10) without confidence intervals, error bars, or any uncertainty quantification."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper makes numerous comparative claims (e.g., 'dense embedding models frequently surpass BM25', 'GPT-4o achieves a 27.4% gain') but reports no statistical significance tests to support these comparisons."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "The paper reports effect sizes in context, e.g., 'GPT-4o achieves a 27.4% gain on SWE-Bench and a 6.9% gain on the harder ODEX subset' and 'Jina-v2-code outperforms GIST-base and BGE-base by 7.4 and 6.6 average NDCG@10.' Baseline values and improvements are consistently provided together."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No justification is given for the number of examples in each dataset or why these particular dataset sizes are sufficient. For instance, LiveCodeBench uses 400 examples and ODEX-hard uses 29 examples from the 20 least common libraries, with no power analysis or size justification."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The paper generates only one response per example (temperature=0.2, sample one response) and reports single-run numbers. No standard deviation, variance, or multiple-run statistics are provided."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The paper includes extensive baselines: 10 retrieval models (BM25, various dense and proprietary retrievers) and 10 generation models, plus no-retrieval and gold-document baselines."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Baselines include contemporary models as of 2024: GPT-4o, DeepSeekCoder, StarCoder2, Llama3, CodeGemma, Claude-3 variants, Gemini-1.5 variants, and top-ranked MTEB retrievers like SFR-Embedding-Mistral."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper includes multiple ablation-style analyses: comparing different retrieval sources individually (Section 4, Tables 7-8), varying document counts (Appendix E, Figure 6), comparing chunking strategies (Table 9), and constructing the ODEX-hard subset to isolate library familiarity effects (Section 3.3, Appendix D)."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The paper uses multiple metrics: NDCG@10, Precision, and Recall for retrieval (Section 2.4), and pass@k for code generation. Different tasks use appropriate metrics (NDCG@10 for code retrieval, execution-based pass@1 for generation)."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "The paper relies entirely on automated metrics (execution-based pass@1 and NDCG@10). While manual analysis is mentioned for specific failure modes (Section 4, Appendix F), there is no systematic human evaluation of the generated code quality or relevance of retrieved documents beyond the canonical document annotation process."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "The paper uses established benchmark test sets (HumanEval, MBPP, DS-1000, ODEX, RepoEval, SWE-bench-Lite). LiveCodeBench is specifically included to address contamination concerns with problems collected after training cutoffs."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Results are broken down by task type (basic programming, open-domain, repository-level, code retrieval) in Tables 3, 5, and 6, and further by individual dataset within each category."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section 4 and Appendix F discuss failure modes: models copying functions from context, generating over-complicated programs, Claude models responding with explanations instead of code. Figure 5 illustrates distracting context examples. Appendix A.2 provides concrete examples of both helpful and distracting retrieved documents."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper reports several negative results: RACG does not improve DeepSeekCoder generations due to overcomplicated outputs (Section 3.4), GPTs show no gains on open-domain tasks with common libraries (Section 3.3), reranking within optimal range 'greatly degrades the results' (Section 4), and open retrieval sources are less useful than local code for repository-level tasks (Table 8)."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The abstract claims are supported: (1) 'retrieving high-quality contexts improves code generation' is supported by Table 5 gold vs. w/o results; (2) 'retrievers often struggle to fetch useful contexts' is supported by Table 3 low NDCG scores on open-domain and SWE-bench; (3) 'generators face limitations in using those contexts effectively' is supported by negative RACG results in Tables 6 and 11-13."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The paper's causal claims are primarily ablation-style: adding/removing retrieval context and measuring the effect. The controlled experimental design (same model with/without documents, gold vs. retrieved) adequately supports claims like 'retrieval significantly enhances performance.' The ODEX-hard experiment (Appendix D) provides controlled evidence for the library familiarity hypothesis."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The Limitations section explicitly bounds scope: 'we focus on coding tasks using Python programming language, but extrapolating to other languages may bring additional challenges.' The title asks 'Can Retrieval Augment Code Generation?' rather than asserting universal effectiveness. Footnote 1 also states the Python-only scope."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper discusses alternative explanations for key results: the library memorization hypothesis for why GPT models don't benefit from open-domain retrieval (Section 3.3, verified in Appendix D), the over-complication hypothesis for DeepSeekCoder's degradation with contexts (Section 3.4), and the context-copying problem for ODEX failures (Appendix F)."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper specifies 'gpt-3.5-turbo-0125' and 'gpt-4o' but 'gpt-4o' lacks a specific version/snapshot date. Other models are specified by name but not version: 'Claude-3-haiku', 'Claude-3-sonnet', 'Gemini-1.5-flash', 'Gemini-1.5-pro', 'Llama3-8B', 'Command-R'. Marketing names without snapshot dates do not count as specified versions per the schema."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "Appendix C states 'we use the same prompt for each dataset' and 'zero-shot prompts without any additional instructions,' but the actual prompt text is not provided in the paper or appendix. The paper describes what prompts do in natural language rather than providing the actual text."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Key hyperparameters are reported: temperature t=0.2, top_p=0.95 for generation (Section 3.1); BM25 parameters k1=1.2, b=0.75; batch sizes for encoding (64 for most models, 8 for SFR-Mistral); top-5 documents prepended; n=21 sampling for SWE-bench."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "The paper does not use agentic scaffolding. It evaluates a straightforward retrieve-then-generate pipeline without agent loops, tool use, or feedback mechanisms."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 2 describes the data curation process in detail: how retrieval sources were collected (from ClueWeb22, RedPajama-1T), how canonical documents were annotated (automatic parsing + manual verification), and how each dataset was integrated. Table 1 shows dataset sizes and Table 2 shows corpus statistics."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "The paper has a dedicated 'Limitations' section after the Conclusion (starting at the text marker 'Limitations') discussing task and language diversity, model coverage, and methodological scope."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "The Limitations section raises specific concerns: Python-only focus may not generalize to other languages, vanilla retrieval/generation methods used rather than advanced techniques, and results 'may not represent all model behaviors.' Appendix C specifically discusses reproduction variances for individual models (3-5 points lower for CodeGemma, few-shot vs zero-shot differences)."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The paper explicitly states scope boundaries: Python-only (footnote 1), focus on basic retrieval methods without advanced RACG techniques (Limitations section), exclusion of tasks like code debugging. The title frames the work as a question rather than a definitive answer."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The benchmark is built on publicly available datasets (HumanEval, MBPP, LiveCodeBench, DS-1000, ODEX, RepoEval, SWE-bench-Lite, CodeSearchNet) and the retrieval corpora are from public sources (ClueWeb22, RedPajama-1T, devdocs.io). The project website provides access."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 2 describes data collection in detail: programming problems sourced from existing datasets (Section 2.1), retrieval documents from five specific sources with sizes and average lengths (Section 2.2, Table 2), and canonical document annotation with automatic + manual verification (Section 2.3)."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants are involved. The paper uses standard public benchmarks and automated evaluation."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The data pipeline is documented across Section 2: dataset selection and integration (2.1), retrieval source collection with specific origins for each source (2.2), canonical document annotation process with automatic parsing followed by manual verification (2.3), and the evaluation pipeline (2.4)."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding sources are disclosed. The Acknowledgment section thanks individuals for discussions and feedback but does not mention any grants, funding agencies, or corporate sponsors."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are clearly listed: Carnegie Mellon University, University of Washington, and University of Southern California. The authors are from academic institutions, not from companies whose products are being evaluated."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding disclosure does not confirm the work is unfunded — the authors are at major research universities that typically have funding."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement is present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The paper does not state the training data cutoff dates for the models evaluated. It acknowledges contamination concerns ('it is unclear whether models suffer from data contamination on HumanEval and MBPP') and includes LiveCodeBench with 'problems collected from coding websites after the training cutoff of LMs that we consider,' but does not specify what those cutoff dates are."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "Section 2.1 explicitly discusses contamination risk: 'due to limited public knowledge about model training data, it is unclear whether models suffer from data contamination on HumanEval and MBPP.' LiveCodeBench is specifically included 'to decrease the risk of contamination' with problems after training cutoffs."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": true,
    238         "justification": "The paper proactively addresses benchmark contamination by including LiveCodeBench alongside potentially contaminated benchmarks (HumanEval published 2021, MBPP published 2021). Section 2.1 explicitly acknowledges the contamination risk and includes a contamination-free alternative."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants are involved in the study. All evaluation is automated."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants are involved. The study uses public benchmarks and automated evaluation."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants are involved in the study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants are involved in the study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants or experimental conditions requiring randomization."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants or subjective evaluation requiring blinding."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants are involved in the study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": true,
    282         "justification": "Table 4 reports encoding latency, search latency, model storage, and index storage for retrieval models. Section 3.2 discusses the efficiency-effectiveness tradeoff: SFR-Mistral requires '5x larger index storage and adds nearly 100x latency to encode documents' compared to GIST-base."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "While the paper mentions using an A100 GPU (Appendix B), total computational budget (GPU hours, total API spend for proprietary model calls across all experiments) is not stated. The paper runs extensive experiments across 10 retrievers and 10 generators on 8 datasets, but the total cost is never quantified."
    288       }
    289     }
    290   },
    291   "claims": [
    292     {
    293       "claim": "Retrieving high-quality (canonical) documents significantly improves code generation across tasks, with GPT-4o achieving a 27.4% gain on SWE-Bench and 6.9% gain on ODEX-hard.",
    294       "evidence": "Table 5 shows pass@1 improvements from w/o to gold across models and datasets. GPT-4o SWE-bench: 2.3 to 30.7 (Section 3.3). ODEX-hard: 20.7 to 27.6 (Section 3.3, Table 5).",
    295       "supported": "strong"
    296     },
    297     {
    298       "claim": "Dense embedding models frequently surpass BM25 on code retrieval tasks, contrary to findings in text-centric domains.",
    299       "evidence": "Table 3 shows retrieval NDCG@10, where models like SFR-Mistral (67.0 avg) and Jina-v2-code (65.4 avg) outperform BM25 (57.7 avg). Discussed in Section 3.2.",
    300       "supported": "strong"
    301     },
    302     {
    303       "claim": "Code-specific retrieval models outperform general-purpose models at similar parameter scales.",
    304       "evidence": "Section 3.2: 'Jina-v2-code outperforms GIST-base and BGE-base by 7.4 and 6.6 average NDCG@10, respectively, while Voyage-code significantly outperforms OpenAI-03.' Supported by Table 3 numbers.",
    305       "supported": "moderate"
    306     },
    307     {
    308       "claim": "RACG is particularly effective for less common libraries where models lack parametric knowledge.",
    309       "evidence": "The ODEX-hard subset (20 least common libraries) shows 20.3-40.1% improvement with retrieval (Appendix D, Table 10), compared to limited gains on the full ODEX set. Verified across multiple retrieval models.",
    310       "supported": "strong"
    311     },
    312     {
    313       "claim": "Pre-retrieval chunking achieves the highest scores compared to post-retrieval and reranking-based chunking strategies.",
    314       "evidence": "Table 9 shows pre-retrieval chunking outperforming full text, first chunk, and reranking on most sources for HumanEval with BM25.",
    315       "supported": "moderate"
    316     },
    317     {
    318       "claim": "Top-performing retrieval models do not always lead to the best end-to-end RACG outcomes.",
    319       "evidence": "Section 3.4 states this and Table 6 shows examples where the best retriever varies by task and generation model. For instance, OpenAI reranking performs best on RepoEval but not on basic programming tasks.",
    320       "supported": "moderate"
    321     },
    322     {
    323       "claim": "Models can be easily distracted by irrelevant retrieved contexts, sometimes degrading performance below the no-retrieval baseline.",
    324       "evidence": "Table 5 shows red-highlighted cells where gold < w/o. Table 6 shows DeepSeekCoder degradation on some tasks. Appendix F discusses Claude models copying context and generating overcomplicated code.",
    325       "supported": "strong"
    326     }
    327   ],
    328   "methodology_tags": [
    329     "benchmark-eval"
    330   ],
    331   "key_findings": "CODERAG-BENCH demonstrates that retrieval-augmented code generation (RACG) can substantially improve code generation when high-quality documents are retrieved, with gains of up to 27.4% on SWE-bench. However, current retrieval models struggle to find useful documents, especially for open-domain and repository-level tasks, and generation models often fail to effectively utilize retrieved contexts, sometimes degrading below no-retrieval baselines. Code-specific retrieval models outperform general-purpose ones at similar scales, and RACG is most effective for problems involving less common libraries where models lack parametric knowledge.",
    332   "red_flags": [
    333     {
    334       "flag": "No uncertainty quantification",
    335       "detail": "All results across Tables 3-13 are single point estimates without confidence intervals, error bars, or variance across runs, despite using temperature=0.2 sampling, which introduces stochasticity."
    336     },
    337     {
    338       "flag": "Single-sample generation",
    339       "detail": "The paper generates only one response per example (except SWE-bench with n=21 majority voting), making results sensitive to sampling randomness while reporting no variance."
    340     },
    341     {
    342       "flag": "No statistical significance tests",
    343       "detail": "Numerous comparative claims ('surpass', 'outperform', 'significantly improves') are made based on raw number comparisons without any significance testing."
    344     },
    345     {
    346       "flag": "Potential data leakage in HumanEval/MBPP evaluation",
    347       "detail": "The paper acknowledges HumanEval and MBPP contamination risk but still reports and draws conclusions from results on these datasets. While LiveCodeBench is included as a mitigation, many comparisons rely on the potentially contaminated benchmarks."
    348     }
    349   ],
    350   "cited_papers": [
    351     {
    352       "title": "Evaluating large language models trained on code",
    353       "authors": ["Mark Chen", "Jerry Tworek"],
    354       "year": 2021,
    355       "arxiv_id": "2107.03374",
    356       "relevance": "Introduced HumanEval benchmark and pass@k metric, foundational for code generation evaluation."
    357     },
    358     {
    359       "title": "SWE-bench: Can language models resolve real-world github issues?",
    360       "authors": ["Carlos E Jimenez", "John Yang"],
    361       "year": 2024,
    362       "relevance": "Major benchmark for repository-level code generation from GitHub issues, used as a key evaluation task."
    363     },
    364     {
    365       "title": "Agentless: Demystifying llm-based software engineering agents",
    366       "authors": ["Chunqiu Steven Xia", "Yinlin Deng"],
    367       "year": 2024,
    368       "arxiv_id": "2407.01489",
    369       "relevance": "Proposed sampling and majority-vote reranking strategy for SWE-bench, adopted in this paper's experimental setup."
    370     },
    371     {
    372       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    373       "authors": ["Naman Jain", "King Han"],
    374       "year": 2024,
    375       "arxiv_id": "2403.07974",
    376       "relevance": "Contamination-free code generation benchmark with problems collected after model training cutoffs."
    377     },
    378     {
    379       "title": "Docprompting: Generating code by retrieving the docs",
    380       "authors": ["Shuyan Zhou", "Uri Alon"],
    381       "year": 2023,
    382       "relevance": "Prior work on retrieval-augmented code generation using library documentation."
    383     },
    384     {
    385       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    386       "authors": ["John Yang", "Carlos E Jimenez"],
    387       "year": 2024,
    388       "arxiv_id": "2405.15793",
    389       "relevance": "Agent-based approach to software engineering using LLMs with custom computer interfaces."
    390     },
    391     {
    392       "title": "DeepSeek-Coder: When the large language model meets programming",
    393       "authors": ["Daya Guo", "Qihao Zhu"],
    394       "year": 2024,
    395       "arxiv_id": "2401.14196",
    396       "relevance": "Major open-source code LLM evaluated as a generation baseline in this benchmark."
    397     },
    398     {
    399       "title": "StarCoder: may the source be with you!",
    400       "authors": ["Raymond Li"],
    401       "year": 2023,
    402       "relevance": "Open code LLM series with reproducibility certification, used as baseline."
    403     },
    404     {
    405       "title": "RepoCoder: Repository-level code completion through iterative retrieval and generation",
    406       "authors": ["Fengji Zhang", "Bei Chen"],
    407       "year": 2023,
    408       "relevance": "Prior work on retrieval-augmented repository-level code completion."
    409     },
    410     {
    411       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    412       "authors": ["Jiawei Liu", "Chunqiu Steven Xia"],
    413       "year": 2023,
    414       "relevance": "Critical evaluation of LLM code generation quality and evaluation methodology."
    415     },
    416     {
    417       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    418       "authors": ["Patrick Lewis", "Ethan Perez"],
    419       "year": 2020,
    420       "relevance": "Foundational RAG paper establishing the retrieve-then-generate paradigm."
    421     },
    422     {
    423       "title": "ARKS: Active retrieval in knowledge soup for code generation",
    424       "authors": ["Hongjin Su"],
    425       "year": 2024,
    426       "arxiv_id": "2402.12317",
    427       "relevance": "Prior work on retrieval strategies specifically for code generation tasks."
    428     }
    429   ]
    430 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs