scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31412B)
      1 {
      2   "paper": {
      3     "title": "An Empirical Study of Retrieval-Augmented Code Generation: Challenges and Opportunities",
      4     "authors": [
      5       "Zezhou Yang",
      6       "Sirong Chen",
      7       "Cuiyun Gao",
      8       "Zhenhao Li",
      9       "Xing Hu",
     10       "Kui Liu",
     11       "Xin Xia"
     12     ],
     13     "year": 2025,
     14     "venue": "ACM Transactions on Software Engineering and Methodology",
     15     "arxiv_id": "2501.13742",
     16     "doi": "10.1145/3717061"
     17   },
     18   "scan_version": 3,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval"],
     21   "key_findings": "Retrieval-augmented framework (RAF) consistently improves code generation performance across CodeGen, UniXcoder, and CodeT5 on CONCODE, CoNaLa, and HearthStone datasets. BM25, despite being the simplest retrieval technique requiring no training, outperforms more complex neural retrieval methods including CodeBERT, UniXcoder, and CoCoSoDa on most configurations. Sequential Integration Fusion is the most cost-effective fusion strategy, while Sketch Filling Fusion achieves the best performance at 2-7x training cost. The framework also benefits LLMs (ChatGLM3, CodeLlama, DeepSeek-Coder) when used via prompt engineering at inference time.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper provides a code repository at https://github.com/watreyoung/RACG (footnote 4) and retrieval-augmented datasets at a Google Drive link (footnote 3)."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The paper uses publicly available datasets (CONCODE, CoNaLa, HearthStone) and additionally releases retrieval-augmented datasets via Google Drive (footnote 3). The standard benchmarks are all publicly accessible."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper mentions PyTorch, Huggingface, and hardware (Intel Xeon Platinum 8276, NVIDIA A100 80G), but does not provide a requirements.txt, Dockerfile, or specific library versions. Section 4.4 states only the framework and hardware without sufficient detail to recreate the environment."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "While code and data are released, the paper does not include step-by-step reproduction instructions. Section 4.4 says 'all the hyper-parameter settings of pre-trained models are the same as the original corresponding papers' without providing specific reproduction steps."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "All results tables (Tables 3-8) report only point estimates. No confidence intervals, error bars, or ± notation appear anywhere in the paper."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 5.2 states: 'We perform a statistical significance test (t-test), and the results show that the models with the RAF outperform the original models at the significance level at 0.05 (p-value 0.035).'"
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "The paper reports percentage improvements with baseline context throughout (e.g., 'an average improvement of 41.60% in the EM metric' in Section 5.2) and Tables 4 and 6 include percentage improvements in parentheses alongside absolute values."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No justification is given for the choice of three models, three datasets, or the specific dataset sizes. No power analysis is discussed. The paper states datasets are 'well recognized' and 'widely-used' but does not justify sample adequacy."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No variance, standard deviation, or spread measures are reported in any table. All results appear to be from single experimental runs without reporting result stability across seeds or runs."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Table 3 compares retrieval-augmented models against base models (CodeGen, UniXcoder, CodeT5 fine-tuned on original datasets). Table 4 compares five different retrieval techniques against each other and the baseline."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The pre-trained models (CodeGen, UniXcoder, CodeT5) and LLMs (ChatGLM3-6B, CodeLlama-7B, DeepSeek-Coder-6.7B) were contemporary at the time of writing. CoCoSoDa is described as achieving 'state-of-the-art performance' for code search."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The study systematically varies retrieval techniques (RQ2), fusion strategies (RQ3), number of retrieved snippets (Section 5.4.1), and ordering of snippets (Table 8), effectively ablating different components of the retrieval-augmented framework."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Five metrics are used: Exact Match (EM), BLEU-4, Edit Distance (ED), SimilarityAST, and CodeBLEU, as described in Section 4.3."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No human evaluation is performed. All evaluation is automated using the five metrics. The case studies in Section 6.2 are qualitative illustrations by the authors, not systematic human evaluation."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 4.1 describes separate train/test/validation splits for all three datasets. CONCODE has 100k/2k/2k, CoNaLa has 2179/500/200, HearthStone has 533/66/66. Results are reported on test sets."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Results are broken down by dataset (CONCODE, CoNaLa, HearthStone), by model, by retrieval technique (Table 4), and by fusion strategy (Table 5). This provides detailed per-category analysis."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 5.3 discusses RetroMAE's performance degradation (7.74% and 1.32% BLEU drops on CONCODE). Section 5.4.2 discusses VDF's decreased performance compared to SEF. Section 6.2.2 analyzes why UniXcoder retrieves suboptimal results."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "RetroMAE decreases CodeGen performance by 7.74% BLEU on CONCODE and causes an 81.33% drop on HearthStone (Table 4). VDF performs worse than SEF across all datasets (Table 5). CoCoSoDa decreases performance on HearthStone for CodeGen."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The abstract's claims about RAF improving performance, BM25's effectiveness, SIF's convenience, and SFF's further improvement are all supported by Tables 3-8 and the analysis in Sections 5.2-5.4."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The causal claims (RAF improves performance) are supported by controlled comparisons: same models with and without retrieval augmentation, same models with different retrieval techniques, same models with different fusion strategies. The single-variable manipulation design is adequate."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "Finding 2 states 'BM25 is proven to be the most effective retrieval technique for code generation' — an overly broad claim from 3 datasets and 3 models. The title claims 'Code Generation' generally while testing only Java and Python. The threats section acknowledges uncertainty about larger models but the findings themselves are stated without bounds."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "While the paper discusses why RetroMAE underperforms (NLP vs code domain gap) and why VDF fails (encoder/decoder limitations), it does not discuss alternative explanations for the main findings, such as whether improvements come from data augmentation effects rather than retrieval quality, or whether memorization of training examples explains the gains."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper measures BLEU, CodeBLEU, and EM but frames results as 'code generation performance' without discussing the gap between n-gram matching metrics and actual code correctness or functional correctness. The paper acknowledges pass@k datasets exist (Section 2.2) but does not use them or discuss why proxy metrics may be insufficient."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Table 2 specifies model names with parameter counts: CodeGen-MONO (350M), UniXcoder (126M), CodeT5 (223M). LLMs are specified as ChatGLM3-6B, CodeLlama-7B, DeepSeek-Coder-6.7B. Pre-training data sources are listed."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "For LLM experiments in Section 6.1, the paper states 'The prompts are constructed following [43]' without providing the actual prompt text. For fine-tuned models, the input format is described conceptually but actual prompts/templates are not reproduced."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "Section 4.4 states 'all the hyper-parameter settings of pre-trained models are the same as the original corresponding papers' without listing specific values. No learning rate, batch size, epoch count, or temperature settings are reported. BM25 parameters b and k are mentioned but values are not given."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No agentic scaffolding is used. The paper studies a retrieval-augmented framework with standard fine-tuning and prompting, not agentic workflows."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 4.1 describes dataset organization, splits, and how the CoNaLa validation set was created (200 random instances from training). Section 4.4 describes data format ('organized in the form of <Natural Language Description, Code Snippets> and stored in JSON files'). The retrieval database construction is described in Section 3.2."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 6.4 'Threats to Validity' contains three substantive subsections: generalization of model results, replication of experiments, and limited dataset."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The threats are specific: (1) uncertainty about applicability to larger models or different architectures, (2) device and parameter setting effects on replication, (3) CONCODE's preprocessing makes it unintuitive for humans — 'it is difficult for human developers to write code the same as the ground truth.'"
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 6.4 states findings may not apply to 'larger models or models with differing architectures' and acknowledges 'a distinct gap between the data within these datasets and the real, specific development environment context.' These are specific scope limitations."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Retrieval-augmented datasets are released via Google Drive (footnote 3) and code is available at the GitHub repository (footnote 4). The underlying benchmarks (CONCODE, CoNaLa, HearthStone) are publicly available."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 4.1 describes each dataset's origin and construction: CONCODE from ~33k Java GitHub projects, CoNaLa from 2,879 Stack Overflow annotations, HearthStone from 665 card implementations. Table 1 provides detailed statistics."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants. Data sources are standard public benchmarks."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The three-phase pipeline is clearly documented: Retrieval Phase retrieves top-k code snippets (Section 3.2), Fusion Phase integrates them with input (Section 3.3), Generation Phase produces code (Section 3.4). Section 4.4 describes the learning process. Dataset splits and validation creation are documented."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding acknowledgment or grant information is mentioned anywhere in the paper text. One author is from Huawei Technologies but no funding disclosure accompanies this."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are clearly listed: Harbin Institute of Technology (Shenzhen), Concordia University, Zhejiang University, and Huawei Technologies Co., Ltd."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No funding source is disclosed, so independence cannot be assessed. One author is from Huawei Technologies, which has commercial interest in code generation (PanGu-Coder is cited), creating a potential conflict."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests statement or financial interest declaration is present in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "Training data sources are mentioned (The Pile, BigQuery, BigPython for CodeGen; CodeSearchNet for UniXcoder and CodeT5) but no training data cutoff dates are provided for any model."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No discussion of whether benchmark data (CONCODE, CoNaLa, HearthStone) could appear in the pre-training corpora of the evaluated models. The models are pre-trained on large code corpora that could contain these benchmarks."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "CONCODE (2018), CoNaLa (2018), and HearthStone (2016) were all published years before the pre-trained models were trained. Models trained on GitHub/StackOverflow data could easily contain these benchmarks, but this contamination risk is not addressed."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study. It is a benchmark evaluation of pre-trained models."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this study."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": true,
    292         "justification": "Table 5 reports inference costs (in seconds) for each fusion strategy across all three datasets. Table 7 reports retrieval costs per 50 instances for each retrieval technique. Section 6.1.2 discusses cost trade-offs."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": true,
    297         "justification": "Section 4.4 specifies hardware (two Intel Xeon Platinum 8276 CPUs, two NVIDIA A100 80G). Table 5 reports total training time (e.g., 128-923 min on CONCODE). Table 7 reports per-epoch training costs and retrieval costs."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single experimental runs."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The number of experimental runs is not stated anywhere. It is unclear whether results are from single runs or averaged over multiple runs."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No hyperparameter search budget is reported. The paper defers to original papers for hyperparameter settings without indicating whether any search was performed for the retrieval-augmented configurations."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": true,
    319         "justification": "The selection of k=5 retrieved snippets is justified by experiments in Section 5.4.1 showing performance across k=1,3,5,7,10. The choice of ascending ordering is justified by Table 8. Model selections reference prior empirical comparisons."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The paper performs a single t-test (p-value 0.035) but runs many comparisons across 3 models × 3 datasets × 5 retrieval techniques × 5 metrics without any correction for multiple comparisons."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The authors implement and evaluate all retrieval techniques and fusion strategies themselves without acknowledging the bias of evaluating their own implementations. No independent evaluation or acknowledgment of author-evaluation bias."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": true,
    334         "justification": "Table 5 directly reports both performance metrics and training/inference costs for each fusion strategy side by side. Section 6.1.2 explicitly discusses the 'trade-off between performance improvement and computational costs.'"
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": true,
    339         "justification": "Section 6.4 acknowledges that CONCODE's 'pre-processing makes human hard to understand the code intuitively' and that 'it is difficult for human developers to write code the same as the ground truth,' questioning whether these benchmarks reflect real-world development."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": false,
    343         "answer": false,
    344         "justification": "No agentic scaffolding is involved. The study evaluates fine-tuned models and prompted LLMs directly without scaffolding."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "Not addressed. The benchmarks (2016-2018) were created before the pre-trained models' training periods, meaning solutions could have been in training data. This temporal leakage risk is not discussed."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "Not discussed. The retrieval database is the training set, meaning the model has already been fine-tuned on the same data it retrieves from. The paper does not discuss whether this creates a feature leakage concern."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "Only partially addressed for CONCODE, which uses 'repository-based partitioning' to separate domains. No independence analysis is performed for CoNaLa or HearthStone, and no systematic discussion of non-independence is provided."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines are used."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Retrieval-augmented framework universally improves code generation performance across different pre-trained models and datasets.",
    373       "evidence": "Table 3 shows average improvements of 6.79% EM, 11.45% BLEU, 6.93% SimilarityAST, and 8.72% CodeBLEU on CONCODE; 3.74%, 18.42%, 15.51%, and 16.75% on CoNaLa; 41.60%, 9.01%, 11.25%, and 8.69% on HearthStone (Section 5.2). Statistical significance test yields p-value 0.035.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "BM25 is the most effective retrieval technique for the retrieval-augmented framework, outperforming neural methods.",
    378       "evidence": "Table 4 shows all three models achieve highest gains with BM25 on CONCODE and HearthStone. On CoNaLa, BM25 is optimal for CodeT5 (25.69% BLEU improvement) and suboptimal for CodeGen (Section 5.3). CoCoSoDa sometimes matches BM25 on CoNaLa.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Sequential Integration Fusion is the most recommended fusion strategy balancing cost and performance.",
    383       "evidence": "Table 5 shows SIF training time is the shortest (285 min vs 923 min for SEF on CONCODE). SIF achieves competitive or best performance on CoNaLa. On HearthStone, SFF outperforms SIF but costs 107 min vs 60 min (Section 5.4.2).",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Sketch Filling Fusion achieves the highest performance improvement among fusion strategies.",
    388       "evidence": "Table 5 shows SFF yields highest BLEU on CONCODE (40.84) and HearthStone (81.89). Section 5.4.2 reports average 14.83% BLEU and 8.05% CodeBLEU improvement for SFF based on SEF. However, SFF performs worse than SIF on CoNaLa.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "RAF is effective for LLMs in code generation during inference without fine-tuning.",
    393       "evidence": "Table 6 shows substantial improvements: ChatGLM BLEU ratio of 198.67 on HearthStone with BM25, CodeLlama BLEU ratio of 147.00 on HearthStone with BM25. All three LLMs improve across all datasets and retrieval techniques (Section 6.1).",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "More complex retrieval techniques do not necessarily lead to better code generation results.",
    398       "evidence": "Table 4 shows RetroMAE (a neural retrieval model) decreases CodeGen's BLEU by 7.74% on CONCODE and causes an 81.33% drop on HearthStone. BM25, requiring no training, outperforms trained code search models in most configurations (Section 5.3).",
    399       "supported": "strong"
    400     }
    401   ],
    402   "red_flags": [
    403     {
    404       "flag": "No variance or multiple seeds reported",
    405       "detail": "All results appear to be from single experimental runs. No standard deviations, confidence intervals, or seed sensitivity analyses are reported despite deep learning results being highly sensitive to random initialization (Henderson et al., 2018)."
    406     },
    407     {
    408       "flag": "No contamination analysis",
    409       "detail": "The benchmarks (CONCODE 2018, CoNaLa 2018, HearthStone 2016) predate all evaluated models' training periods. Models trained on GitHub/StackOverflow code could contain these benchmark solutions, but no contamination analysis is performed."
    410     },
    411     {
    412       "flag": "Overclaiming from limited scope",
    413       "detail": "Finding 2 states 'BM25 is proven to be the most effective retrieval technique for code generation' from only 3 datasets (2 languages) and 3 models. The use of 'proven' overstates the evidence. The title implies general code generation but tests are limited to Java and Python."
    414     },
    415     {
    416       "flag": "No human evaluation of generated code",
    417       "detail": "All evaluation relies on automated metrics (BLEU, CodeBLEU, EM) which measure surface-level similarity. No functional correctness evaluation (pass@k) or human assessment of code quality is performed despite the paper framing results as 'code generation performance.'"
    418     },
    419     {
    420       "flag": "Huawei affiliation without conflict disclosure",
    421       "detail": "One author (Kui Liu) is from Huawei Technologies, which has commercial code generation products (PanGu-Coder). No funding disclosure or competing interests statement is provided."
    422     }
    423   ],
    424   "cited_papers": [
    425     {
    426       "title": "Evaluating large language models trained on code",
    427       "authors": ["Mark Chen", "Jerry Tworek"],
    428       "year": 2021,
    429       "arxiv_id": "2107.03374",
    430       "relevance": "Introduces Codex and HumanEval benchmark, foundational to LLM-based code generation evaluation."
    431     },
    432     {
    433       "title": "CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis",
    434       "authors": ["Erik Nijkamp", "Bo Pang"],
    435       "year": 2023,
    436       "relevance": "One of three pre-trained models evaluated in this study; pioneered conversational code generation."
    437     },
    438     {
    439       "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation",
    440       "authors": ["Yue Wang", "Weishi Wang"],
    441       "year": 2021,
    442       "relevance": "One of three pre-trained models evaluated; unified encoder-decoder architecture for code tasks."
    443     },
    444     {
    445       "title": "Retrieval Augmented Code Generation and Summarization",
    446       "authors": ["Md. Rizwan Parvez", "Wasi Uddin Ahmad"],
    447       "year": 2021,
    448       "relevance": "REDCODER: key prior work on retrieval-augmented code generation enhancing both generation and summarization."
    449     },
    450     {
    451       "title": "Skcoder: A sketch-based approach for automatic code generation",
    452       "authors": ["Jia Li", "Yongmin Li"],
    453       "year": 2023,
    454       "arxiv_id": "2302.06144",
    455       "relevance": "Introduces Sketch Filling Fusion, one of the four fusion strategies evaluated in this study."
    456     },
    457     {
    458       "title": "DocPrompting: Generating Code by Retrieving the Docs",
    459       "authors": ["Shuyan Zhou", "Uri Alon"],
    460       "year": 2023,
    461       "relevance": "Retrieval-augmented code generation through documentation retrieval; alternative RAG approach for code."
    462     },
    463     {
    464       "title": "CodeRL: Mastering Code Generation through Pretrained Models and Deep Reinforcement Learning",
    465       "authors": ["Hung Le", "Yue Wang"],
    466       "year": 2022,
    467       "relevance": "Uses RL to improve code generation from pre-trained models; alternative approach to improving code quality."
    468     },
    469     {
    470       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming - The Rise of Code Intelligence",
    471       "authors": ["Daya Guo", "Qihao Zhu"],
    472       "year": 2024,
    473       "arxiv_id": "2401.14196",
    474       "relevance": "One of three LLMs used in the RAF extension experiments; represents modern open-source code LLMs."
    475     },
    476     {
    477       "title": "Code llama: Open foundation models for code",
    478       "authors": ["Baptiste Rozière", "Jonas Gehring"],
    479       "year": 2023,
    480       "arxiv_id": "2308.12950",
    481       "relevance": "One of three LLMs used in the RAF extension experiments; Meta's open code generation model."
    482     },
    483     {
    484       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    485       "authors": ["Patrick S. H. Lewis", "Ethan Perez"],
    486       "year": 2020,
    487       "relevance": "Foundational RAG paper defining the retriever-and-generator framework that this work extends to code generation."
    488     },
    489     {
    490       "title": "An Empirical Comparison of Pre-Trained Models of Source Code",
    491       "authors": ["Changan Niu", "Chuanyi Li"],
    492       "year": 2023,
    493       "relevance": "Empirical comparison of pre-trained code models that informed model selection in this study."
    494     },
    495     {
    496       "title": "Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection",
    497       "authors": ["Akari Asai", "Zeqiu Wu"],
    498       "year": 2024,
    499       "relevance": "Active retrieval approach for RAG, cited as future work direction for retrieval-augmented code generation."
    500     },
    501     {
    502       "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages",
    503       "authors": ["Zhangyin Feng", "Daya Guo"],
    504       "year": 2020,
    505       "relevance": "First bimodal pre-trained model for code; used as both a retrieval technique and baseline in this study."
    506     }
    507   ],
    508   "engagement_factors": {
    509     "practical_relevance": {
    510       "score": 2,
    511       "justification": "Practitioners building RAG systems for code generation can directly apply the recommendation to use BM25 + Sequential Integration Fusion."
    512     },
    513     "surprise_contrarian": {
    514       "score": 1,
    515       "justification": "The finding that simple BM25 outperforms neural retrieval models is mildly surprising but aligns with known patterns in information retrieval."
    516     },
    517     "fear_safety": {
    518       "score": 0,
    519       "justification": "No safety, security, or risk concerns are raised."
    520     },
    521     "drama_conflict": {
    522       "score": 0,
    523       "justification": "Straightforward empirical study with no controversy or conflict."
    524     },
    525     "demo_ability": {
    526       "score": 1,
    527       "justification": "Code repository is available on GitHub but requires substantial setup with GPU hardware and multiple models to replicate."
    528     },
    529     "brand_recognition": {
    530       "score": 0,
    531       "justification": "Authors from academic institutions and Huawei; not associated with high-profile AI labs."
    532     }
    533   }
    534 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs