scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (33278B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "An Empirical Study of Retrieval-Augmented Code Generation: Challenges and Opportunities",
      6     "authors": [
      7       "Zezhou Yang",
      8       "Sirong Chen",
      9       "Cuiyun Gao",
     10       "Zhenhao Li",
     11       "Xing Hu",
     12       "Kui Liu",
     13       "Xin Xia"
     14     ],
     15     "year": 2025,
     16     "venue": "ACM Transactions on Software Engineering and Methodology",
     17     "arxiv_id": "2501.13742",
     18     "doi": "10.1145/3717061"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The abstract's claims about RAF improving performance, BM25's effectiveness, SIF's convenience, and SFF's further improvement are all supported by Tables 3-8 and the analysis in Sections 5.2-5.4.",
     26         "source": "opus"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The causal claims (RAF improves performance) are supported by controlled comparisons: same models with and without retrieval augmentation, same models with different retrieval techniques, same models with different fusion strategies. The single-variable manipulation design is adequate.",
     32         "source": "opus"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Finding 2 states 'BM25 is proven to be the most effective retrieval technique for code generation' — an overly broad claim from 3 datasets and 3 models. The title claims 'Code Generation' generally while testing only Java and Python. The threats section acknowledges uncertainty about larger models but the findings themselves are stated without bounds.",
     38         "source": "opus"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "While the paper discusses why RetroMAE underperforms (NLP vs code domain gap) and why VDF fails (encoder/decoder limitations), it does not discuss alternative explanations for the main findings, such as whether improvements come from data augmentation effects rather than retrieval quality, or whether memorization of training examples explains the gains.",
     44         "source": "opus"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper measures EM, BLEU, ED, SimilarityAST, and CodeBLEU but frames results as 'code generation performance' without discussing the gap between these surface-similarity metrics and functional correctness. The paper acknowledges pass@k datasets exist (Section 2.2) but does not use them or discuss why proxy metrics may be insufficient.",
     50         "source": "opus"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Section 6.4 'Threats to Validity' contains three substantive subsections: generalization of model results, replication of experiments, and limited dataset.",
     58         "source": "opus"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The threats are specific: (1) uncertainty about applicability to larger models or different architectures, (2) device and parameter setting effects on replication, (3) CONCODE's preprocessing makes it unintuitive for humans — 'it is difficult for human developers to write code the same as the ground truth.'",
     64         "source": "opus"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Section 6.4 states findings may not apply to 'larger models or models with differing architectures' and acknowledges 'a distinct gap between the data within these datasets and the real, specific development environment context.' These are specific scope limitations.",
     70         "source": "opus"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No funding acknowledgment or grant information is mentioned anywhere in the paper text. One author is from Huawei Technologies but no funding disclosure accompanies this.",
     78         "source": "opus"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Author affiliations are clearly listed: Harbin Institute of Technology (Shenzhen), Concordia University, Zhejiang University, and Huawei Technologies Co., Ltd.",
     84         "source": "opus"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No funding source is disclosed, so independence cannot be assessed. One author is from Huawei Technologies, which has commercial interest in code generation (PanGu-Coder is cited), creating a potential conflict.",
     90         "source": "opus"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests statement or financial interest declaration is present in the paper.",
     96         "source": "opus"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Code generation, retrieval-augmented framework (RAF), and its three phases (retrieval, fusion, generation) are all formally defined; retrieval techniques and fusion strategies are described with mathematical formulations.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 1 explicitly lists three contributions: first empirical study of RAF for code generation, exploration of retrieval technique and fusion strategy effects, and actionable implications for practitioners.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 2 situates the work relative to pre-trained code models, existing code generation methods, and retrieval-augmented generation literature; Section 3.2 contrasts this work's comprehensive multi-configuration approach against prior single-configuration studies.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": true,
    126           "justification": "The paper provides a code repository at https://github.com/watreyoung/RACG (footnote 4) and retrieval-augmented datasets at a Google Drive link (footnote 3).",
    127           "source": "opus"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "The paper uses publicly available datasets (CONCODE, CoNaLa, HearthStone) and additionally releases retrieval-augmented datasets via Google Drive (footnote 3). The standard benchmarks are all publicly accessible.",
    133           "source": "opus"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "The paper mentions PyTorch, Huggingface, and hardware (Intel Xeon Platinum 8276, NVIDIA A100 80G), but does not provide a requirements.txt, Dockerfile, or specific library versions. Section 4.4 states only the framework and hardware without sufficient detail to recreate the environment.",
    139           "source": "opus"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "While code and data are released, the paper does not include step-by-step reproduction instructions. Section 4.4 says 'all the hyper-parameter settings of pre-trained models are the same as the original corresponding papers' without providing specific reproduction steps.",
    145           "source": "opus"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "All results tables (Tables 3-8) report only point estimates. No confidence intervals, error bars, or ± notation appear anywhere in the paper.",
    153           "source": "opus"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": true,
    158           "justification": "Section 5.2 states: 'We perform a statistical significance test (t-test), and the results show that the models with the RAF outperform the original models at the significance level at 0.05 (p-value 0.035).'",
    159           "source": "opus"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "The paper reports percentage improvements with baseline context throughout (e.g., 'an average improvement of 41.60% in the EM metric' in Section 5.2) and Tables 4 and 6 include percentage improvements in parentheses alongside absolute values.",
    165           "source": "opus"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "No justification is given for the choice of three models, three datasets, or the specific dataset sizes. No power analysis is discussed. The paper states datasets are 'well recognized' and 'widely-used' but does not justify sample adequacy.",
    171           "source": "opus"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "No variance, standard deviation, or spread measures are reported in any table. All results appear to be from single experimental runs without reporting result stability across seeds or runs.",
    177           "source": "opus"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Table 3 compares retrieval-augmented models against base models (CodeGen, UniXcoder, CodeT5 fine-tuned on original datasets). Table 4 compares five different retrieval techniques against each other and the baseline.",
    185           "source": "opus"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "The pre-trained models (CodeGen, UniXcoder, CodeT5) and LLMs (ChatGLM3-6B, CodeLlama-7B, DeepSeek-Coder-6.7B) were contemporary at the time of writing. CoCoSoDa is described as achieving 'state-of-the-art performance' for code search.",
    191           "source": "opus"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "The study systematically varies retrieval techniques (RQ2), fusion strategies (RQ3), number of retrieved snippets (Section 5.4.1), and ordering of snippets (Table 8), effectively ablating different components of the retrieval-augmented framework.",
    197           "source": "opus"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Five metrics are used: Exact Match (EM), BLEU-4, Edit Distance (ED), SimilarityAST, and CodeBLEU, as described in Section 4.3.",
    203           "source": "opus"
    204         },
    205         "human_evaluation": {
    206           "applies": true,
    207           "answer": false,
    208           "justification": "No human evaluation is performed. All evaluation is automated using the five metrics. The case studies in Section 6.2 are qualitative illustrations by the authors, not systematic human evaluation.",
    209           "source": "opus"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Section 4.1 describes separate train/test/validation splits for all three datasets. CONCODE has 100k/2k/2k, CoNaLa has 2179/500/200, HearthStone has 533/66/66. Results are reported on test sets.",
    215           "source": "opus"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Results are broken down by dataset (CONCODE, CoNaLa, HearthStone), by model, by retrieval technique (Table 4), and by fusion strategy (Table 5). This provides detailed per-category analysis.",
    221           "source": "opus"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Section 5.3 discusses RetroMAE's performance degradation (7.74% and 1.32% BLEU drops on CONCODE). Section 5.4.2 discusses VDF's decreased performance compared to SEF. Section 6.2.2 analyzes why UniXcoder retrieves suboptimal results.",
    227           "source": "opus"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": true,
    232           "justification": "RetroMAE decreases CodeGen performance by 7.74% BLEU on CONCODE and causes an 81.33% BLEU drop for CodeGen on HearthStone (Table 4). VDF performs worse than SEF across all datasets (Table 5). CoCoSoDa decreases performance on HearthStone for CodeGen.",
    233           "source": "opus"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "Table 2 specifies model names with parameter counts: CodeGen-MONO (350M), UniXcoder (126M), CodeT5 (223M). LLMs are specified as ChatGLM3-6B, CodeLlama-7B, DeepSeek-Coder-6.7B. Pre-training data sources are listed.",
    241           "source": "opus"
    242         },
    243         "prompts_provided": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "For LLM experiments in Section 6.1, the paper states 'The prompts are constructed following [43]' without providing the actual prompt text. For fine-tuned models, the input format is described conceptually but actual prompts/templates are not reproduced.",
    247           "source": "opus"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": false,
    252           "justification": "Section 4.4 states 'all the hyper-parameter settings of pre-trained models are the same as the original corresponding papers' without listing specific values. No learning rate, batch size, epoch count, or temperature settings are reported. BM25 parameters b and k are mentioned but values are not given.",
    253           "source": "opus"
    254         },
    255         "scaffolding_described": {
    256           "applies": false,
    257           "answer": false,
    258           "justification": "No agentic scaffolding is used. The paper studies a retrieval-augmented framework with standard fine-tuning and prompting, not agentic workflows.",
    259           "source": "opus"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "Section 4.1 describes dataset organization, splits, and how the CoNaLa validation set was created (200 random instances from training). Section 4.4 describes data format ('organized in the form of <Natural Language Description, Code Snippets> and stored in JSON files'). The retrieval database construction is described in Section 3.2.",
    265           "source": "opus"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": true,
    272           "justification": "Retrieval-augmented datasets are released via Google Drive (footnote 3) and code is available at the GitHub repository (footnote 4). The underlying benchmarks (CONCODE, CoNaLa, HearthStone) are publicly available.",
    273           "source": "opus"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "Section 4.1 describes each dataset's origin and construction: CONCODE from ~33k Java GitHub projects, CoNaLa from 2,879 Stack Overflow annotations, HearthStone from 665 card implementations. Table 1 provides detailed statistics.",
    279           "source": "opus"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "No human participants. Data sources are standard public benchmarks.",
    285           "source": "opus"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": true,
    290           "justification": "The three-phase pipeline is clearly documented: Retrieval Phase retrieves top-k code snippets (Section 3.2), Fusion Phase integrates them with input (Section 3.3), Generation Phase produces code (Section 3.4). Section 4.4 describes the learning process. Dataset splits and validation creation are documented.",
    291           "source": "opus"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "Training data sources are mentioned (The Pile, BigQuery, BigPython for CodeGen; CodeSearchNet for UniXcoder and CodeT5) but no training data cutoff dates are provided for any model.",
    299           "source": "opus"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "No discussion of whether benchmark data (CONCODE, CoNaLa, HearthStone) could appear in the pre-training corpora of the evaluated models. The models are pre-trained on large code corpora that could contain these benchmarks.",
    305           "source": "opus"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "CONCODE (2018), CoNaLa (2018), and HearthStone (2016) were all published years before the pre-trained models were trained. Models trained on GitHub/StackOverflow data could easily contain these benchmarks, but this contamination risk is not addressed.",
    311           "source": "opus"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants in this study. It is a benchmark evaluation of pre-trained models.",
    319           "source": "opus"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants in this study.",
    325           "source": "opus"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants in this study.",
    331           "source": "opus"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants in this study.",
    337           "source": "opus"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants in this study.",
    343           "source": "opus"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants in this study.",
    349           "source": "opus"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participants in this study.",
    355           "source": "opus"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": true,
    362           "justification": "Table 5 reports inference costs (in seconds) for each fusion strategy across all three datasets. Table 7 reports retrieval costs per 50 instances for each retrieval technique. Section 6.1.2 discusses cost trade-offs.",
    363           "source": "opus"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": true,
    368           "justification": "Section 4.4 specifies hardware (two Intel Xeon Platinum 8276 CPUs, two NVIDIA A100 80G). Table 5 reports total training time (e.g., 128-923 min on CONCODE). Table 7 reports per-epoch training costs and retrieval costs.",
    369           "source": "opus"
    370         }
    371       },
    372       "experimental_rigor": {
    373         "seed_sensitivity_reported": {
    374           "applies": true,
    375           "answer": false,
    376           "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single experimental runs.",
    377           "source": "opus"
    378         },
    379         "number_of_runs_stated": {
    380           "applies": true,
    381           "answer": false,
    382           "justification": "The number of experimental runs is not stated anywhere. It is unclear whether results are from single runs or averaged over multiple runs.",
    383           "source": "opus"
    384         },
    385         "hyperparameter_search_budget": {
    386           "applies": true,
    387           "answer": false,
    388           "justification": "No hyperparameter search budget is reported. The paper defers to original papers for hyperparameter settings without indicating whether any search was performed for the retrieval-augmented configurations.",
    389           "source": "opus"
    390         },
    391         "best_config_selection_justified": {
    392           "applies": true,
    393           "answer": true,
    394           "justification": "The selection of k=5 retrieved snippets is justified by experiments in Section 5.4.1 showing performance across k=1,3,5,7,10. The choice of ascending ordering is justified by Table 8. Model selections reference prior empirical comparisons.",
    395           "source": "opus"
    396         },
    397         "multiple_comparison_correction": {
    398           "applies": true,
    399           "answer": false,
    400           "justification": "The paper performs a single t-test (p-value 0.035) but runs many comparisons across 3 models × 3 datasets × 5 retrieval techniques × 5 metrics without any correction for multiple comparisons.",
    401           "source": "opus"
    402         },
    403         "self_comparison_bias_addressed": {
    404           "applies": true,
    405           "answer": false,
    406           "justification": "The authors implement and evaluate all retrieval techniques and fusion strategies themselves without acknowledging the bias of evaluating their own implementations. No independent evaluation or acknowledgment of author-evaluation bias.",
    407           "source": "opus"
    408         },
    409         "compute_budget_vs_performance": {
    410           "applies": true,
    411           "answer": true,
    412           "justification": "Table 5 directly reports both performance metrics and training/inference costs for each fusion strategy side by side. Section 6.1.2 explicitly discusses the 'trade-off between performance improvement and computational costs.'",
    413           "source": "opus"
    414         },
    415         "benchmark_construct_validity": {
    416           "applies": true,
    417           "answer": true,
    418           "justification": "Section 6.4 acknowledges that CONCODE's 'pre-processing makes human hard to understand the code intuitively' and that 'it is difficult for human developers to write code the same as the ground truth,' questioning whether these benchmarks reflect real-world development.",
    419           "source": "opus"
    420         },
    421         "scaffold_confound_addressed": {
    422           "applies": false,
    423           "answer": false,
    424           "justification": "No agentic scaffolding is involved. The study evaluates fine-tuned models and prompted LLMs directly without scaffolding.",
    425           "source": "opus"
    426         }
    427       },
    428       "data_leakage": {
    429         "temporal_leakage_addressed": {
    430           "applies": true,
    431           "answer": false,
    432           "justification": "Not addressed. The benchmarks (2016-2018) were created before the pre-trained models' training periods, meaning solutions could have been in training data. This temporal leakage risk is not discussed.",
    433           "source": "opus"
    434         },
    435         "feature_leakage_addressed": {
    436           "applies": true,
    437           "answer": false,
    438           "justification": "Not discussed. The retrieval database is the training set, meaning the model has already been fine-tuned on the same data it retrieves from. The paper does not discuss whether this creates a feature leakage concern.",
    439           "source": "opus"
    440         },
    441         "non_independence_addressed": {
    442           "applies": true,
    443           "answer": false,
    444           "justification": "Only partially addressed for CONCODE, which uses 'repository-based partitioning' to separate domains. No independence analysis is performed for CoNaLa or HearthStone, and no systematic discussion of non-independence is provided.",
    445           "source": "opus"
    446         },
    447         "leakage_detection_method": {
    448           "applies": true,
    449           "answer": false,
    450           "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines are used.",
    451           "source": "opus"
    452         }
    453       }
    454     }
    455   },
    456   "claims": [
    457     {
    458       "claim": "Retrieval-augmented framework universally improves code generation performance for all three tested pre-trained models across all three datasets.",
    459       "evidence": "Table 3 shows consistent improvements for CodeGen, UniXcoder, CodeT5 on CONCODE, CoNaLa, HearthStone. Average improvements: 6.79–41.60% EM, 9.01–18.42% BLEU. t-test p=0.035.",
    460       "supported": "strong"
    461     },
    462     {
    463       "claim": "BM25 is the most effective retrieval technique for code generation, outperforming more complex deep learning-based code search models.",
    464       "evidence": "Table 4 shows BM25 achieves highest BLEU/CodeBLEU gains on CONCODE and HearthStone for all three models; Table 7 shows BM25 also has lower or comparable total cost compared to trained models.",
    465       "supported": "strong"
    466     },
    467     {
    468       "claim": "Sequential Integration Fusion (SIF) is the recommended fusion strategy due to its balance of performance and computational cost.",
    469       "evidence": "Table 5 shows SIF achieves competitive performance (e.g., CodeBLEU 46.92 vs. SFF's 46.40 on CONCODE) at 285 min training vs. SFF's 917 min, and VDF's 393 min.",
    470       "supported": "strong"
    471     },
    472     {
    473       "claim": "Sketch Filling Fusion (SFF) yields the highest performance but at 2–7× training cost compared to fine-tuning on the original dataset.",
    474       "evidence": "Table 5 shows SFF achieves best EM and BLEU on HearthStone (34.85%, 81.89%) but requires 107 min vs. 50 min baseline training; on CONCODE SFF needs 917 min vs. 128 min.",
    475       "supported": "strong"
    476     },
    477     {
    478       "claim": "RAF also benefits large language models (ChatGLM, CodeLlama, DeepSeek-Coder) for code generation, sometimes dramatically.",
    479       "evidence": "Table 6 shows BLEU improvements of up to 198.67× (ChatGLM on HearthStone with BM25); all three LLMs improve across all datasets with all five retrieval techniques.",
    480       "supported": "moderate"
    481     },
    482     {
    483       "claim": "Increasing the number of retrieved code snippets beyond a dataset-specific threshold degrades rather than improves performance.",
    484       "evidence": "Figure 2 shows BLEU inflection points at 3–5 retrieved results on CoNaLa and CONCODE; HearthStone plateaus at 5 due to truncation.",
    485       "supported": "moderate"
    486     },
    487     {
    488       "claim": "RetroMAE as a retrieval technique can degrade code generation performance due to domain mismatch between NLP text retrieval and code tasks.",
    489       "evidence": "Table 4 shows CodeGen BLEU drops 7.74% and UniXcoder drops 1.23% with RetroMAE on CONCODE; HearthStone BLEU drops 81.33% for CodeGen.",
    490       "supported": "strong"
    491     }
    492   ],
    493   "methodology_tags": [
    494     "benchmark-eval",
    495     "observational"
    496   ],
    497   "key_findings": "The retrieval-augmented framework (RAF) consistently improves code generation for all tested pre-trained models (CodeGen, UniXcoder, CodeT5) and LLMs (ChatGLM3-6B, CodeLlama-7B, DeepSeek-Coder-6.7B) across three benchmark datasets, with HearthStone showing the largest gains due to its regular code structure. BM25 is the most cost-effective retrieval technique, outperforming more complex neural code search models. Among fusion strategies, Sequential Integration Fusion (SIF) offers the best trade-off between performance and computational cost, while Sketch Filling Fusion (SFF) can improve performance further but at 2–7× training cost. The optimal number of retrieved code snippets varies by dataset characteristics; adding snippets beyond a dataset-specific threshold yields no further gain (HearthStone plateaus) or degrades performance (CoNaLa, CONCODE).",
    498   "red_flags": [
    499     {
    500       "flag": "No variance across runs",
    501       "detail": "All results are single-run point estimates with no standard deviation or confidence intervals; for deep learning models with random initialization this is a significant reliability concern."
    502     },
    503     {
    504       "flag": "Benchmark contamination unaddressed",
    505       "detail": "CONCODE (2018), CoNaLa (2018), and HearthStone (2016) benchmarks predate the LLMs' training cutoffs; the paper never discusses whether LLMs had seen these examples during pre-training."
    506     },
    507     {
    508       "flag": "Prompts not disclosed",
    509       "detail": "LLM prompts for in-context learning experiments follow the AceCoder prompt construction approach [43] but are not shown in the paper; prompt sensitivity could substantially affect LLM results."
    510     },
    511     {
    512       "flag": "Hyperparameters deferred to prior work",
    513       "detail": "Section 4.4 states all hyperparameters follow original papers but lists none, making exact replication dependent on finding and following multiple external sources."
    514     },
    515     {
    516       "flag": "Significance testing limited to main comparison",
    517       "detail": "Statistical significance testing (t-test) is only reported for the overall RAF vs. baseline comparison in RQ1; the many individual comparisons in Tables 4–6 lack significance tests."
    518     },
    519     {
    520       "flag": "Proxy metrics conflated with utility",
    521       "detail": "BLEU and CodeBLEU improvements are consistently framed as 'performance improvements' without acknowledging these are imperfect proxies for code correctness or developer utility."
    522     }
    523   ],
    524   "cited_papers": [
    525     {
    526       "title": "Retrieval Augmented Code Generation and Summarization (REDCODER)",
    527       "relevance": "Key prior work on retrieval-augmented code generation; directly compared against in this study"
    528     },
    529     {
    530       "title": "SKCODER: A Sketch-Based Approach for Automatic Code Generation",
    531       "relevance": "Sketch Filling Fusion strategy in this paper is directly adapted from SKCODER; central baseline method"
    532     },
    533     {
    534       "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation",
    535       "relevance": "One of the three primary pre-trained models evaluated; used as the base for fusion strategy ablations"
    536     },
    537     {
    538       "title": "Evaluating Large Language Models Trained on Code (Codex/HumanEval)",
    539       "relevance": "Foundational work establishing pass@k metric for code generation; motivates the broader field"
    540     },
    541     {
    542       "title": "CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation",
    543       "relevance": "Provides the CONCODE benchmark used in this study; frames the code generation evaluation landscape"
    544     },
    545     {
    546       "title": "CoCoSoDa: Effective Contrastive Learning for Code Search",
    547       "relevance": "State-of-the-art code search model used as retrieval technique; one of five retrieval methods compared"
    548     },
    549     {
    550       "title": "Retrieval-Augmented Generation for Large Language Models: A Survey",
    551       "relevance": "Situates this work within the broader RAG for LLMs context; directly related literature"
    552     },
    553     {
    554       "title": "DocPrompting: Generating Code by Retrieving the Docs",
    555       "relevance": "Alternative retrieval-augmented code generation approach using documentation retrieval"
    556     },
    557     {
    558       "title": "AceCoder: An Effective Prompting Technique Specialized in Code Generation",
    559       "relevance": "Provides the prompt construction approach used for LLM experiments in this paper"
    560     }
    561   ],
    562   "engagement_factors": {
    563     "practical_relevance": {
    564       "score": 2,
    565       "justification": "Practitioners building RAG systems for code generation can directly apply the recommendation to use BM25 + Sequential Integration Fusion."
    566     },
    567     "surprise_contrarian": {
    568       "score": 1,
    569       "justification": "The finding that simple BM25 outperforms neural retrieval models is mildly surprising but aligns with known patterns in information retrieval."
    570     },
    571     "fear_safety": {
    572       "score": 0,
    573       "justification": "No safety, security, or risk concerns are raised."
    574     },
    575     "drama_conflict": {
    576       "score": 0,
    577       "justification": "Straightforward empirical study with no controversy or conflict."
    578     },
    579     "demo_ability": {
    580       "score": 1,
    581       "justification": "Code repository is available on GitHub but requires substantial setup with GPU hardware and multiple models to replicate."
    582     },
    583     "brand_recognition": {
    584       "score": 0,
    585       "justification": "Authors from academic institutions and Huawei; not associated with high-profile AI labs."
    586     }
    587   },
    588   "hn_data": {
    589     "threads": [],
    590     "top_points": 0,
    591     "total_points": 0,
    592     "total_comments": 0
    593   }
    594 }
	ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs