scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24492B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "An Empirical Study of Retrieval-Augmented Code Generation: Challenges and Opportunities",
      6     "authors": [
      7       "Zezhou Yang",
      8       "Sirong Chen",
      9       "Cuiyun Gao",
     10       "Zhenhao Li",
     11       "Xing Hu",
     12       "Kui Liu",
     13       "Xin Xia"
     14     ],
     15     "year": 2025,
     16     "venue": "ACM Transactions on Software Engineering and Methodology",
     17     "arxiv_id": "2501.13742",
     18     "doi": "10.1145/3717061"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Abstract claims about RAF improving pre-trained models, BM25 and SIF being recommended, SFF further helping, and LLM effectiveness are all backed by Tables 3–6 with specific numeric results.",
     26         "source": "haiku"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Causal claims (RAF improves performance) are supported by controlled ablation experiments holding models constant while varying retrieval and fusion components; t-test confirms significance at p=0.035.",
     32         "source": "haiku"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Finding 1 uses the word 'universal' for a finding based on only 3 models and 3 datasets; the threats section acknowledges uncertainty about larger or differently-architected models but the main findings overstate scope.",
     38         "source": "haiku"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper does not discuss alternative explanations for why BM25 outperforms trained retrievers (e.g., training set memorization, dataset-specific keyword overlap) or why SFF underperforms on CoNaLa beyond a brief 'lack of structure' observation.",
     44         "source": "haiku"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper uses BLEU, CodeBLEU, EM, Edit Distance, and SimilarityAST as metrics and treats them as code generation quality proxies without claiming they equate to real-world developer productivity.",
     50         "source": "haiku"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Section 6.4 'Threats to Validity' is a dedicated section covering generalization, replication, and dataset limitations.",
     58         "source": "haiku"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Threats include specific concerns: uncertainty about larger models with different architectures, deep learning randomness affecting replication, and CONCODE preprocessing making ground truth hard for humans to match intuitively.",
     64         "source": "haiku"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Section 6.4 explicitly states 'there remains uncertainty regarding whether these findings remain applicable to larger models or models with differing architectures,' bounding claims to the 3 tested models.",
     70         "source": "haiku"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No funding acknowledgment or disclosure section is present in the paper.",
     78         "source": "haiku"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Author affiliations are clearly stated: Harbin Institute of Technology, Concordia University, Zhejiang University, and Huawei Technologies Co., Ltd.",
     84         "source": "haiku"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "Kui Liu is affiliated with Huawei Technologies, which has commercial interests in code generation tools; funding is undisclosed so independence cannot be confirmed.",
     90         "source": "haiku"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests or financial interests statement appears in the paper.",
     96         "source": "haiku"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Retrieval-augmented framework is defined with three phases (Retrieval, Fusion, Generation) in Section 3; all fusion strategies (SIF, SEF, VDF, SFF) and retrieval techniques (BM25, RetroMAE, CodeBERT, etc.) are explicitly defined.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 1 lists three explicit contributions: first empirical study on RAF for code generation, exploration of retrieval techniques and fusion strategies, and actionable implications.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 2 situates the work against REDCODER, SKCODER, DocPrompting, and retrieval-augmented NLP methods, and Section 3 distinguishes this systematic study from prior single-configuration approaches.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": true,
    126           "justification": "A GitHub repository (https://github.com/watreyoung/RACG) is explicitly cited in footnote 4 of the paper.",
    127           "source": "haiku"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "Standard public benchmarks (CONCODE, CoNaLa, HearthStone) are used, and augmented retrieval datasets are shared via Google Drive (footnote 3).",
    133           "source": "haiku"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "Hardware is described (Intel Xeon + NVIDIA A100) and PyTorch/Huggingface are mentioned, but no requirements.txt, Dockerfile, or pinned dependency versions are provided.",
    139           "source": "haiku"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "No step-by-step reproduction instructions appear in the paper; readers are pointed to the code repository, and the paper itself does not document a reproduction procedure.",
    145           "source": "haiku"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "No confidence intervals or error bars appear in any table; only point estimates are reported.",
    153           "source": "haiku"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": true,
    158           "justification": "A t-test is reported for RQ1 (p=0.035 at significance level 0.05), though no significance tests are reported for RQ2 or RQ3 comparisons.",
    159           "source": "haiku"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Tables 4 and 6 report percentage improvements (e.g., '14.48% ↑' in BLEU) alongside absolute values, providing effect size context.",
    165           "source": "haiku"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "Dataset sizes are reported as standard benchmark sizes; no power analysis or justification for why 3 models and 3 datasets are sufficient is provided.",
    171           "source": "haiku"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "All results are single-run point estimates with no standard deviation, confidence intervals, or cross-run variance reported.",
    177           "source": "haiku"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "All three models are evaluated without RAF as baselines (Table 3 'base model' rows), enabling direct comparison.",
    185           "source": "haiku"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Baselines include CoCoSoDa (state-of-the-art code search as of 2022–2023) and contemporary LLMs ChatGLM3-6B, CodeLlama-7B, and DeepSeek-Coder-6.7B.",
    191           "source": "haiku"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "RQ2 ablates 5 retrieval techniques and RQ3 ablates 4 fusion strategies and the number of retrieved snippets, systematically isolating each component.",
    197           "source": "haiku"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Five metrics are used: Exact Match (EM), BLEU, Edit Distance, SimilarityAST, and CodeBLEU, covering lexical, syntactic, and semantic dimensions.",
    203           "source": "haiku"
    204         },
    205         "human_evaluation": {
    206           "applies": true,
    207           "answer": false,
    208           "justification": "All evaluation is automated; no human judges assess the quality or correctness of generated code beyond automated metrics.",
    209           "source": "haiku"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "All three datasets have held-out test splits used for evaluation; CONCODE uses repository-based partitioning to prevent domain overlap.",
    215           "source": "haiku"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Results are broken down per dataset, per model, and per retrieval technique/fusion strategy, enabling fine-grained comparison across configurations.",
    221           "source": "haiku"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Section 6.2 provides case studies on failure modes (RetroMAE retrieving semantically mismatched NL, VDF underperforming) with concrete examples.",
    227           "source": "haiku"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": true,
    232           "justification": "RetroMAE degrades BLEU by 7.74% on CONCODE for CodeGen and by 81.33% on HearthStone; VDF underperforms SEF across all datasets — both reported prominently.",
    233           "source": "haiku"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "Model sizes (CodeGen 350M, UniXcoder 126M, CodeT5 223M) and variants (CodeGen-MONO) are specified; LLMs include size designations (ChatGLM3-6B, CodeLlama-7B, DeepSeek-Coder-6.7B).",
    241           "source": "haiku"
    242         },
    243         "prompts_provided": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "LLM prompts are described as following reference [43] (AceCoder), with details deferred to the code repository; no actual prompt templates appear in the paper.",
    247           "source": "haiku"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": false,
    252           "justification": "The paper states 'all the hyper-parameter settings...are the same as the original corresponding papers' without specifying learning rates, batch sizes, or number of epochs.",
    253           "source": "haiku"
    254         },
    255         "scaffolding_described": {
    256           "applies": false,
    257           "answer": false,
    258           "justification": "This is not an agentic scaffolding paper; the three-phase RAF pipeline is described architecturally but there is no agentic scaffolding involved.",
    259           "source": "haiku"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "Dataset splits are described (CoNaLa validation set constructed by randomly sampling 200 examples from the training set), data format (<NL, Code> pairs in JSON) is specified, and retrieval database construction is described.",
    265           "source": "haiku"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": true,
    272           "justification": "Standard benchmark datasets are publicly available; the paper also shares augmented retrieval datasets via Google Drive (footnote 3).",
    273           "source": "haiku"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "Dataset provenance is described: CONCODE from 33K GitHub Java projects, CoNaLa from Stack Overflow manual annotations, HearthStone from card game implementations.",
    279           "source": "haiku"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "No human participants; standard benchmarks were used without recruitment.",
    285           "source": "haiku"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": true,
    290           "justification": "The full pipeline from retrieval database construction through fusion to fine-tuning is described in Section 3 with formulas and Section 4 with implementation details.",
    291           "source": "haiku"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "Training data cutoffs for LLMs (ChatGLM3, CodeLlama, DeepSeek-Coder) are not stated, despite these models being used in in-context learning experiments on pre-2019 benchmarks.",
    299           "source": "haiku"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "No discussion of whether CONCODE (2018), CoNaLa (2018), or HearthStone (2016) examples may appear in the pretraining data of the LLMs evaluated in Section 6.1.",
    305           "source": "haiku"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "All three benchmarks predate the training cutoffs of the LLMs used; potential contamination of these widely-used benchmarks is not addressed.",
    311           "source": "haiku"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants in this study.",
    319           "source": "haiku"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants in this study.",
    325           "source": "haiku"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants in this study.",
    331           "source": "haiku"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants in this study.",
    337           "source": "haiku"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants in this study.",
    343           "source": "haiku"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants in this study.",
    349           "source": "haiku"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participants in this study.",
    355           "source": "haiku"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": true,
    362           "justification": "Table 5 reports inference times per fusion strategy (e.g., 547s for baseline CONCODE, 1662s for VDF) and Table 7 reports per-instance retrieval costs.",
    363           "source": "haiku"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": true,
    368           "justification": "Training times are reported per configuration in Tables 5 and 7 (e.g., 128–923 min for CONCODE); hardware (two A100 80G GPUs) is specified, enabling compute budget estimation.",
    369           "source": "haiku"
    370         }
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "The retrieval-augmented framework universally improves code generation performance across various pre-trained models and datasets.",
    377       "evidence": "Table 3 shows consistent improvements for CodeGen, UniXcoder, and CodeT5 on CONCODE, CoNaLa, and HearthStone; t-test confirms significance at p=0.035.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "BM25 is the most effective retrieval technique for code generation, requiring no training.",
    382       "evidence": "Table 4 shows BM25 achieves highest gains on CONCODE and HearthStone across all models; optimal for CodeT5 on CoNaLa (25.69% BLEU improvement); no training required vs. deep learning alternatives.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Sketch Filling Fusion achieves 14.83% average BLEU improvement across datasets, the highest of any fusion strategy.",
    387       "evidence": "Table 5 shows SFF outperforms on HearthStone (81.89% BLEU) but underperforms SIF on CoNaLa; average computed by authors only for CodeT5.",
    388       "supported": "weak"
    389     },
    390     {
    391       "claim": "Sequential Integration Fusion is the most recommended fusion strategy when balancing cost and performance.",
    392       "evidence": "Table 5 shows SIF training time (285 min) is substantially lower than SEF (923 min) and SFF (917 min) with competitive performance; SIF also achieves best EM on CONCODE and CoNaLa.",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "RAF effectively improves LLMs (ChatGLM, CodeLlama, DeepSeek-Coder) during inference via prompt engineering.",
    397       "evidence": "Table 6 shows improvements across all 3 LLMs on all 3 datasets; ChatGLM BLEU ratio on HearthStone reaches 198.67× baseline with BM25.",
    398       "supported": "strong"
    399     },
    400     {
    401       "claim": "More complex retrieval techniques do not necessarily outperform BM25; RetroMAE can degrade performance.",
    402       "evidence": "Table 4 shows RetroMAE reduces CodeGen BLEU by 7.74% on CONCODE and by 81.33% on HearthStone; deep learning models add training cost without consistent gains.",
    403       "supported": "strong"
    404     }
    405   ],
    406   "methodology_tags": [
    407     "benchmark-eval",
    408     "observational"
    409   ],
    410   "key_findings": "The retrieval-augmented framework consistently improves code generation performance for three pre-trained models (CodeGen, UniXcoder, CodeT5) across three standard benchmarks with statistical significance (p=0.035), with particularly large gains on the structured HearthStone dataset (41.60% EM improvement average). BM25, despite requiring no training, outperforms learned retrieval models including state-of-the-art code search models on most configurations, suggesting that simple lexical matching often suffices. Among fusion strategies, Sequential Integration Fusion offers the best cost-performance trade-off while Sketch Filling Fusion achieves marginally higher performance only on structured datasets at 2–7× training cost. The framework also benefits large language models (ChatGLM3, CodeLlama, DeepSeek-Coder) in inference-time in-context settings.",
    411   "red_flags": [
    412     {
    413       "flag": "Generalization overclaim",
    414       "detail": "Finding 1 declares the framework 'universal' based on only 3 models and 3 datasets, despite the threats section acknowledging uncertainty about larger or differently-architected models."
    415     },
    416     {
    417       "flag": "No variance or multiple runs",
    418       "detail": "All quantitative results are single-run point estimates with no standard deviation, error bars, or multiple seeds reported, making it impossible to assess result stability."
    419     },
    420     {
    421       "flag": "LLM contamination unaddressed",
    422       "detail": "ChatGLM3, CodeLlama, and DeepSeek-Coder are evaluated on benchmarks from 2016–2018 (HearthStone, CoNaLa, CONCODE) with no discussion of whether these datasets appear in LLM pretraining data."
    423     },
    424     {
    425       "flag": "Hyperparameters deferred",
    426       "detail": "Training hyperparameters are described as 'same as original corresponding papers' without specifying learning rates, batch sizes, or epochs, reducing reproducibility without consulting multiple external sources."
    427     },
    428     {
    429       "flag": "SFF average claim questionable",
    430       "detail": "The claim of '14.83% average BLEU improvement' for SFF is computed only for CodeT5 and masks that SFF underperforms SIF on CoNaLa while being 2–7× more expensive to train."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "Retrieval Augmented Code Generation and Summarization (REDCODER)",
    436       "relevance": "Key prior work on retrieval-augmented code generation that this paper extends to a systematic empirical study."
    437     },
    438     {
    439       "title": "Skcoder: A sketch-based approach for automatic code generation",
    440       "relevance": "Source of the Sketch Filling Fusion strategy and sketch extraction mechanism used in experiments."
    441     },
    442     {
    443       "title": "DocPrompting: Generating Code by Retrieving the Docs",
    444       "relevance": "Representative retrieval-augmented code generation approach using documentation retrieval."
    445     },
    446     {
    447       "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation",
    448       "relevance": "Primary base model used in ablation experiments for fusion strategy and retrieval technique comparisons."
    449     },
    450     {
    451       "title": "CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis",
    452       "relevance": "Decoder-only base model evaluated in all three RQs."
    453     },
    454     {
    455       "title": "UniXcoder: Unified Cross-Modal Pre-training for Code Representation",
    456       "relevance": "Encoder-decoder base model used both as a generation model and as a retrieval technique."
    457     },
    458     {
    459       "title": "CoCoSoDa: Effective Contrastive Learning for Code Search",
    460       "relevance": "State-of-the-art code search model compared as a retrieval technique and shown competitive with BM25 for LLMs."
    461     },
    462     {
    463       "title": "Retrieval-Augmented Generation for Large Language Models: A Survey",
    464       "relevance": "Background survey on RAG for LLMs providing context for extending RAF to code generation with LLMs."
    465     }
    466   ],
    467   "engagement_factors": {
    468     "practical_relevance": {
    469       "score": 2,
    470       "justification": "Gives concrete actionable recommendations (use BM25 + SIF) with cost-performance trade-off data that practitioners can apply directly."
    471     },
    472     "surprise_contrarian": {
    473       "score": 1,
    474       "justification": "Mildly surprising that simple BM25 consistently outperforms trained neural retrieval models despite their greater complexity."
    475     },
    476     "fear_safety": {
    477       "score": 0,
    478       "justification": "No AI risk or safety concerns raised; purely a benchmark engineering paper."
    479     },
    480     "drama_conflict": {
    481       "score": 0,
    482       "justification": "Incremental benchmark study with no controversy or conflict with prior work."
    483     },
    484     "demo_ability": {
    485       "score": 2,
    486       "justification": "Code and augmented datasets are publicly released on GitHub and Google Drive, enabling practitioners to replicate the framework."
    487     },
    488     "brand_recognition": {
    489       "score": 0,
    490       "justification": "Authors from Harbin Institute of Technology, Concordia University, Zhejiang University, and Huawei; no headline-grabbing lab affiliation."
    491     }
    492   },
    493   "hn_data": {
    494     "threads": [],
    495     "top_points": 0,
    496     "total_points": 0,
    497     "total_comments": 0
    498   }
    499 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs