scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (35352B)
      1 {
      2   "paper": {
      3     "title": "A Deep Dive into Retrieval-Augmented Generation for Code Completion: Experience on WeChat",
      4     "authors": [
      5       "Zezhou Yang",
      6       "Ting Peng",
      7       "Cuiyun Gao",
      8       "Chaozheng Wang",
      9       "Hailiang Huang",
     10       "Yuetang Deng"
     11     ],
     12     "year": 2025,
     13     "venue": "IEEE International Conference on Software Maintenance and Evolution (ICSME)",
     14     "arxiv_id": "2507.18515",
     15     "doi": "10.1109/ICSME64153.2025.00062"
     16   },
     17   "scan_version": 3,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval", "qualitative"],
     20   "key_findings": "Both identifier-based and similarity-based RAG consistently improve code completion on WeChat's proprietary C++ codebase across 26 open-source LLMs (0.5B–671B parameters), with similarity-based RAG substantially outperforming identifier-based RAG (e.g., DeepSeek-V3 achieves 42.7% higher CodeBLEU). Among retrieval techniques, BM25 (lexical) and GTE-Qwen (semantic) achieve the best individual performance, and their combination yields optimal results for 7B+ models due to minimal overlap in retrieved candidates (64–76% completely distinct). A developer survey with 3 participants confirms the combined approach produces higher-quality completions.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "No source code repository URL, GitHub link, or archive is provided anywhere in the paper. The preprocessing algorithm and retrieval pipeline are described but not released."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The benchmark (100 examples) and retrieval corpus (1,669 projects) are from WeChat's proprietary codebase and are not released. No download links or public data are provided."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Hardware is described (8× A100 40GB, 8× H20 96GB, 16× H20 96GB) and vLLM is mentioned for deployment, but no requirements.txt, Dockerfile, library versions, or detailed environment setup is provided. FP16/FP8 precision is mentioned but this alone is insufficient to recreate the environment."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The algorithmic description (Algorithm 1) describes preprocessing logic but does not constitute reproduction instructions for the experiments."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "All results in Tables I, II, and III are reported as point estimates (e.g., '51.12/61.96') with no confidence intervals, error bars, or uncertainty measures."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper claims 'similarity-based RAG substantially performs better' and 'BM25 and GTE-Qwen-based retrieval techniques demonstrate superior performance' based solely on comparing numbers without any statistical significance tests (no p-values, t-tests, or other tests)."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "The paper reports relative improvements with baseline context throughout, e.g., 'CB/ES metrics improvement from 29.79/48.56 to 51.12/61.96, representing a 71.60% and 27.59% relative increase' (Section IV-A), and 'DeepSeek-V3 shows an enhancement from 35.23/54.85 to 60.28/73.11 using GTE-Qwen retrieval technique, corresponding to a 71.1% and 33.3% increase.'"
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The benchmark has 100 examples and the developer survey has only 3 participants evaluating 52 examples. No justification is given for why these sample sizes are adequate, and no power analysis is conducted."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "Temperature is set to 0 for deterministic output, and results appear to come from single runs (the number of runs is never explicitly stated). No variance, standard deviation, or spread measures are reported across any experimental conditions."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Base models without RAG serve as baselines ('base' column in Table I). Each RAG method is compared against the base model performance, and different RAG methods are compared against each other."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "The evaluation includes 26 contemporary LLMs including DeepSeek-V3 (2024), Qwen2.5-Coder (2024), Llama-3.3 (2024), and other recent model series, representing current state of the art."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "RQ1 compares identifier-based vs. similarity-based RAG with different retrieval types (message definition, class definition, function declaration, function definition). RQ2 compares 5 retrieval techniques with incomplete vs. complete queries. RQ3 explores combinations of techniques. These constitute ablation over retrieval components."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Two automated metrics are used: CodeBLEU (CB) measuring structural and semantic code similarity, and Edit Similarity (ES) measuring token-level edit distance. The developer survey adds a 1-5 human rating scale."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Section V-A describes a developer survey with 3 developers evaluating 52 code completions on a 1-5 scale, comparing BM25, GTE-Qwen, and their combination across 3 LLMs. Error patterns are also categorized."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "The 100-example benchmark is used for all evaluation. There is no mention of a separate dev/validation split for tuning hyperparameters (e.g., number of retrieved results, BM25 parameters). The same 100 examples are used for all results."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": false,
    105         "justification": "The benchmark has 7 domains (client call, connection, kv, colib, mq, utils, encoding) and easy/hard difficulty levels (Figure 1), but results in Tables I–III are reported only as overall averages across all 100 examples. No per-domain or per-difficulty breakdowns are provided."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The developer survey (Section V-A, Figure 2c) identifies three error categories: Missing or Incorrect Logic (52%), Extra Logic (30%), and Nonexistent Function Call (17%), with per-model error distributions analyzed."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Several negative results are reported: CodeLlama-70B-Instruct shows degradation with RAG in most conditions; combination of retrieval techniques shows 'limited or even negative impact' for models below 7B (Section IV-C); CodeBERT 'consistently underperforms' other semantic retrieval techniques."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract's three key claims are supported: (1) both RAG methods effective with similarity-based superior — Table I; (2) BM25 and GTE-Qwen achieve superior performance — Table II; (3) combination of lexical and semantic yields optimal results — Table III."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Causal claims (e.g., 'RAG methods consistently outperform base models') are justified by controlled comparisons where the same model is evaluated with and without RAG under identical conditions. The single-variable manipulation (adding RAG context) is adequate for the claims made."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Finding 1 states 'Both types of RAG methods can consistently improve code completion performance across different models and scales in closed-source repositories,' generalizing from one C++ codebase (WeChat) to 'closed-source repositories' broadly. The paper tests only C++ code in one organization's codebase but does not bound claims to this setting in the findings."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Section V-C discusses specific alternative explanations: internal validity (parameter settings and hardware could influence results), external validity (WeChat's codebase may differ from other organizations'), and construct validity (automated metrics may not capture semantic correctness). Section V-B discusses the gap between training and application scenarios for semantic retrieval."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper measures CodeBLEU and Edit Similarity and frames these as 'code completion performance' — a reasonable match. Section V-C explicitly acknowledges the proxy gap: 'automated metrics (CodeBLEU and Edit Similarity) to measure code quality... these metrics might not fully capture the semantic correctness and functionality of generated code,' and supplements with the developer survey."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Specific model names with sizes are provided for all 26 LLMs (e.g., 'Qwen2.5-Coder-14B-Instruct', 'Llama-3.3-70B-Instruct', 'DeepSeek-V3-671B/37B'). Retrieval models are also specified (e.g., 'GTE-Qwen2-1.5B-instruct'). All models are obtained from 'official Hugging Face repositories' (Section III-D3)."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "Prompt templates are described conceptually (e.g., 'we develop four distinct prompt templates to help LLMs understand different types of background knowledge,' Section II-C3) and Section III-D3 states 'we design our prompts in Chinese wrapped in C++ comment format,' but the actual prompt text is never provided."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Key hyperparameters are reported: temperature=0 for all models, number of retrieved results=4, prompt length <2k tokens, FP16/FP8 precision, CodeBLEU weights α=β=γ=δ=0.25 (Section III-C). BM25 formula parameters k and b are described but exact values are not given."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "No agentic scaffolding is used. The RAG pipeline is a fixed multi-step process (retrieve → prompt → generate) without retry logic, feedback mechanisms, or autonomous tool use."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Algorithm 1 provides detailed pseudocode for the data preprocessing pipeline, addressing file segmentation, recursive dependencies, auto-generated code, and macro handling. Section III-D1 describes the tree-sitter implementation and regex-based protobuf extraction."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section V-C 'Threats to Validity' provides substantive discussion of internal, external, and construct validity threats across three dedicated paragraphs."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Specific threats are discussed: internal validity addresses that 'performance of these deep learning-based models can be influenced by multiple factors, including parameter settings and hardware devices'; external validity notes 'experiments are conducted on the specific enterprise codebase in WeChat group'; construct validity specifically identifies that 'automated metrics might not fully capture the semantic correctness and functionality of generated code in real development scenarios.'"
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The paper does not explicitly state what the results do NOT show. While external validity mentions the WeChat-specific limitation, there is no explicit listing of untested settings (e.g., no mention that only C++ was tested, that only function-level completion was evaluated, that proprietary code patterns may not apply to other industries, etc.)."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "No raw data is available. The benchmark (100 examples), retrieval corpus (1,669 projects), and developer survey responses are all proprietary and not released."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Benchmark construction is described in detail (Section II-A): three senior developers with 5+ years experience, four annotation rules (function significance, context selection, difficulty classification, quality assurance), cross-validation, three weeks of effort, 100 examples across 7 domains. Retrieval corpus: 1,669 internal projects spanning multiple business units."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": true,
    202         "answer": false,
    203         "justification": "Benchmark annotators are described as 'three senior developers from our group, each with over five years of industrial experience.' Developer survey participants are 'three developers from our group (excluding the authors).' No description of how these specific individuals were selected or whether selection could introduce bias."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The full pipeline is documented: Algorithm 1 details preprocessing from raw source files to retrieval corpus. Section II-A describes benchmark construction with four rules and cross-validation. Section II-B describes retrieval corpus construction with filtering of duplicates and format standardization."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "Funding is disclosed in a footnote: 'This research is supported by National Key R&D Program of China (No. 2022YFB3103900), National Natural Science Foundation of China (No. 62472126), Natural Science Foundation of Guangdong Province, Shenzhen-Hong Kong Jointly Funded Project, and Shenzhen Basic Research.'"
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly listed: three authors from Tencent (Guangzhou, China) and two from The Chinese University of Hong Kong. Tencent is the parent company of WeChat, the codebase being evaluated."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Funders are Chinese government research programs (NSFC, National Key R&D Program, provincial and municipal science foundations) that do not have a financial stake in whether RAG improves code completion at Tencent."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement is provided. Three authors are Tencent employees evaluating methods on Tencent's codebase, but this potential commercial interest is not formally declared."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No training data cutoff dates are stated for any of the 26 LLMs evaluated. While the benchmark is proprietary (reducing contamination risk), the paper does not state when any model's training data ends."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No discussion of train/test overlap. The proprietary nature of the WeChat codebase inherently mitigates contamination risk, but this is never explicitly discussed as a contamination consideration."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No explicit discussion of benchmark contamination. While the custom proprietary benchmark is unlikely to be in training data, the paper does not leverage this as a methodological advantage or discuss contamination risk at all."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": true,
    253         "answer": false,
    254         "justification": "No pre-registration is mentioned for either the benchmark annotation or the developer survey."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": true,
    258         "answer": false,
    259         "justification": "No IRB or ethics board approval is mentioned for the developer survey or the benchmark annotation process involving human participants."
    260       },
    261       "demographics_reported": {
    262         "applies": true,
    263         "answer": false,
    264         "justification": "Benchmark annotators are described as having 'over five years of industrial experience,' but no other demographics are reported. Developer survey participants are described only as 'three developers from our group' with no experience level, role, or other demographics."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": true,
    268         "answer": false,
    269         "justification": "No inclusion or exclusion criteria are stated for selecting the benchmark annotators or developer survey participants beyond being 'from our group.'"
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "The developer survey is a within-subjects evaluation where all participants rate all conditions, not an experimental study with treatment/control assignment requiring randomization."
    275       },
    276       "blinding_described": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No mention of whether developer survey participants were blinded to which retrieval technique produced which code completion. The evaluation design description does not address blinding."
    280       },
    281       "attrition_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No attrition information is reported. The paper does not state whether all 3 developer survey participants completed all evaluations."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "No inference cost, latency, or wall-clock time is reported for any of the 26 models or retrieval techniques, despite evaluating models ranging from 0.5B to 671B parameters on industrial-scale codebases."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Hardware configurations are described (8× A100 40GB, 8× H20 96GB, 16× H20 96GB) but total GPU hours, wall-clock time, or total computational budget for the experiments is not stated."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "Temperature is set to 0 for deterministic output, producing single-run results. No seed sensitivity analysis is conducted, and the paper does not discuss whether non-determinism from other sources (GPU floating-point, batching) could affect results."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The number of experimental runs is never explicitly stated. Temperature=0 implies single deterministic runs, but this is not explicitly confirmed as single-run evaluation."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "No hyperparameter search budget is reported. The number of retrieved results (4) is chosen to fit prompt length constraints; the BM25 formula parameters k and b are referenced, but neither their values nor how they were selected is stated."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "The selection of k=4 retrieved results is justified only by prompt length constraints ('less than 2k tokens'), not by systematic evaluation. No validation set is used to select hyperparameters."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "The paper makes hundreds of comparisons across 26 models × 10 RAG conditions (Tables I–III) without any statistical tests, let alone multiple comparison corrections."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors implement the RAG pipeline and compare it against base model performance without acknowledging potential author-implementation bias. The retrieval techniques are standard, but the preprocessing algorithm and prompt templates are the authors' own design."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "While results are grouped by model scale (0.5B+ to 671B+), the actual compute cost of each RAG method (retrieval + inference) is not compared. The added cost of retrieval over base model inference is not quantified."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": true,
    338         "justification": "Section V-C explicitly discusses construct validity: 'We use automated metrics (CodeBLEU and Edit Similarity) to measure code quality... these metrics might not fully capture the semantic correctness and functionality of generated code in real development scenarios.' The developer survey is conducted to partially address this gap."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "All models are evaluated under identical RAG configurations within each comparison (same retrieval technique, same number of results, same prompt template per method). Cross-model comparisons in each column of Tables I–III use matched conditions."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "No discussion of temporal leakage. The paper does not discuss whether any model's training data could include code patterns similar to the proprietary benchmark, or the temporal relationship between model training and benchmark creation."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of feature leakage. The paper does not consider whether the complete code snippets used as queries in Table II's 'Complete' condition leak answer information that would not be available in real code completion scenarios."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of independence between the 100 benchmark examples. Examples from the same domain (e.g., 29 client_call examples) may share patterns, but potential non-independence is not addressed."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No leakage detection or prevention method is applied. While the proprietary nature of the code inherently reduces contamination risk, no formal detection (canary strings, membership inference, n-gram overlap) is employed."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "Both identifier-based and similarity-based RAG methods consistently improve code completion performance across different models and scales in closed-source repositories, with similarity-based RAG performing substantially better.",
    372       "evidence": "Table I shows improvements across 26 LLMs. For example, DeepSeek-V3 achieves 42.24/61.75 CB/ES with identifier-based RAG (func-def) and 60.28/73.11 with similarity-based RAG (GTE-Qwen), representing 42.7% and 18.4% improvement of similarity over identifier-based (Section IV-A).",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "BM25 (lexical) and GTE-Qwen (semantic) achieve superior performance among retrieval techniques for similarity-based RAG.",
    377       "evidence": "Table I shows BM25 achieving 55.67/69.18 and GTE-Qwen reaching 55.29/68.21 for DeepSeek-V2.5, outperforming CodeBERT, UniXcoder, and CoCoSoDa. Table II confirms this pattern across model scales (Section IV-A, IV-B).",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "GTE-Qwen demonstrates superior performance with incomplete code contexts as queries, unlike other retrieval techniques that perform better with complete code snippets.",
    382       "evidence": "Table II shows that most retrieval techniques improve with complete queries, but GTE-Qwen achieves better results with incomplete queries for larger models (e.g., DeepSeek-V3: 60.28/73.11 incomplete vs. 58.85/71.02 complete). Finding 2 in Section IV-B.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Combining BM25 and GTE-Qwen yields optimal performance in similarity-based RAG for most LLMs 7B+.",
    387       "evidence": "Table III shows BM25+GTE-Qwen achieving 63.62/75.26 CB/ES for DeepSeek-V3 and 63.73/72.25 for Qwen2.5-32B. However, the combination shows 'limited or even negative impact' for models below 7B (Section IV-C).",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Lexical and semantic retrieval techniques have minimal overlap in retrieved candidates, suggesting they capture fundamentally different aspects of code similarity.",
    392       "evidence": "Out of 100 test examples, there are 76, 74, and 64 completely distinct retrieved samples when comparing BM25 with UniXcoder, CoCoSoDa, and GTE-Qwen respectively (Section IV-C).",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "The developer survey confirms that the BM25+GTE-Qwen combination consistently achieves higher scores than either technique alone.",
    397       "evidence": "Figure 2a shows average scores across 3 LLMs, and Figure 2b shows win-rate analysis where the combined technique outperforms single techniques in about half of test cases. However, the survey involves only 3 developers on 52 examples (Section V-A).",
    398       "supported": "weak"
    399     }
    400   ],
    401   "red_flags": [
    402     {
    403       "flag": "Tiny benchmark size",
    404       "detail": "The evaluation benchmark contains only 100 examples for evaluating 26 models across 10+ RAG conditions. With such a small sample, individual examples can disproportionately affect results, yet no statistical significance testing is performed on any comparison."
    405     },
    406     {
    407       "flag": "Company evaluating on its own codebase",
    408       "detail": "Three of six authors are Tencent employees evaluating RAG methods on Tencent's WeChat codebase. The benchmark was annotated by 'senior developers from our group' and the developer survey involved 'three developers from our group.' This creates multiple points where organizational bias could influence results."
    409     },
    410     {
    411       "flag": "No statistical tests on any comparison",
    412       "detail": "Across Tables I–III, hundreds of comparisons are made (26 models × 10 RAG conditions) with claims of 'superiority' and 'consistent improvement,' but no statistical significance tests are conducted. Small differences in CodeBLEU/ES on 100 examples may not be meaningful."
    413     },
    414     {
    415       "flag": "Developer survey with 3 participants",
    416       "detail": "The developer survey (Section V-A) uses only 3 developers evaluating 52 examples. This is far too small for reliable conclusions, yet the results are presented as validation of the automated findings."
    417     },
    418     {
    419       "flag": "No variance or uncertainty quantification",
    420       "detail": "All results are single deterministic runs (temperature=0). While this ensures reproducibility, there is no analysis of sensitivity to other sources of variation (prompt wording, retrieval parameters, benchmark selection), making it impossible to assess result stability."
    421     },
    422     {
    423       "flag": "Proprietary data prevents any independent reproduction",
    424       "detail": "None of the benchmark (100 examples), the retrieval corpus (1,669 repositories), or the code has been released. The prompts are in Chinese and not provided. No external researcher can verify or reproduce any result in this paper."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "GraphCoder: Enhancing Repository-Level Code Completion via Code Context Graph-based Retrieval and Language Model",
    430       "authors": ["W. Liu", "A. Yu", "D. Zan", "B. Shen", "W. Zhang", "H. Zhao", "Z. Jin", "Q. Wang"],
    431       "year": 2024,
    432       "arxiv_id": "2406.07003",
    433       "relevance": "Proposes graph-based RAG for repository-level code completion, directly relevant to code generation evaluation methodology."
    434     },
    435     {
    436       "title": "Language Models for Code Completion: A Practical Evaluation",
    437       "authors": ["M. Izadi", "J. Katzy", "T. van Dam", "M. Otten", "R. M. Popescu", "A. van Deursen"],
    438       "year": 2024,
    439       "relevance": "Practical evaluation of LLMs for code completion at ICSE 2024, directly relevant to LLM code generation capability assessment."
    440     },
    441     {
    442       "title": "RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems",
    443       "authors": ["T. Liu", "C. Xu", "J. J. McAuley"],
    444       "year": 2024,
    445       "relevance": "Benchmark for repository-level code completion, relevant to evaluation methodology for code generation systems."
    446     },
    447     {
    448       "title": "How Practitioners Expect Code Completion?",
    449       "authors": ["C. Wang", "J. Hu", "C. Gao", "Y. Jin", "T. Xie", "H. Huang", "Z. Lei", "Y. Deng"],
    450       "year": 2023,
    451       "relevance": "Studies developer expectations for code completion tools, relevant to understanding practical AI coding tool impact."
    452     },
    453     {
    454       "title": "REPOFUSE: Repository-Level Code Completion with Fused Dual Context",
    455       "authors": ["M. Liang", "X. Xie", "G. Zhang", "X. Zheng", "P. Di", "W. Jiang", "H. Chen", "C. Wang", "G. Fan"],
    456       "year": 2024,
    457       "arxiv_id": "2402.14323",
    458       "relevance": "Repository-level code completion using dependency and similar code retrieval, directly comparable RAG approach for code generation."
    459     },
    460     {
    461       "title": "Repoformer: Selective Retrieval for Repository-Level Code Completion",
    462       "authors": ["D. Wu", "W. U. Ahmad", "D. Zhang", "M. K. Ramanathan", "X. Ma"],
    463       "year": 2024,
    464       "relevance": "Selective retrieval approach for repository-level code completion, relevant to RAG for code generation evaluation."
    465     },
    466     {
    467       "title": "Dataflow-Guided Retrieval Augmentation for Repository-Level Code Completion",
    468       "authors": ["W. Cheng", "Y. Wu", "W. Hu"],
    469       "year": 2024,
    470       "relevance": "Dataflow-guided RAG for code completion presented at ACL 2024, directly relevant to code generation methodology."
    471     },
    472     {
    473       "title": "ReACC: A Retrieval-Augmented Code Completion Framework",
    474       "authors": ["S. Lu", "N. Duan", "H. Han", "D. Guo", "S. Hwang", "A. Svyatkovskiy"],
    475       "year": 2022,
    476       "relevance": "Early retrieval-augmented code completion framework at ACL 2022, foundational work for RAG in code generation."
    477     },
    478     {
    479       "title": "Studying LLM Performance on Closed- and Open-source Data",
    480       "authors": ["T. Ahmed", "C. Bird", "P. Devanbu", "S. Chakraborty"],
    481       "year": 2024,
    482       "arxiv_id": "2402.15100",
    483       "relevance": "Studies the open-source vs closed-source performance gap for LLMs, directly relevant to distribution shift in code generation evaluation."
    484     },
    485     {
    486       "title": "FT2Ra: A Fine-Tuning-Inspired Approach to Retrieval-Augmented Code Completion",
    487       "authors": ["Q. Guo", "X. Li", "X. Xie", "S. Liu", "Z. Tang", "R. Feng", "J. Wang", "J. Ge", "L. Bu"],
    488       "year": 2024,
    489       "relevance": "Retrieval-augmented code completion approach at ISSTA 2024, relevant to code generation methods evaluation."
    490     },
    491     {
    492       "title": "CodeBLEU: a Method for Automatic Evaluation of Code Synthesis",
    493       "authors": ["S. Ren", "D. Guo", "S. Lu", "L. Zhou", "S. Liu", "D. Tang", "N. Sundaresan", "M. Zhou", "A. Blanco", "S. Ma"],
    494       "year": 2020,
    495       "arxiv_id": "2009.10297",
    496       "relevance": "Defines the CodeBLEU metric used widely for evaluating code generation quality, foundational evaluation methodology."
    497     },
    498     {
    499       "title": "STALL+: Boosting LLM-based Repository-level Code Completion with Static Analysis",
    500       "authors": ["J. Liu", "Y. Chen", "M. Liu", "X. Peng", "Y. Lou"],
    501       "year": 2024,
    502       "arxiv_id": "2406.10018",
    503       "relevance": "Uses static analysis to improve LLM-based code completion, relevant to agentic code generation approaches."
    504     }
    505   ],
    506   "engagement_factors": {
    507     "practical_relevance": {
    508       "score": 3,
    509       "justification": "Directly applicable to practitioners building RAG-based code completion systems for proprietary codebases, with specific guidance on retrieval technique selection."
    510     },
    511     "surprise_contrarian": {
    512       "score": 1,
    513       "justification": "Results largely confirm expected benefits of RAG for code completion; the finding that BM25+semantic combination works best is mildly interesting but not surprising."
    514     },
    515     "fear_safety": {
    516       "score": 0,
    517       "justification": "No safety, security, or AI risk concerns are raised by this work."
    518     },
    519     "drama_conflict": {
    520       "score": 0,
    521       "justification": "No controversy or conflict; a straightforward empirical evaluation."
    522     },
    523     "demo_ability": {
    524       "score": 0,
    525       "justification": "No code, data, or demo is released; everything is proprietary to WeChat/Tencent."
    526     },
    527     "brand_recognition": {
    528       "score": 2,
    529       "justification": "WeChat/Tencent is widely recognized, though not as closely associated with AI research as OpenAI or Google DeepMind."
    530     }
    531   }
    532 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs