scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29746B)
      1 {
      2   "paper": {
      3     "title": "Prompt-Based Code Completion via Multi-Retrieval Augmented Generation",
      4     "authors": [
      5       "Hanzhuo Tan",
      6       "Qi Luo",
      7       "Ling Jiang",
      8       "Zizheng Zhan",
      9       "Jing Li",
     10       "Haotian Zhang",
     11       "Yuqun Zhang"
     12     ],
     13     "year": 2024,
     14     "venue": "ACM Transactions on Software Engineering and Methodology",
     15     "arxiv_id": "2405.07530",
     16     "doi": "10.1145/3725812"
     17   },
     18   "scan_version": 3,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval"],
     21   "key_findings": "ProCC, a multi-retrieval augmented code completion framework using prompt engineering and contextual multi-armed bandits (LinUCB), outperforms prior RAG-based approaches by 8.6% and 10.1% Exact Match on open-source and private-domain benchmarks respectively. Prompt-based multi-retrievers (lexical semantics, hypothetical line, code summarization) achieve comparable performance to dedicated encoding models without additional training. The LinUCB adaptive selection algorithm significantly outperforms simpler alternatives including direct concatenation, max similarity, and logistic regression. Applying ProCC to a fine-tuned Code Llama yields an additional 5.6% EM improvement.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Section 8 states 'we provide the repository [1] for all the other available materials, including the source code of the artifact and the open-source dataset.' Reference [1] links to https://github.com/anonepo/issta2024pcc. However, this appears to be an anonymized review-period URL and may not be permanently accessible."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The open-source benchmark dataset (20 GitHub repositories) is stated to be included in the repository. The private-domain benchmark from the e-commerce company is explicitly withheld: 'the dataset containing the private-domain code of the company shall remain undisclosed' (Section 8)."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper mentions '8 NVIDIA A100-80GB GPUs', 'Text-Generation-Inference framework', Python, and Faiss, but provides no requirements.txt, Dockerfile, or detailed dependency listing with library versions sufficient to recreate the environment."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper does not include step-by-step reproduction instructions. While a repository is referenced, the paper itself provides no commands, README description, or scripts for replicating the main experiments."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "All tables (Tables 1, 3, 5, 6) report only point estimates with no confidence intervals, error bars, or ± notation."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper repeatedly claims ProCC 'outperforms' and 'significantly improves' over baselines based solely on comparing point estimates. No p-values, t-tests, or any statistical significance tests are reported."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Percentage improvements are reported with baseline context throughout, e.g., '14.1% EM improvement over the vanilla model (from 47.90 to 54.66)' and '8.6% EM improvement over the previous SOTA technique RepoCoder (from 50.31 to 54.66)' in Section 4.2.1."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The test set sizes (3317 open-source, 3074 private-domain) are stated but not justified. No power analysis or justification for why these sizes are sufficient for the claims made."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The Threats to Validity section states 'we averaged results over five runs, reducing variance,' but no standard deviations, interquartile ranges, or spread measures are reported anywhere in the results tables."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "The paper compares against BM25, ReACC (ACL 2022), and RepoCoder (EMNLP 2023), plus a vanilla base model, across both benchmark suites (Tables 1, 3)."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "RepoCoder (EMNLP 2023) and GTE-large (August 2023) for the ReACC re-implementation are contemporary baselines for a 2024 paper. The paper states RepoCoder is the SOTA technique."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "RQ2 (Section 4.2.2) provides extensive ablations: individual retriever perspectives (Table 4), perspective combinations (Table 5), and selection algorithm comparisons (Table 6). The Venn diagram (Figure 3) shows unique contributions of each retriever."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Two metrics are used throughout: Exact Match (EM) and Edit Similarity (ES), as stated in Section 4.1.4."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No human evaluation is conducted. All evaluation is automated using EM and ES metrics."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 4.1.3 describes a clear split: '10% of the files as the test set, 10% for validation used in training the adaptive retrieval selection algorithm, and the rest as retrieval data.'"
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Results are broken down by task type (Function Body vs. Random Line) in Table 1 and by domain (open-source vs. private-domain) in Tables 1 and 3."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "While Figure 1 shows motivating scenarios, no systematic error analysis or failure case discussion is provided for ProCC's outputs."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "Every configuration and combination of ProCC shows improvement. The paper notes that direct concatenation ('Union') provides only marginal gain (Table 6) and that expanding to 6 perspectives yields marginal improvement over 3, but these are framed as validation of design choices rather than negative findings."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Abstract claims of 8.6% improvement on open-source (Table 1: 50.31→54.66), 10.1% on private-domain (Table 3: 41.84→46.07), and 5.6% over fine-tuned (Figure 4: 60.75→64.17) are all supported by the results."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The paper makes causal claims via ablation studies (Tables 5, 6) that systematically add/remove components and selection algorithms, providing adequate controlled single-variable manipulation for claims like 'incorporating varied perspectives enriches representations.'"
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper's title claims 'Code Completion' broadly, yet the evaluation appears to cover only Java code repositories (inferred from the code examples). The programming languages used in the 20 open-source and 58 private repositories are never explicitly stated, and no scope boundaries on language or domain are articulated."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The Threats to Validity section (Section 5) addresses implementation bugs, benchmark selection, and randomness, but does not consider alternative explanations for the results (e.g., whether improvements come from additional context length rather than multi-perspective semantics, or whether the LinUCB simply learns input difficulty)."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper measures Exact Match and Edit Similarity and claims 'code completion effectiveness' — these metrics directly measure code completion quality at the token level without overreaching into broader framing like 'developer productivity.'"
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Section 4.1.1 specifies 'StarCoderBase 15.5B' and 'Code Llama 13b-Instruct' — these are open-source models with specific architecture identifiers (parameter count + variant), obtained from Hugging Face."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Full prompt templates are provided in Section 3.2 and Table 4: 'Embedding the following code snippets: [code]', '<PRE> [Prefix] <SUF> [Suffix] <MID>', and 'This code snippets of [code] means'. The placeholders ([code], [Prefix], [Suffix]) represent the deterministic input (the incomplete code), so the full prompts are reconstructible."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 4.1.5 reports: LinUCB α=0.1, greedy decoding, fine-tuning batch_size=12, learning_rate=2e-5, AdamW optimizer, 2 epochs. Greedy decoding minimizes generation randomness."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No agentic scaffolding is used. ProCC is a retrieval-augmented generation pipeline with no agent loops, tool use, or feedback mechanisms."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 4.1.3 documents: following RepoCoder's protocol to crawl 20 GitHub repos, 10%/10%/80% test/validation/retrieval split, random line selection (3 per file), function extraction for all test files, yielding 3317 and 3074 test instances."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 5 'Threats to Validity' provides a substantive discussion organized into internal, external, and construct validity threats."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 5 discusses specific mitigations: using original source code and identical hyperparameters for baselines (internal), including industry data unknown to LLMs alongside established benchmarks (external), averaging over five runs to reduce randomness (external), and using two established metrics (construct)."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "The paper does not explicitly state what the results do NOT show — no mention of untested languages, model families, or completion scenarios that fall outside the evaluation scope."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "The open-source benchmark data is stated to be available in the GitHub repository [1]. Private-domain data is withheld but this limitation is explicitly disclosed in Section 8."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 4.1.3 describes: crawling 20 high-quality GitHub repositories following RepoCoder's protocol, same protocol applied to 58 private-domain repositories from an e-commerce company with ~1 billion MAU."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants. Data sources are GitHub repositories (standard public data) and private company repositories."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Section 4.1.3 documents the pipeline: crawl repos → split files (80/10/10) → select test cases (3 random lines per file for RL, all functions for FB) → final counts (3317 open-source, 3074 private-domain instances)."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding section, acknowledgments, or grant information appears in the paper."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are listed: Southern University of Science and Technology, The Hong Kong Polytechnic University, and Kwai Inc. The Kwai affiliation is relevant given the private-domain evaluation likely uses Kwai-related data."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "Two authors (Zizheng Zhan, Haotian Zhang) are affiliated with Kwai Inc., and the private-domain benchmark comes from 'an e-commerce company with around one billion Monthly Active Users' — likely Kwai or a partner. This creates a potential conflict where the company has interest in showing its data benefits from the proposed technique."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests statement or financial disclosure appears in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "The paper states StarCoder is 'trained on 1 trillion tokens from GitHub' and Code Llama on '500B code data' but does not specify training data cutoff dates for either model."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": true,
    243         "justification": "Section 4.1.3 explicitly acknowledges: 'Given that LLMs are pre-trained on expanded GitHub datasets, it might inadvertently encompass elements from our test set and lead to the risk of test set contamination.' They construct a private-domain benchmark as mitigation."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": true,
    248         "justification": "The paper constructs a private-domain benchmark from proprietary code specifically to mitigate contamination risk: 'To alleviate this issue, we construct another benchmark based on private-domain code from an e-commerce company' (Section 4.1.3)."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this study."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": true,
    292         "justification": "Section 4.2.3 reports: 'ProCC is considerably more resource-efficient and operable on a single A100 GPU' with 'embedding and search processes aggregating to approximately 0.5 seconds on the same device.'"
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": true,
    297         "justification": "Section 4.1.5 states: 'all experiments are performed on a cluster equipped with 8 NVIDIA A100-80GB GPUs.' Fine-tuning takes 'approximately 9.5 hours on the 8×A100 cluster' (Section 4.2.3)."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The paper mentions 'averaged results over five runs' (Section 5) but reports no variance, standard deviation, or sensitivity analysis across seeds."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "Section 5 (External Validity) states: 'we averaged results over five runs, reducing variance.'"
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "LinUCB α is set to 0.1 and fine-tuning hyperparameters are specified, but no search budget is reported. The paper mentions 'multiple reasonable training setups' with results in the GitHub repo (footnote 1) but does not quantify the search in the paper."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": true,
    319         "justification": "Section 4.1.3 describes a clear validation set (10% of files) used for training the LinUCB algorithm, separate from the test set used for final evaluation."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The authors re-implement ReACC with a different encoding model (GTE-large instead of the original) since 'it does not provide complete reproducible encoding models for retrieval' (Section 4.1.2). This substitution could bias results but is not acknowledged as a potential bias."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "While the paper compares ProCC's cost (~0.5s on 1 A100) to fine-tuning (9.5h on 8×A100), performance is not systematically reported as a function of compute budget, and no performance-compute curves are provided."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "The paper uses Exact Match and Edit Similarity without discussing whether these metrics capture meaningful code completion quality (e.g., whether exact match is too strict or whether edit similarity handles semantic equivalence)."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": false,
    343         "answer": false,
    344         "justification": "No agentic scaffolding is involved. ProCC is a retrieval pipeline, not a scaffold-dependent system."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "The paper acknowledges contamination risk but does not analyze temporal relationships between when the 20 GitHub repositories were created/published and the model training data cutoff dates."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether the evaluation setup leaks information through context (e.g., whether the retrieval database provides hints that wouldn't be available in real deployment)."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of whether the training/retrieval data and test data from the same repositories share structural similarities or near-duplicate code patterns that could inflate results."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No concrete leakage detection or prevention methods are applied. The private-domain benchmark is a contamination mitigation strategy but not a detection method for the open-source benchmark."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "ProCC outperforms state-of-the-art code completion technique by 8.6% on the open-source benchmark in terms of Exact Match.",
    373       "evidence": "Table 1 shows ProCC achieves 54.66 average EM on Code Llama vs RepoCoder's 50.31 on the open-source benchmark suite (Section 4.2.1).",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "ProCC outperforms SOTA by 10.1% on the private-domain benchmark in terms of Exact Match.",
    378       "evidence": "Table 3 shows ProCC achieves 46.07 average EM on Code Llama vs BM25's 41.84 on the private-domain benchmark suite (Section 4.2.1).",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Prompt-based retrievers are robust across templates and comparable to external encoders.",
    383       "evidence": "Table 4 shows prompt variations produce EM scores between 50.41-51.61, comparable to ReACC (50.31) and RepoCoder (50.31) which use dedicated encoding models (Section 4.2.2).",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Combining multiple retrieval perspectives outperforms individual perspectives.",
    388       "evidence": "Table 5 shows combinations (52.43-54.88 EM) consistently outperform the best single perspective (51.61 EM for hypothetical line). Table 6 shows LinUCB (54.66) beats Union (51.97), Max Similarity (52.37), and Logistic Regression (53.21).",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "ProCC improves fine-tuned models by 5.6% in a plug-and-play manner.",
    393       "evidence": "Figure 4 shows fine-tuned Code Llama achieves 60.75 EM; adding ProCC increases it to 64.17 on the open-source benchmark (Section 4.2.3).",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "ProCC shows even more significant enhancement on private-domain data where LLMs are limited.",
    398       "evidence": "Table 3 shows a 36.8% EM increase over baseline (33.67 to 46.07) on private-domain, compared to 14.1% on open-source (Section 4.2.1). This is consistent with RAG benefiting more when model knowledge is limited.",
    399       "supported": "moderate"
    400     }
    401   ],
    402   "red_flags": [
    403     {
    404       "flag": "No statistical significance tests",
    405       "detail": "The paper claims ProCC 'significantly improves' and 'outperforms' baselines throughout, but reports no p-values, confidence intervals, or other significance testing. All comparisons are based on point estimates from averaged runs without reported variance, making it impossible to determine whether differences are statistically meaningful."
    406     },
    407     {
    408       "flag": "Variance not reported despite averaging over multiple runs",
    409       "detail": "The paper states results are 'averaged over five runs' but never reports standard deviations or variance. This makes it impossible to assess result stability or whether the reported improvements are within the noise margin."
    410     },
    411     {
    412       "flag": "Baseline re-implementation with different model",
    413       "detail": "ReACC is re-implemented using GTE-large instead of its original encoding model because 'it does not provide complete reproducible encoding models for retrieval.' This substitution could systematically bias the comparison, as the original model may perform differently."
    414     },
    415     {
    416       "flag": "Potential industry conflict of interest",
    417       "detail": "Two authors are from Kwai Inc., and the private-domain benchmark appears to come from Kwai's codebase. The paper shows larger improvements on private-domain data (10.1%) than open-source (8.6%), which is the more favorable narrative for industry adoption. No conflict of interest statement is provided."
    418     },
    419     {
    420       "flag": "Anonymized repository URL may be inaccessible",
    421       "detail": "The code repository URL (https://github.com/anonepo/issta2024pcc) appears to be a temporary anonymized repository for review, raising concerns about long-term artifact availability."
    422     }
    423   ],
    424   "cited_papers": [
    425     {
    426       "title": "Evaluating large language models trained on code",
    427       "authors": ["Mark Chen", "Jerry Tworek"],
    428       "year": 2021,
    429       "arxiv_id": "2107.03374",
    430       "relevance": "Introduces Codex and the HumanEval benchmark, foundational for LLM code generation evaluation."
    431     },
    432     {
    433       "title": "StarCoder: may the source be with you!",
    434       "authors": ["Raymond Li", "Loubna Ben Allal"],
    435       "year": 2023,
    436       "arxiv_id": "2305.06161",
    437       "relevance": "Open-source code LLM used as a base model in this paper's experiments."
    438     },
    439     {
    440       "title": "Code llama: Open foundation models for code",
    441       "authors": ["Baptiste Roziere", "Jonas Gehring"],
    442       "year": 2023,
    443       "arxiv_id": "2308.12950",
    444       "relevance": "Open-source code LLM used as a base model in this paper's experiments, featuring fill-in-the-middle pre-training."
    445     },
    446     {
    447       "title": "ReACC: A Retrieval-Augmented Code Completion Framework",
    448       "authors": ["Shuai Lu", "Nan Duan"],
    449       "year": 2022,
    450       "relevance": "Pioneering RAG-based code completion technique using dual-encoder retrieval, key baseline."
    451     },
    452     {
    453       "title": "RepoCoder: Repository-Level Code Completion Through Iterative Retrieval and Generation",
    454       "authors": ["Fengji Zhang", "Bei Chen"],
    455       "year": 2023,
    456       "relevance": "State-of-the-art RAG code completion technique using iterative retrieval, primary baseline and benchmark protocol source."
    457     },
    458     {
    459       "title": "CrossCodeEval: A Diverse and Multilingual Benchmark for Cross-File Code Completion",
    460       "authors": ["Yangruibo Ding", "Zijian Wang"],
    461       "year": 2023,
    462       "relevance": "Cross-file code completion benchmark relevant to repository-level evaluation."
    463     },
    464     {
    465       "title": "RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems",
    466       "authors": ["Tianyang Liu", "Canwen Xu"],
    467       "year": 2023,
    468       "relevance": "Repository-level code completion benchmark for evaluating completion systems."
    469     },
    470     {
    471       "title": "Retrieval-augmented generation for knowledge-intensive nlp tasks",
    472       "authors": ["Patrick Lewis", "Ethan Perez"],
    473       "year": 2020,
    474       "relevance": "Foundational RAG paper establishing the retrieval-augmented generation paradigm used in this work."
    475     },
    476     {
    477       "title": "InCoder: A Generative Model for Code Infilling and Synthesis",
    478       "authors": ["Daniel Fried", "Armen Aghajanyan"],
    479       "year": 2022,
    480       "relevance": "Code infilling model that established fill-in-the-middle benchmarks for code completion."
    481     },
    482     {
    483       "title": "UniXcoder: Unified Cross-Modal Pre-training for Code Representation",
    484       "authors": ["Daya Guo", "Shuai Lu"],
    485       "year": 2022,
    486       "relevance": "Code representation model used as the encoding model for RepoCoder baseline."
    487     },
    488     {
    489       "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages",
    490       "authors": ["Zhangyin Feng", "Daya Guo"],
    491       "year": 2020,
    492       "relevance": "Pioneering pre-trained code model for code understanding tasks."
    493     },
    494     {
    495       "title": "Codet5+: Open code large language models for code understanding and generation",
    496       "authors": ["Yue Wang", "Hung Le"],
    497       "year": 2023,
    498       "arxiv_id": "2305.07922",
    499       "relevance": "Open-source code LLM advancing code understanding and generation capabilities."
    500     }
    501   ],
    502   "engagement_factors": {
    503     "practical_relevance": {
    504       "score": 2,
    505       "justification": "ProCC is a directly usable RAG technique for code completion that works with off-the-shelf models and requires no additional training of encoding models."
    506     },
    507     "surprise_contrarian": {
    508       "score": 1,
    509       "justification": "The finding that prompts can replace dedicated encoding models for retrieval is modestly novel, but multi-perspective retrieval being better than single is expected."
    510     },
    511     "fear_safety": {
    512       "score": 0,
    513       "justification": "No safety or security concerns raised; the paper focuses on code completion accuracy."
    514     },
    515     "drama_conflict": {
    516       "score": 0,
    517       "justification": "No controversy or conflict; a straightforward technical contribution."
    518     },
    519     "demo_ability": {
    520       "score": 1,
    521       "justification": "Code is released via GitHub but requires significant infrastructure (A100 GPU, model downloads) to run."
    522     },
    523     "brand_recognition": {
    524       "score": 1,
    525       "justification": "Uses well-known models (Code Llama, StarCoder) but authors are from relatively less prominent institutions (SUSTech, PolyU, Kwai)."
    526     }
    527   }
    528 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs