scan.json (27615B)
1 { 2 "paper": { 3 "title": "Challenge on Optimization of Context Collection for Code Completion", 4 "authors": [ 5 "Dmitry Ustalov", 6 "Egor Bogomolov", 7 "Alexander Bezzubov", 8 "Yaroslav Golubev", 9 "Evgeniy Glukhov", 10 "Georgii Levtsov", 11 "Vladimir Kovalenko" 12 ], 13 "year": 2025, 14 "venue": "ASE 2025 Workshops (40th IEEE/ACM International Conference on Automated Software Engineering)", 15 "arxiv_id": "2510.04349" 16 }, 17 "checklist": { 18 "artifacts": { 19 "code_released": { 20 "applies": true, 21 "answer": true, 22 "justification": "A starter kit with baseline solutions is released on GitHub (https://github.com/JetBrains-Research/ase2025-starter-kit), referenced in Section III. The private phase submissions from competing teams are also included in the Zenodo dataset." 23 }, 24 "data_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "Section II.C states: 'The complete competition dataset, including ground truth data, repositories, and private phase submissions, can be downloaded from Zenodo [18] under the CC BY 4.0 license.' A specific Zenodo URL is provided." 28 }, 29 "environment_specified": { 30 "applies": true, 31 "answer": false, 32 "justification": "No environment specification (requirements.txt, Dockerfile, conda environment, or dependency versions) is described in the paper for reproducing the evaluation pipeline. The starter kit may contain this, but the paper itself does not document it." 33 }, 34 "reproduction_instructions": { 35 "applies": true, 36 "answer": false, 37 "justification": "While the competition workflow is described at a high level (Section II.A), there are no step-by-step reproduction instructions for rerunning the evaluation pipeline. The paper describes the conceptual workflow but not concrete commands or scripts to replicate the results." 
38 } 39 }, 40 "statistical_methodology": { 41 "confidence_intervals_or_error_bars": { 42 "applies": true, 43 "answer": false, 44 "justification": "Tables II-V report only point estimates of chrF scores. No confidence intervals, error bars, or uncertainty measures are provided for any results." 45 }, 46 "significance_tests": { 47 "applies": true, 48 "answer": false, 49 "justification": "Rankings are determined solely by comparing raw chrF averages. No statistical significance tests are used to determine whether differences between teams are statistically meaningful. For example, in Table V, teams WSPR NCSU and REALISE Lab share rank 3 due to 'negligible difference' (0.660 vs 0.659), but this is judged informally rather than through a statistical test." 50 }, 51 "effect_sizes_reported": { 52 "applies": true, 53 "answer": false, 54 "justification": "Only raw chrF scores are reported. No effect sizes, percentage improvements over baselines with context, or standardized measures of difference magnitude are provided." 55 }, 56 "sample_size_justified": { 57 "applies": true, 58 "answer": false, 59 "justification": "The dataset sizes (688 Python completion points, 1,076 Kotlin completion points) are stated but not justified. No power analysis or reasoning for why these sizes are sufficient for reliable ranking is provided." 60 }, 61 "variance_reported": { 62 "applies": true, 63 "answer": false, 64 "justification": "Results are single-run evaluations. No variance, standard deviation, or spread measures across multiple runs or completion points are reported. Only averages over all completion points are shown, both per model and across the three models." 65 } 66 }, 67 "evaluation_design": { 68 "baselines_included": { 69 "applies": true, 70 "answer": true, 71 "justification": "Section II.B describes several baselines: empty context, random recent file, most similar file according to BM25, and recent files strategy. These appear in Tables II-V for comparison."
72 }, 73 "baselines_contemporary": { 74 "applies": true, 75 "answer": true, 76 "justification": "The baselines (BM25, recent files) represent common practical strategies used in modern IDEs, as stated in Section II.B: 'Based on our experience, most IDEs on the market use the recent files strategy.' These are appropriate for the competition setting." 77 }, 78 "ablation_study": { 79 "applies": false, 80 "answer": false, 81 "justification": "This is a competition report, not a system paper. The organizers did not propose a single system to ablate. Individual team papers may contain ablations, but this overview paper describes the competition structure and results." 82 }, 83 "multiple_metrics": { 84 "applies": true, 85 "answer": false, 86 "justification": "Only chrF is used as the evaluation metric. Section II.B justifies this choice by citing Evtikhiev et al. [14], but no additional metrics (e.g., exact match, BLEU, CodeBLEU, pass@k) are reported." 87 }, 88 "human_evaluation": { 89 "applies": true, 90 "answer": false, 91 "justification": "No human evaluation of completion quality is included. All evaluation is automated via the chrF metric. Human judgment of code completion quality could provide complementary insights but was not conducted." 92 }, 93 "held_out_test_set": { 94 "applies": true, 95 "answer": true, 96 "justification": "Section II.A describes a clear separation: practice, public, and private phases with non-overlapping data. Section II.C confirms: 'no repository appeared in more than one subset of the data (practice, public, private).' Final rankings used the private held-out dataset." 97 }, 98 "per_category_breakdown": { 99 "applies": true, 100 "answer": true, 101 "justification": "Results are broken down by programming language (Python vs Kotlin) and by model (Mellum, Codestral, Qwen chrF scores reported separately in all tables). This provides meaningful per-category information." 
102 }, 103 "failure_cases_discussed": { 104 "applies": true, 105 "answer": false, 106 "justification": "No failure cases or error analysis is presented. The paper does not discuss which types of completion points were hardest, where solutions failed, or qualitative examples of poor completions." 107 }, 108 "negative_results_reported": { 109 "applies": true, 110 "answer": false, 111 "justification": "No negative results are reported. The paper does not discuss approaches that failed to beat baselines (besides listing rankings), configurations that were tried and abandoned by the organizers, or design decisions that did not work." 112 } 113 }, 114 "claims_and_evidence": { 115 "abstract_claims_supported": { 116 "applies": true, 117 "answer": true, 118 "justification": "The abstract's claims are modest and factual: it describes the competition structure, number of participants (19 Python, 8 Kotlin public; 6 private), and that solutions were evaluated using chrF. These are all supported by the detailed content in the paper." 119 }, 120 "causal_claims_justified": { 121 "applies": true, 122 "answer": false, 123 "justification": "Section I states 'Context quality is so critical that a smaller model with better context clues can outperform a larger, more capable model [8]' -- a causal claim attributed to cited work but used to motivate the competition. More importantly, the paper implicitly treats chrF differences as reflecting context quality differences, but confounds between team skill, implementation details, and context strategy are not disentangled." 124 }, 125 "generalization_bounded": { 126 "applies": true, 127 "answer": false, 128 "justification": "The conclusion states the competition 'allowed us to explore the solution space and identify the most practical strategies for context collection' -- this generalizes beyond the specific dataset, models, and metric tested. 
The paper does not bound its conclusions to the specific Python/Kotlin datasets, three models, and chrF metric used." 129 }, 130 "alternative_explanations_discussed": { 131 "applies": true, 132 "answer": false, 133 "justification": "No alternative explanations for performance differences are discussed. For example, performance differences could stem from engineering quality rather than context strategy design, or from overfitting to specific models during the public phase. Section IV.G mentions data leakage but does not discuss other confounds." 134 } 135 }, 136 "setup_transparency": { 137 "model_versions_specified": { 138 "applies": true, 139 "answer": false, 140 "justification": "The three evaluation models are named as 'Codestral by Mistral AI [10]', 'Qwen2.5-Coder [11]', and 'Mellum [12]'. No specific version identifiers, snapshot dates, or model sizes are given for Codestral or Qwen2.5-Coder. Mellum links to a blog post but no specific checkpoint is identified for the evaluation." 141 }, 142 "prompts_provided": { 143 "applies": true, 144 "answer": false, 145 "justification": "Section II.A describes that the platform 'converted each context into a model-specific prompt... by inserting the model-specific special tokens and arranging context, prefix, and suffix in the format required by the model.' The actual prompt templates and special tokens used for each model are not provided in the paper." 146 }, 147 "hyperparameters_reported": { 148 "applies": true, 149 "answer": false, 150 "justification": "No inference hyperparameters (temperature, top-p, max tokens, beam size) are reported for the three completion models. These can significantly affect code completion quality but are not disclosed." 151 }, 152 "scaffolding_described": { 153 "applies": false, 154 "answer": false, 155 "justification": "No agentic scaffolding is used. This is a straightforward pipeline: context collection -> prompt construction -> model inference -> metric computation." 
156 }, 157 "data_preprocessing_documented": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section II.C describes the dataset creation process: repositories were collected from permissively-licensed GitHub projects, commit histories were enumerated, multi-line insertions were identified using content-based heuristics, and data was split into practice/public/private subsets with no repository overlap. Sizes are given for each split." 161 } 162 }, 163 "limitations_and_scope": { 164 "limitations_section_present": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section IV.G 'Threats to Validity' is present and discusses a specific concern about data leakage from solutions using all available repository data including future revisions." 168 }, 169 "threats_to_validity_specific": { 170 "applies": true, 171 "answer": true, 172 "justification": "Section IV.G identifies a specific threat: 'the solutions by teams SpareCodeComplete and NoMoreActimel use all available data in the provided dataset, including other repositories and all the revisions for the given repository. Such an approach may lead to a data leakage from the future versions of the same repository.' This is specific to this competition." 173 }, 174 "scope_boundaries_stated": { 175 "applies": true, 176 "answer": false, 177 "justification": "The paper does not explicitly state what the results do not show. For example, it does not acknowledge that results are limited to: only two programming languages, only fill-in-the-middle completion, only three specific models, only the chrF metric, or only permissively licensed open-source code. No explicit scope boundaries or 'what we do not claim' section is present." 
178 } 179 }, 180 "data_integrity": { 181 "raw_data_available": { 182 "applies": true, 183 "answer": true, 184 "justification": "The complete competition dataset including ground truth, repositories, and private phase submissions is available on Zenodo [18] under CC BY 4.0 license, enabling independent verification." 185 }, 186 "data_collection_described": { 187 "applies": true, 188 "answer": true, 189 "justification": "Section II.C describes the collection procedure: built on Long Code Arena [6], collected from permissively-licensed open-source GitHub repositories, enumerated commit histories, retained commits with significant multi-line insertions, generated multi-line splits using in-house code analysis tools." 190 }, 191 "recruitment_methods_described": { 192 "applies": false, 193 "answer": false, 194 "justification": "No human participants in a research sense. The competition participants are not research subjects. Data comes from public GitHub repositories, making this NA." 195 }, 196 "data_pipeline_documented": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section II.C documents the pipeline: selection of large permissively-licensed repositories -> commit history enumeration -> filtering for multi-line insertions -> split generation using code analysis tools -> separation into practice/public/private subsets. Final counts are provided (102 repositories, 1,176 revisions, 1,764 completion points)." 200 } 201 }, 202 "conflicts_of_interest": { 203 "funding_disclosed": { 204 "applies": true, 205 "answer": false, 206 "justification": "No explicit funding disclosure is present. The Acknowledgments section thanks colleagues at JetBrains and Mistral AI but does not state funding sources or grants. Mistral AI 'offset' Codestral API expenses, but formal funding is not disclosed." 
207 }, 208 "affiliations_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "All authors' JetBrains affiliations are clearly stated in the author block. JetBrains organized the competition and one of the three evaluation models (Mellum) is JetBrains' own product." 212 }, 213 "funder_independent_of_outcome": { 214 "applies": true, 215 "answer": false, 216 "justification": "JetBrains organized and funded the competition, and one of the three evaluation models (Mellum) is JetBrains' product. Mistral AI co-sponsored and provided Codestral. Both organizations have a commercial interest in demonstrating that context collection improves their models' performance. The funders are not independent of the outcome." 217 }, 218 "financial_interests_declared": { 219 "applies": true, 220 "answer": false, 221 "justification": "No competing interests or financial interests statement is present in the paper. The authors are JetBrains employees evaluating a competition that includes JetBrains' own model, but no conflict of interest declaration is provided." 222 } 223 }, 224 "contamination": { 225 "training_cutoff_stated": { 226 "applies": true, 227 "answer": false, 228 "justification": "Three pre-trained models (Codestral, Qwen2.5-Coder, Mellum) are used to evaluate code completions on public GitHub repositories. No training data cutoff dates are stated for any of the models. The dataset is constructed from public repos that could have been in training data." 229 }, 230 "train_test_overlap_discussed": { 231 "applies": true, 232 "answer": false, 233 "justification": "The paper does not discuss whether the GitHub repositories in the competition dataset might have appeared in the training data of Codestral, Qwen2.5-Coder, or Mellum. Section II.C mentions temporal separation via commit histories but only for context-to-target leakage within the competition, not for model training data contamination." 
234 }, 235 "benchmark_contamination_addressed": { 236 "applies": true, 237 "answer": false, 238 "justification": "The dataset is derived from public GitHub repositories. All three models were likely trained on public code. The paper does not address whether the specific repositories or code patterns in the dataset appeared in model training data. Section IV.G discusses data leakage within solutions but not model training contamination." 239 } 240 }, 241 "human_studies": { 242 "pre_registered": { 243 "applies": false, 244 "answer": false, 245 "justification": "No human participants in a research sense. Competition participants are not human subjects." 246 }, 247 "irb_or_ethics_approval": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants in a research sense. Competition participants are not human subjects." 251 }, 252 "demographics_reported": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in a research sense. Competition participants are not human subjects." 256 }, 257 "inclusion_exclusion_criteria": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in a research sense. Competition participants are not human subjects." 261 }, 262 "randomization_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in a research sense. This is not an experimental study with randomized conditions." 266 }, 267 "blinding_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in a research sense. This is not an experimental study requiring blinding." 271 }, 272 "attrition_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in a research sense. Competition participants are not human subjects." 
276 } 277 }, 278 "cost_and_practicality": { 279 "inference_cost_reported": { 280 "applies": true, 281 "answer": false, 282 "justification": "The competition required running three neural models across thousands of completion points. No API costs, compute costs, or inference time per completion are reported. Section III mentions 'API keys at our expense' but does not quantify costs." 283 }, 284 "compute_budget_stated": { 285 "applies": true, 286 "answer": false, 287 "justification": "No computational budget is stated. The paper does not report hardware used for evaluation, total API spend, GPU hours for running the models, or wall-clock time for the evaluation process." 288 } 289 } 290 }, 291 "claims": [ 292 { 293 "claim": "Context collection strategy significantly affects code completion quality, to the extent that a smaller model with better context can outperform a larger model.", 294 "evidence": "Cited as reference [8] in Section I. Not directly demonstrated in this paper's results, but the competition results show substantial variation in chrF scores across different context strategies (e.g., Python private phase ranges from 0.610 to 0.734 average chrF, Tables IV-V).", 295 "supported": "moderate" 296 }, 297 { 298 "claim": "Nineteen teams submitted solutions to the Python track and eight teams submitted solutions to the Kotlin track during the public phase.", 299 "evidence": "Stated in the abstract and Section III, confirmed by the leaderboard tables (Tables II and III).", 300 "supported": "strong" 301 }, 302 { 303 "claim": "Most competitive strategies share common traits: using parsing tools to extract definitions and symbols, applying classical IR ranking (e.g., BM25), and assembling context within token budgets.", 304 "evidence": "Section IV describes six private-phase solutions. Five of six use some form of symbol extraction (AST, Tree-sitter, PSI), and four of six use BM25 or similar retrieval. 
Summarized in the opening of Section IV.", 305 "supported": "moderate" 306 }, 307 { 308 "claim": "The competition dataset is the first to provide multi-line fill-in-the-middle completion with separate Python and Kotlin tracks from permissively licensed code.", 309 "evidence": "Section II.C describes differences from Long Code Arena: 'it provides multi-line completion using a fill-in-the-middle approach rather than prefix-based completion, it includes subsets for Python and Kotlin programming languages.' No explicit claim of being 'first' but implicitly positioned as novel.", 310 "supported": "moderate" 311 }, 312 { 313 "claim": "Some solutions may suffer from data leakage because they use all available repository data including future revisions.", 314 "evidence": "Section IV.G identifies that teams SpareCodeComplete and NoMoreActimel 'use all available data in the provided dataset, including other repositories and all the revisions for the given repository' which 'may lead to a data leakage from the future versions of the same repository.'", 315 "supported": "moderate" 316 } 317 ], 318 "methodology_tags": [ 319 "benchmark-eval" 320 ], 321 "key_findings": "This paper describes a community competition at ASE 2025 on optimizing context collection for fill-in-the-middle code completion in Python and Kotlin. The winning solutions combined code parsing (AST, Tree-sitter) with information retrieval (BM25, FAISS) to select relevant context snippets from repositories. The top teams (NoMoreActimel for Python, SpareCodeComplete for Kotlin) substantially outperformed baselines, with average chrF scores of 0.734 and 0.748, respectively, on the private test sets. The competition dataset of 1,764 completion points from 102 repositories has been released on Zenodo.", 322 "red_flags": [ 323 { 324 "flag": "Organizer conflict of interest", 325 "detail": "All authors are JetBrains employees.
One of the three evaluation models (Mellum) is JetBrains' own product, and Mistral AI (co-sponsor) provided Codestral. The competition design, metric selection, and dataset construction were all controlled by parties with commercial interest in the evaluated models. No conflict of interest statement is provided." 326 }, 327 { 328 "flag": "No statistical analysis of ranking differences", 329 "detail": "Team rankings are determined solely by raw chrF averages with no statistical tests. Some score differences are very small (e.g., WSPR NCSU 0.660 vs REALISE Lab 0.659 on Kotlin private) and it is unclear whether rankings reflect meaningful performance differences or noise." 330 }, 331 { 332 "flag": "Single metric evaluation", 333 "detail": "Only chrF is used as the evaluation metric. Code completion quality is multidimensional (syntactic correctness, semantic correctness, usefulness to developers), and chrF captures only character-level similarity. This limits the conclusions that can be drawn about which context strategies are truly better." 334 }, 335 { 336 "flag": "Benchmark contamination not addressed", 337 "detail": "The dataset is derived from public GitHub repositories. All three evaluation models were likely trained on public code. The paper addresses within-competition temporal leakage but does not discuss whether the models may have seen the evaluation code during pre-training." 338 }, 339 { 340 "flag": "Acknowledged data leakage risk in winning solutions", 341 "detail": "Section IV.G notes that the 1st and 2nd place teams (NoMoreActimel and SpareCodeComplete) used all available repository data including future revisions, which 'may lead to a data leakage.' These are the prize-winning solutions, yet this validity threat is noted but not investigated." 342 } 343 ], 344 "cited_papers": [ 345 { 346 "title": "CodeFill: Multi-Token Code Completion by Jointly Learning from Structure and Naming Sequences", 347 "authors": ["M. Izadi", "R. Gismondi", "G. 
Gousios"], 348 "year": 2022, 349 "relevance": "Code completion technique using neural networks with structural information, relevant to AI-assisted programming." 350 }, 351 { 352 "title": "Efficient Training of Language Models to Fill in the Middle", 353 "authors": ["M. Bavarian", "H. Jun", "N. Tezak", "J. Schulman", "C. McLeavey", "J. Tworek", "M. Chen"], 354 "year": 2022, 355 "arxiv_id": "2207.14255", 356 "relevance": "Foundational work on fill-in-the-middle training for LLMs, the core paradigm evaluated in this competition." 357 }, 358 { 359 "title": "RepoCoder: Repository-Level Code Completion Through Iterative Retrieval and Generation", 360 "authors": ["F. Zhang", "B. Chen", "Y. Zhang", "J. Keung", "J. Liu", "D. Zan", "Y. Mao", "J.-G. Lou", "W. Chen"], 361 "year": 2023, 362 "relevance": "Repository-level code completion using retrieval, directly relevant to the context collection strategies evaluated in this competition." 363 }, 364 { 365 "title": "Code Llama: Open Foundation Models for Code", 366 "authors": ["B. Roziere", "J. Gehring", "F. Gloeckle"], 367 "year": 2023, 368 "arxiv_id": "2308.12950", 369 "relevance": "Major open-source code LLM family relevant to understanding model capabilities for code completion." 370 }, 371 { 372 "title": "Repoformer: Selective Retrieval for Repository-Level Code Completion", 373 "authors": ["D. Wu", "W. U. Ahmad", "D. Zhang", "M. K. Ramanathan", "X. Ma"], 374 "year": 2024, 375 "relevance": "Selective retrieval approach for repository-level completion, closely related to context optimization strategies." 376 }, 377 { 378 "title": "Long Code Arena: A Set of Benchmarks for Long-Context Code Models", 379 "authors": ["E. Bogomolov", "A. Eliseeva", "T. Galimzyanov", "E. Glukhov"], 380 "year": 2024, 381 "arxiv_id": "2406.11612", 382 "relevance": "The benchmark this competition dataset builds upon, evaluating long-context code models." 
383 }, 384 { 385 "title": "RLCoder: Reinforcement Learning for Repository-Level Code Completion", 386 "authors": ["Y. Wang", "Y. Wang", "D. Guo", "J. Chen", "R. Zhang", "Y. Ma", "Z. Zheng"], 387 "year": 2025, 388 "relevance": "RL-based approach to context selection for repository-level code completion, demonstrating that context quality can outweigh model size." 389 }, 390 { 391 "title": "RepoHyper: Search-Expand-Refine on Semantic Graphs for Repository-Level Code Completion", 392 "authors": ["H. N. Phan", "H. N. Phan", "T. N. Nguyen", "N. D. Bui"], 393 "year": 2025, 394 "relevance": "Semantic graph-based approach to repository-level code completion, relevant to context collection optimization." 395 }, 396 { 397 "title": "Qwen2.5-Coder Technical Report", 398 "authors": ["B. Hui", "J. Yang", "Z. Cui"], 399 "year": 2024, 400 "arxiv_id": "2409.12186", 401 "relevance": "Technical report for one of the three evaluation models used in the competition." 402 }, 403 { 404 "title": "Out of the BLEU: How Should We Assess Quality of the Code Generation Models?", 405 "authors": ["M. Evtikhiev", "E. Bogomolov", "Y. Sokolov", "T. Bryksin"], 406 "year": 2023, 407 "relevance": "Study on code generation evaluation metrics that motivated the choice of chrF in this competition." 408 }, 409 { 410 "title": "On the Importance of Context Filtering in Retrieval-Augmented Code Completion", 411 "authors": ["S. Sedov", "V. Savinskiy", "A. Arzhantsev"], 412 "year": 2025, 413 "relevance": "Paper by the winning Python team (NoMoreActimel) describing their retrieval-augmented context collection strategy." 414 }, 415 { 416 "title": "SpareCodeSearch: Searching for Code Context When You Have No Spare GPU", 417 "authors": ["M. Nguyen"], 418 "year": 2025, 419 "relevance": "Paper by the winning Kotlin team (SpareCodeComplete) describing their trigram-index-based context search approach." 420 } 421 ] 422 }