scan.json (16948B)
1 { 2 "paper": { 3 "title": "Context Composing for Full Line Code Completion", 4 "authors": ["Anton Semenkin", "Yaroslav Sokolov", "Evgeniia Vu"], 5 "year": 2024, 6 "venue": "IDE '24 (First IDE Workshop)", 7 "arxiv_id": "2402.09230", 8 "doi": "10.1145/3643796.3648446" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": false, 15 "justification": "No source code or repository link is provided. The system is a proprietary JetBrains product." 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": false, 20 "justification": "No dataset is released. The evaluation data (A/B test results, offline evaluation dataset) is proprietary." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": false, 25 "justification": "No environment specifications are provided. The paper mentions running on end-user devices but gives no library versions, dependencies, or setup details." 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": false, 30 "justification": "No reproduction instructions are provided. The approach is described at a high level but not in sufficient detail to replicate." 31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": true, 36 "answer": false, 37 "justification": "The A/B test results report a 1.5x improvement but no confidence intervals or error bars are provided." 38 }, 39 "significance_tests": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper mentions 'taking into an account statistical significance of the observed results' in Section 2.2 but does not report any specific test, p-value, or methodology." 43 }, 44 "effect_sizes_reported": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper reports '1.5 times' increase in ratio of code completed for FLCC users vs. non-FLCC users, and '40%' quality increase from larger contexts. These provide magnitude context." 
48 }, 49 "sample_size_justified": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper says 'hundreds of real Python users' but provides no justification for this sample size and no power analysis." 53 }, 54 "variance_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "No variance, standard deviation, or spread measures are reported for any results." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": true, 64 "justification": "The A/B test compares FLCC-enabled users against users with standard code completion only (Section 2.2)." 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": false, 69 "justification": "The baseline is the IDE's own standard code completion. No comparison against other neural code completion tools (e.g., Copilot, CodeWhisperer) is provided." 70 }, 71 "ablation_study": { 72 "applies": true, 73 "answer": false, 74 "justification": "No ablation study is presented. The paper describes multiple context composing components (whitespace trimming, long tokens, scope tokens) but does not isolate their individual contributions." 75 }, 76 "multiple_metrics": { 77 "applies": true, 78 "answer": false, 79 "justification": "The paper primarily reports one metric: 'ratio of code completed.' An edit rate observation is mentioned but not quantified." 80 }, 81 "human_evaluation": { 82 "applies": true, 83 "answer": true, 84 "justification": "The A/B test on hundreds of real users constitutes human evaluation of the system's outputs. Explicit feedback from 'tens of users' is also mentioned (Section 2.1)." 85 }, 86 "held_out_test_set": { 87 "applies": true, 88 "answer": false, 89 "justification": "No mention of held-out test sets. The offline evaluation dataset is mentioned but not described in detail." 90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": false, 94 "justification": "No per-category or per-task breakdowns are provided. 
Results are reported as single aggregate numbers." 95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": true, 99 "justification": "Section 3.1 describes an experiment (method rearrangement approach) that 'did not show any positive results' and was abandoned. Section 2.1 mentions user feedback describing 'potential growth points.'" 100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": true, 104 "justification": "Section 3.1 reports that the class method rearrangement approach 'did not show any positive results while experimenting. So, we abandoned this research direction.'" 105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The abstract claims the feature 'proved its usefulness in A/B testing on hundreds of real Python users,' which is supported by the 1.5x metric increase reported in Section 2.2." 112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": true, 116 "justification": "The causal claim that FLCC improves coding workflow is supported by an A/B test (users split into treatment/control groups), which is an appropriate design for causal inference, although the paper does not describe the randomization procedure." 117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": false, 121 "justification": "The title says 'Full Line Code Completion' generally, but results are only from PyCharm Pro Python users. The paper does not bound claims to Python or PyCharm explicitly." 122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": false, 126 "justification": "No alternative explanations for the A/B test results are discussed. Potential confounds (e.g., novelty effect, user self-selection in EAP) are not addressed."
127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper says 'GPT-like and LLaMA-like autoregressive language models' without specifying exact model names, versions, or sizes beyond 'under 1B parameters.'" 134 }, 135 "prompts_provided": { 136 "applies": true, 137 "answer": false, 138 "justification": "The context composition structure is described (file extension + separator + file path + separator + code above caret) but exact prompt formats and special token values are not fully provided." 139 }, 140 "hyperparameters_reported": { 141 "applies": true, 142 "answer": false, 143 "justification": "No hyperparameters (temperature, top-p, learning rate, etc.) are reported. Only context sizes (384 and 1536 tokens) are mentioned." 144 }, 145 "scaffolding_described": { 146 "applies": false, 147 "answer": false, 148 "justification": "No agentic scaffolding is used. This is a direct code completion model, not an agent." 149 }, 150 "data_preprocessing_documented": { 151 "applies": true, 152 "answer": true, 153 "justification": "Section 2.1 describes the preprocessing pipeline: whitespace trimming, scope token replacement, comment removal, BPE tokenization with 'long tokens' modification, and context construction steps." 154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": false, 160 "justification": "There is no dedicated limitations or threats-to-validity section." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": false, 165 "justification": "No specific threats to validity are discussed anywhere in the paper." 166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": false, 170 "justification": "No explicit scope boundaries are stated. The paper does not specify what the results do NOT show." 
171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": true, 176 "answer": false, 177 "justification": "No raw data from the A/B test or offline evaluation is available." 178 }, 179 "data_collection_described": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 2.2 describes the A/B testing procedure: EAP program, user splitting into groups, shipping different feature versions, and tracking metrics during fall 2023." 183 }, 184 "recruitment_methods_described": { 185 "applies": true, 186 "answer": false, 187 "justification": "Users are described as EAP participants who 'download experimental version of IDEs for free', but the paper gives no details on how they were recruited or whether this introduces selection bias." 188 }, 189 "data_pipeline_documented": { 190 "applies": true, 191 "answer": false, 192 "justification": "The data pipeline from raw user telemetry to the reported 1.5x metric is not documented. No filtering criteria or intermediate processing steps are described." 193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": false, 199 "justification": "No funding disclosure. All authors are JetBrains employees but no funding section or acknowledgments section is present." 200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "All three authors clearly list JetBrains as their affiliation." 205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": false, 209 "justification": "The work is conducted by JetBrains employees evaluating a JetBrains product. JetBrains has a direct financial interest in showing FLCC is useful." 210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": false, 214 "justification": "No competing interests or financial interests statement is present. The authors are employees of the company whose product is evaluated."
215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": false, 220 "answer": false, 221 "justification": "The paper evaluates the system via A/B testing with real users, not by testing model knowledge on a benchmark. Contamination is not relevant here." 222 }, 223 "train_test_overlap_discussed": { 224 "applies": false, 225 "answer": false, 226 "justification": "No benchmark evaluation of model knowledge is performed. The evaluation is user-facing A/B testing." 227 }, 228 "benchmark_contamination_addressed": { 229 "applies": false, 230 "answer": false, 231 "justification": "No benchmark evaluation is performed; evaluation is via online A/B testing with real users." 232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": true, 237 "answer": false, 238 "justification": "No pre-registration is mentioned for the A/B study." 239 }, 240 "irb_or_ethics_approval": { 241 "applies": true, 242 "answer": false, 243 "justification": "No IRB or ethics board approval is mentioned for the user study." 244 }, 245 "demographics_reported": { 246 "applies": true, 247 "answer": false, 248 "justification": "Participants are described only as 'hundreds of real Python users' with no demographics (experience level, geography, etc.)." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": true, 252 "answer": false, 253 "justification": "No inclusion/exclusion criteria are stated for participants beyond being PyCharm Pro EAP users." 254 }, 255 "randomization_described": { 256 "applies": true, 257 "answer": false, 258 "justification": "The paper says 'we split users to several groups' but does not describe the randomization procedure." 259 }, 260 "blinding_described": { 261 "applies": true, 262 "answer": false, 263 "justification": "No blinding description. It is unclear whether users knew which version of FLCC they received." 
264 }, 265 "attrition_reported": { 266 "applies": true, 267 "answer": false, 268 "justification": "No attrition or dropout information is reported." 269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": true, 274 "answer": false, 275 "justification": "No inference cost or latency numbers are reported, despite the paper emphasizing latency constraints. Only qualitative claims ('latency almost unchanged') are made." 276 }, 277 "compute_budget_stated": { 278 "applies": true, 279 "answer": false, 280 "justification": "No computational budget for training or inference is stated." 281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "FLCC increased the ratio of code completed by 1.5 times compared to users without the feature.", 287 "evidence": "Section 2.1 and 2.2 report this metric from A/B testing on 'hundreds of real Python users' in PyCharm Pro during fall 2023.", 288 "supported": "moderate" 289 }, 290 { 291 "claim": "Users do not edit the selected code fragment immediately after inserting it.", 292 "evidence": "Stated in Section 2.2 as an observation from the A/B test, but no quantitative data is provided.", 293 "supported": "weak" 294 }, 295 { 296 "claim": "Increasing context size from 384 to 1536 tokens improved code completion quality by 40% while keeping latency almost unchanged.", 297 "evidence": "Section 2.3 reports this from offline evaluation but gives no details on the metric or evaluation setup.", 298 "supported": "weak" 299 }, 300 { 301 "claim": "Fill-in-the-middle and RAG methods showed about 10% target metric increase in offline experiments.", 302 "evidence": "Section 2.3 mentions 'promising results (about 10% target metric increase)' from offline experiments with no further detail.", 303 "supported": "weak" 304 } 305 ], 306 "methodology_tags": ["case-study"], 307 "key_findings": "JetBrains describes their Full Line Code Completion context composing approach for sub-1B parameter models running locally in IDEs.
A/B testing on hundreds of PyCharm Pro Python users showed a 1.5x increase in code completion ratio. Increasing context size from 384 to 1536 tokens via LLaMA-like models improved offline quality by 40%. The paper also reports a failed experiment with method rearrangement that showed no improvement.", 308 "red_flags": [ 309 { 310 "flag": "Company evaluating own product", 311 "detail": "All authors are JetBrains employees evaluating JetBrains' Full Line Code Completion feature. No independent evaluation or conflict of interest disclosure." 312 }, 313 { 314 "flag": "Vague quantitative claims", 315 "detail": "Key results (1.5x improvement, 40% quality increase, 10% metric increase) are stated without confidence intervals, statistical test details, sample sizes, or metric definitions." 316 }, 317 { 318 "flag": "No uncertainty quantification", 319 "detail": "Despite running A/B tests, no error bars, p-values, or confidence intervals are reported for any result." 320 }, 321 { 322 "flag": "Selection bias in user sample", 323 "detail": "EAP users who voluntarily download experimental IDEs are likely not representative of general developer population, but this is not discussed." 324 } 325 ], 326 "cited_papers": [ 327 { 328 "title": "CodeCompose: A Large-Scale Industrial Deployment of AI-assisted Code Authoring", 329 "authors": ["Vijayaraghavan Murali", "Chandra Maddila", "Imad Ahmad", "Michael Bolin", "Daniel Cheng", "Negar Ghorbani", "Renuka Fernandez", "Nachiappan Nagappan"], 330 "year": 2023, 331 "arxiv_id": "2305.12050", 332 "relevance": "Large-scale industrial deployment of AI code completion at Meta, directly comparable industrial experience paper." 333 }, 334 { 335 "title": "Improving language models by retrieving from trillions of tokens", 336 "authors": ["Sebastian Borgeaud"], 337 "year": 2022, 338 "relevance": "RETRO method for retrieval-augmented generation, relevant to RAG approaches for code completion." 
339 }, 340 { 341 "title": "InCoder: A generative model for code infilling and synthesis", 342 "authors": ["Daniel Fried"], 343 "year": 2022, 344 "arxiv_id": "2204.05999", 345 "relevance": "Fill-in-the-middle code generation model, directly relevant to code completion methodology." 346 }, 347 { 348 "title": "ML-enhanced code completion improves developer productivity", 349 "authors": ["Maxim Tabachnyk", "Stoyan Nikolov"], 350 "year": 2022, 351 "relevance": "Google's ML-enhanced code completion productivity study, directly comparable evidence on code completion usefulness." 352 } 353 ] 354 }