scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22644B)
      1 {
      2   "paper": {
      3     "title": "Control Models for In-IDE Code Completion: Saving Inference Costs While Improving Completion Quality Metrics",
      4     "authors": ["Aral de Moor", "Yana Hrynevich", "Hleb Badzeika", "Vladyslav Furda", "Marko Kojic", "Artem Savelev", "Kostadin Cvejoski", "Darya Rovdo", "Ekaterina Garanina"],
      5     "year": 2026,
      6     "venue": "IDE '26 (3rd International Workshop on Integrated Development Environments)",
      7     "arxiv_id": "2601.20223",
      8     "doi": "10.1145/3786151.3788608"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": false,
     15         "justification": "No repository URL or code archive is provided in the paper. The paper mentions an 'open-source development of our ML API' in acknowledgments but provides no link to code for the control models themselves."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "The datasets consist of anonymised telemetry logs from JetBrains IDEs. No dataset is released or linked. Section 3.2 describes the data but provides no download link."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No environment specifications, dependency lists, or library versions are provided. CatBoost is mentioned as the boosting framework but no version is given."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No reproduction instructions are provided. The paper describes an approach using proprietary telemetry data with no guidance on how to replicate the experiments."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Table 2 reports relative change percentages with statistical significance markers but no confidence intervals or error bars. Figures show curves without uncertainty bands."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Section 4.3 states 'significance is computed using bootstrap resampling with individual users as the sampling unit' at p≤0.05. Non-significant results are marked with asterisks in Table 2."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Table 2 reports relative changes with baselines implicit (e.g., Filter model: +46.5% AR for Java, -9.6% RoCC). The percentage deltas with context of what is being measured provide effect magnitude."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "Sample sizes are reported (e.g., 151/127 users for Java filter A/B) but no power analysis or justification for these sizes is given. Some A/B groups are quite small (95/62 for Kotlin filter)."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No standard deviations, variance, or spread measures are reported for any results. Table 2 shows only point estimates of relative changes."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The A/B study compares with and without control models (Table 2). Offline evaluation compares boosting vs. transformer architectures (Section 4.1, Figure 2). The no-model condition serves as baseline."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Comparisons are against their own production system and against the transformer approach from de Moor et al. 2024 and Sun et al. 2025, which are recent related works."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Section 4.3 ablates trigger and filter models separately in A/B tests. Section 4.1 investigates the interplay between trigger and filter at different filter rates and FNR thresholds (Figures 2-3)."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Three metrics are used: Ratio of Completed Code (RoCC), Accept Rate (AR), and Cancel Rate (CR), defined in Section 3.3. Generations count is also reported for trigger models."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "The paper evaluates via automated metrics (accept/cancel rates). Neither a human evaluation of output quality nor a user satisfaction survey is included, despite claims about user perception and productivity."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Section 3.2.2 describes disjoint user splits: '126 users in the train split... and 98 users in the test split' with guaranteed no user overlap. The A/B study is a separate online evaluation."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Results are broken down by language (Kotlin, Python, PHP, C# in Figures 3a-3c) and by model type (trigger vs. filter in Table 2). Per-language A/B results are shown for Java, Python, Kotlin."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Section 4.2 discusses the unexpected PHP result where trigger outperforms filter. Section 4.2 notes the Kotlin Cancel Rate abnormality. Section 5.1 discusses the gap between offline and online metrics."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper reports that filter models reduce RoCC (Table 2: -9.6% to -14.0%), that transformer models don't improve accept/cancel rates as well as boosting (Section 4.1), and that the trigger model didn't significantly reduce generations as expected (Section 4.3)."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims control models 'save around 20% of inference requests while improving completion quality metrics.' Table 2 shows -13.8% generations for trigger, and AR/CR improvements. The 20% is discussed in Section 4.3 as the offline threshold, with 13.8% realized online."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Causal claims about control models improving metrics are supported by A/B testing (Section 4.3), which is an appropriate experimental design for causal inference. Users are the randomization unit."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The paper explicitly limits scope to 'JetBrains IDEs' (Section 2.2), notes 'While this study is likely relevant to any similar tool, our focus is on the cloud-based code completion provided by JetBrains.' The title specifies 'In-IDE Code Completion.'"
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Section 5.1 discusses how completion event dependencies explain the gap between offline and online metrics. Section 4.2 hypothesizes about the Kotlin Cancel Rate abnormality. Section 4.3 discusses how hard rules may inflate filter model metrics."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper mentions '100M-parameter code generation model' as the transformer backbone and CatBoost for boosting, but provides no specific version numbers for any model or library used."
    134       },
    135       "prompts_provided": {
    136         "applies": false,
    137         "answer": false,
    138         "justification": "The paper does not use prompting. The control models are ML classifiers (CatBoost boosting and transformer classifiers), not prompt-based systems."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "No hyperparameters are reported for either the CatBoost models or the transformer models. No learning rates, tree depths, number of estimators, or training configurations are provided."
    144       },
    145       "scaffolding_described": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No agentic scaffolding is used. The control models are classifiers integrated into the IDE completion pipeline, not agentic systems."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "Section 3.2 describes data sources at a high level (telemetry logs, 'several hundred features') but does not document preprocessing, filtering, or transformation steps. Feature engineering details are absent."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 5 ('Discussion on Metrics') and Section 6 ('Future Work') discuss limitations including offline-to-online metric gaps (5.1), long-term productivity measurement challenges (5.2), and transformer deployment obstacles (6.1)."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section 5.1 discusses the specific threat of completion event dependence affecting metric validity. Section 3.2.1 notes that training data requires disabling models, creating data collection constraints. Section 4.3 notes hard rules inflating filter metrics."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 2.2 explicitly states 'We limit the scope of this paper to LLM-driven code completion in JetBrains IDEs.' Section 5.2 acknowledges that current metrics measure perceived rather than actual productivity, citing contrary evidence."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No raw data is released. The data consists of proprietary anonymised telemetry from JetBrains IDE users."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 3.2 describes data as 'anonymised feature usage logs from Mellum cloud-inferenced completion,' with details on feature types (Section 3.2.1), code context collection (Section 3.2.2), and dataset sizes (Table 1)."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "The paper does not describe how users were selected for the telemetry dataset or the A/B study. Section 3.2.2 mentions 'internal users' for code context but does not describe recruitment or selection criteria."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The pipeline from raw telemetry to training data is not documented. No filtering criteria, data cleaning steps, or counts of removed samples are provided."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No funding statement is present. All authors are affiliated with JetBrains but no explicit funding disclosure is made."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "All nine authors list JetBrains or JetBrains Research as their affiliation, clearly visible in the author list."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "All authors work at JetBrains, which directly benefits from demonstrating that control models improve its IDE products. The funder (JetBrains as employer) has a financial interest in positive results."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests or financial interests statement is present. The authors are employed by the company whose product they evaluate."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": false,
    220         "answer": false,
    221         "justification": "The paper evaluates ML classifiers (CatBoost, transformer) on telemetry data, not a pre-trained LLM's capability on a benchmark. Contamination in the benchmark sense does not apply."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "Not a benchmark evaluation of a pre-trained model. However, the paper does address train/test user separation (Section 3.2.2), which is good practice for their setting."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "Not applicable — the paper does not evaluate a pre-trained model on a public benchmark."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "The A/B study involves human users. No pre-registration is mentioned."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "The A/B study deploys experimental conditions to production users. No IRB or ethics approval is mentioned."
    244       },
    245       "demographics_reported": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "No demographics of the A/B study participants or telemetry dataset users are reported beyond counts (e.g., 151/127 users). Experience level, geographic distribution, etc. are absent."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "No inclusion/exclusion criteria for A/B study participants are stated. It is unclear how users were selected for the experiment."
    254       },
    255       "randomization_described": {
    256         "applies": true,
    257         "answer": false,
    258         "justification": "The paper mentions 'A/B study' and 'A/B groups' but does not describe the randomization procedure for assigning users to conditions."
    259       },
    260       "blinding_described": {
    261         "applies": true,
    262         "answer": false,
    263         "justification": "No mention of whether users knew they were in an experiment or which condition they were assigned to."
    264       },
    265       "attrition_reported": {
    266         "applies": true,
    267         "answer": false,
    268         "justification": "No information on participant attrition or dropout during the A/B study is provided."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "The paper's goal is to reduce inference costs but does not report actual cost figures (dollars, latency numbers) for the control models or the LLM completions they gate."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No compute budget, training time, or hardware specifications are provided for training the control models."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "Boosting-based control models can save around 20% of inference requests while improving completion quality metrics.",
    287       "evidence": "Table 2 shows the trigger model reduces generations by 13.8% online (20% offline threshold). Filter models improve AR by 32.9% to 47.6% and reduce CR by 15.5% to 36.7%, but decrease RoCC by 9.6% to 14.0%.",
    288       "supported": "moderate"
    289     },
    290     {
    291       "claim": "Transformer-based models can filter out more generations at lower false-negative cost than boosting models.",
    292       "evidence": "Figure 2a shows transformer models achieve higher filter rates for the same Symbols Completed hit at certain operating points (Section 4.1).",
    293       "supported": "moderate"
    294     },
    295     {
    296       "claim": "Transformer-based models do not improve Accept and Cancel Rates as well as boosting models despite filtering more.",
    297       "evidence": "Section 4.1 and Figures 2b-2c show transformers affect rate-based metrics less favorably because 'the denominator of these rate-based metrics (shown completions) is relatively less affected.'",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "The trigger model did not reduce online generations as much as the offline threshold suggested (13.8% vs 20%).",
    302       "evidence": "Section 4.3 and 5.1 explain this through completion event dependencies: 'by affecting completion behaviour, users change their interactions with code completions, resulting in more overall opportunities for completions.'",
    303       "supported": "strong"
    304     }
    305   ],
    306   "methodology_tags": ["benchmark-eval", "observational"],
    307   "key_findings": "The paper introduces 'control models' — lightweight ML classifiers (CatBoost boosting and transformers) that gate when LLM code completions are triggered and shown in JetBrains IDEs. Offline evaluation on 98 users shows transformer models can filter more completions with lower false-negative rates, but boosting models better improve accept/cancel rate metrics. An online A/B study demonstrates that boosting-based filter models improve accept rates by 33-48% and reduce cancel rates by 16-37%, at the cost of a 10-14% reduction in the ratio of completed code. A key finding is that offline metrics do not directly translate to online results due to dependencies between completion events.",
    308   "red_flags": [
    309     {
    310       "flag": "Company evaluating own product",
    311       "detail": "All nine authors are JetBrains employees evaluating JetBrains IDE code completion. No external validation or independent evaluation is included."
    312     },
    313     {
    314       "flag": "Small A/B sample sizes without justification",
    315       "detail": "Filter model A/B groups range from 62 to 151 users per condition. Several key metrics (RoCC for all filter languages, CR for Python and Kotlin) are not statistically significant, suggesting the studies may be underpowered."
    316     },
    317     {
    318       "flag": "No reproducibility artifacts",
    319       "detail": "No code, data, environment specifications, or reproduction instructions are provided. The proprietary telemetry data makes independent replication impossible."
    320     },
    321     {
    322       "flag": "Missing hyperparameters and training details",
    323       "detail": "No hyperparameters, training configurations, or model details beyond architecture type are provided, making it impossible to assess whether the models were properly tuned."
    324     }
    325   ],
    326   "cited_papers": [
    327     {
    328       "title": "Grounded Copilot: How Programmers Interact with Code-Generating Models",
    329       "authors": ["Shraddha Barke", "Michael James", "Nadia Polikarpova"],
    330       "year": 2022,
    331       "doi": "10.1145/3586030",
    332       "relevance": "Foundational study on developer interaction modes (accelerative vs. explorative) with AI code completion."
    333     },
    334     {
    335       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    336       "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"],
    337       "year": 2025,
    338       "doi": "10.48550/arXiv.2507.09089",
    339       "relevance": "RCT finding misalignment between perceived (+20%) and actual (-20%) productivity with AI coding tools."
    340     },
    341     {
    342       "title": "When to Show a Suggestion? Integrating Human Feedback in AI-Assisted Programming",
    343       "authors": ["Hussein Mozannar", "Gagan Bansal", "Adam Fourney", "Eric Horvitz"],
    344       "year": 2024,
    345       "doi": "10.1609/aaai.v38i9.28878",
    346       "relevance": "GitHub-affiliated work on trigger and filter models for code completion, reporting 13% trigger savings and 25% filter rate."
    347     },
    348     {
    349       "title": "Don't Complete It! Preventing Unhelpful Code Completion for Productive and Sustainable Neural Code Completion Systems",
    350       "authors": ["Zhensu Sun", "Xiaoning Du", "Fu Song"],
    351       "year": 2025,
    352       "doi": "10.1145/3688831",
    353       "relevance": "Transformer-based trigger model rejecting 20% of completion requests with 97.4% precision on unhelpful completions."
    354     },
    355     {
    356       "title": "A Transformer-Based Approach for Smart Invocation of Automatic Code Completion",
    357       "authors": ["Aral de Moor", "Arie van Deursen", "Maliheh Izadi"],
    358       "year": 2024,
    359       "doi": "10.1145/3664646.3664760",
    360       "relevance": "Prior work by first author on hybrid transformer control models combining telemetry with code context."
    361     },
    362     {
    363       "title": "Mellum: Production-Grade in-IDE Contextual Code Completion with Multi-File Project Understanding",
    364       "authors": ["Nikita Pavlichenko", "Iurii Nazarov", "Ivan Dolgov"],
    365       "year": 2025,
    366       "arxiv_id": "2510.05788",
    367       "relevance": "JetBrains' production code completion system that the control models gate."
    368     },
    369     {
    370       "title": "Productivity assessment of neural code completion",
    371       "authors": ["Albert Ziegler", "Eirini Kalliamvakou"],
    372       "year": 2022,
    373       "doi": "10.1145/3520312.3534864",
    374       "relevance": "GitHub Copilot productivity assessment showing acceptance rate correlates with perceived productivity."
    375     },
    376     {
    377       "title": "Full Line Code Completion: Bringing AI to Desktop",
    378       "authors": ["Anton Semenkin", "Vitaliy Bibaev"],
    379       "year": 2024,
    380       "doi": "10.1109/ICSE-SEIP66354.2025.00055",
    381       "relevance": "JetBrains local code completion system describing the RoCC metric and hard filtering rules."
    382     },
    383     {
    384       "title": "Reading Between the Lines: Modeling User Behavior and Costs in AI-Assisted Programming",
    385       "authors": ["Hussein Mozannar", "Gagan Bansal", "Adam Fourney", "Eric Horvitz"],
    386       "year": 2024,
    387       "doi": "10.1145/3613904.3641936",
    388       "relevance": "Models user behavior and costs during AI-assisted programming interactions."
    389     },
    390     {
    391       "title": "\"It's Weird That it Knows What I Want\": Usability and Interactions with Copilot for Novice Programmers",
    392       "authors": ["James Prather", "Brent N. Reeves", "Paul Denny"],
    393       "year": 2023,
    394       "doi": "10.1145/3617367",
    395       "relevance": "Study of novice interactions with code completion, identifying shepherding and drifting modes."
    396     }
    397   ]
    398 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs