scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (16948B)
      1 {
      2   "paper": {
      3     "title": "Context Composing for Full Line Code Completion",
      4     "authors": ["Anton Semenkin", "Yaroslav Sokolov", "Evgeniia Vu"],
      5     "year": 2024,
      6     "venue": "IDE '24 (First IDE Workshop)",
      7     "arxiv_id": "2402.09230",
      8     "doi": "10.1145/3643796.3648446"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": false,
     15         "justification": "No source code or repository link is provided. The system is a proprietary JetBrains product."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "No dataset is released. The evaluation data (A/B test results, offline evaluation dataset) is proprietary."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No environment specifications are provided. The paper mentions running on end-user devices but gives no library versions, dependencies, or setup details."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No reproduction instructions are provided. The approach is described at a high level but not in sufficient detail to replicate."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The A/B test results report a 1.5x improvement but no confidence intervals or error bars are provided."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper mentions 'taking into an account statistical significance of the observed results' in Section 2.2 but does not report any specific test, p-value, or methodology."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper reports '1.5 times' increase in ratio of code completed for FLCC users vs. non-FLCC users, and '40%' quality increase from larger contexts. These provide magnitude context."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper says 'hundreds of real Python users' but provides no justification for this sample size and no power analysis."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No variance, standard deviation, or spread measures are reported for any results."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The A/B test compares FLCC-enabled users against users with standard code completion only (Section 2.2)."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The baseline is the IDE's own standard code completion. No comparison against other neural code completion tools (e.g., Copilot, CodeWhisperer) is provided."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No ablation study is presented. The paper describes multiple context composing components (whitespace trimming, long tokens, scope tokens) but does not isolate their individual contributions."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "The paper primarily reports one metric: 'ratio of code completed.' An edit rate observation is mentioned but not quantified."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The A/B test on hundreds of real users constitutes human evaluation of the system's outputs. Explicit feedback from 'tens of users' is also mentioned (Section 2.1)."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No mention of held-out test sets. The offline evaluation dataset is mentioned but not described in detail."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No per-category or per-task breakdowns are provided. Results are reported as single aggregate numbers."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Section 3.1 describes an experiment (method rearrangement approach) that 'did not show any positive results' and was abandoned. Section 2.1 mentions user feedback describing 'potential growth points.'"
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Section 3.1 reports that the class method rearrangement approach 'did not show any positive results while experimenting. So, we abandoned this research direction.'"
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims the feature 'proved its usefulness in A/B testing on hundreds of real Python users,' which is supported by the 1.5x metric increase reported in Section 2.2."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The causal claim that FLCC improves coding workflow is supported by an A/B test (users split into treatment/control groups), a design that is appropriate for causal inference, although the paper does not describe how users were assigned to groups."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The title says 'Full Line Code Completion' generally, but results are only from PyCharm Pro Python users. The paper does not bound claims to Python or PyCharm explicitly."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "No alternative explanations for the A/B test results are discussed. Potential confounds (e.g., novelty effect, user self-selection in EAP) are not addressed."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper says 'GPT-like and LLaMA-like autoregressive language models' without specifying exact model names, versions, or sizes beyond 'under 1B parameters.'"
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The context composition structure is described (file extension + separator + file path + separator + code above caret) but exact prompt formats and special token values are not fully provided."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "No hyperparameters (temperature, top-p, learning rate, etc.) are reported. Only context sizes (384 and 1536 tokens) are mentioned."
    144       },
    145       "scaffolding_described": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No agentic scaffolding is used. This is a direct code completion model, not an agent."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 2.1 describes the preprocessing pipeline: whitespace trimming, scope token replacement, comment removal, BPE tokenization with 'long tokens' modification, and context construction steps."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "There is no dedicated limitations or threats-to-validity section."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "No specific threats to validity are discussed anywhere in the paper."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "No explicit scope boundaries are stated. The paper does not specify what the results do NOT show."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No raw data from the A/B test or offline evaluation is available."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 2.2 describes the A/B testing procedure: EAP program, user splitting into groups, shipping different feature versions, and tracking metrics during fall 2023."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "Users are described as EAP participants who 'download experimental version of IDEs for free' but no details are given on how they were recruited or whether this introduces selection bias."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The data pipeline from raw user telemetry to the reported 1.5x metric is not documented. No filtering criteria or intermediate processing steps are described."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No funding disclosure. All authors are JetBrains employees but no funding section or acknowledgments section is present."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "All three authors clearly list JetBrains as their affiliation."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "The work is conducted by JetBrains employees evaluating a JetBrains product. JetBrains has a direct financial interest in showing FLCC is useful."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests or financial interests statement is present. The authors are employees of the company whose product is evaluated."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": false,
    220         "answer": false,
    221         "justification": "The paper evaluates the system via A/B testing with real users, not by testing model knowledge on a benchmark. Contamination is not relevant here."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "No benchmark evaluation of model knowledge is performed. The evaluation is user-facing A/B testing."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "No benchmark evaluation is performed; evaluation is via online A/B testing with real users."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No pre-registration is mentioned for the A/B study."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No IRB or ethics board approval is mentioned for the user study."
    244       },
    245       "demographics_reported": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "Participants are described only as 'hundreds of real Python users' with no demographics (experience level, geography, etc.)."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "No inclusion/exclusion criteria are stated for participants beyond being PyCharm Pro EAP users."
    254       },
    255       "randomization_described": {
    256         "applies": true,
    257         "answer": false,
    258         "justification": "The paper says 'we split users to several groups' but does not describe the randomization procedure."
    259       },
    260       "blinding_described": {
    261         "applies": true,
    262         "answer": false,
    263         "justification": "No blinding description. It is unclear whether users knew which version of FLCC they received."
    264       },
    265       "attrition_reported": {
    266         "applies": true,
    267         "answer": false,
    268         "justification": "No attrition or dropout information is reported."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "No inference cost or latency numbers are reported, despite the paper emphasizing latency constraints. Only qualitative claims ('latency almost unchanged') are made."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No computational budget for training or inference is stated."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "FLCC increased the ratio of code completed by 1.5 times compared to users without the feature.",
    287       "evidence": "Sections 2.1 and 2.2 report this metric from A/B testing on 'hundreds of real Python users' in PyCharm Pro during fall 2023.",
    288       "supported": "moderate"
    289     },
    290     {
    291       "claim": "Users do not edit the selected code fragment immediately after inserting it.",
    292       "evidence": "Stated in Section 2.2 as an observation from the A/B test, but no quantitative data is provided.",
    293       "supported": "weak"
    294     },
    295     {
    296       "claim": "Increasing context size from 384 to 1536 tokens improved code completion quality by 40% while keeping latency almost unchanged.",
    297       "evidence": "Section 2.3 reports this from offline evaluation, but gives no details on the metric or evaluation setup.",
    298       "supported": "weak"
    299     },
    300     {
    301       "claim": "Fill-in-the-middle and RAG methods showed about 10% target metric increase in offline experiments.",
    302       "evidence": "Section 2.3 mentions 'promising results (about 10% target metric increase)' from offline experiments with no further detail.",
    303       "supported": "weak"
    304     }
    305   ],
    306   "methodology_tags": ["case-study"],
    307   "key_findings": "JetBrains describes their Full Line Code Completion context composing approach for sub-1B parameter models running locally in IDEs. A/B testing on hundreds of PyCharm Pro Python users showed a 1.5x increase in code completion ratio. Increasing context size from 384 to 1536 tokens via LLaMA-like models improved offline quality by 40%. The paper also reports a failed experiment with method rearrangement that showed no improvement.",
    308   "red_flags": [
    309     {
    310       "flag": "Company evaluating own product",
    311       "detail": "All authors are JetBrains employees evaluating JetBrains' Full Line Code Completion feature. No independent evaluation or conflict of interest disclosure."
    312     },
    313     {
    314       "flag": "Vague quantitative claims",
    315       "detail": "Key results (1.5x improvement, 40% quality increase, 10% metric increase) are stated without confidence intervals, statistical test details, sample sizes, or metric definitions."
    316     },
    317     {
    318       "flag": "No uncertainty quantification",
    319       "detail": "Despite running A/B tests, no error bars, p-values, or confidence intervals are reported for any result."
    320     },
    321     {
    322       "flag": "Selection bias in user sample",
    323       "detail": "EAP users who voluntarily download experimental IDEs are likely not representative of the general developer population, but this is not discussed."
    324     }
    325   ],
    326   "cited_papers": [
    327     {
    328       "title": "CodeCompose: A Large-Scale Industrial Deployment of AI-assisted Code Authoring",
    329       "authors": ["Vijayaraghavan Murali", "Chandra Maddila", "Imad Ahmad", "Michael Bolin", "Daniel Cheng", "Negar Ghorbani", "Renuka Fernandez", "Nachiappan Nagappan"],
    330       "year": 2023,
    331       "arxiv_id": "2305.12050",
    332       "relevance": "Large-scale industrial deployment of AI code completion at Meta, directly comparable industrial experience paper."
    333     },
    334     {
    335       "title": "Improving language models by retrieving from trillions of tokens",
    336       "authors": ["Sebastian Borgeaud"],
    337       "year": 2022,
    338       "relevance": "RETRO method for retrieval-augmented generation, relevant to RAG approaches for code completion."
    339     },
    340     {
    341       "title": "Incoder: A generative model for code infilling and synthesis",
    342       "authors": ["Daniel Fried"],
    343       "year": 2022,
    344       "arxiv_id": "2204.05999",
    345       "relevance": "Fill-in-the-middle code generation model, directly relevant to code completion methodology."
    346     },
    347     {
    348       "title": "Ml-enhanced code completion improves developer productivity",
    349       "authors": ["Maxim Tabachnyk", "Stoyan Nikolov"],
    350       "year": 2022,
    351       "relevance": "Google's ML-enhanced code completion productivity study, directly comparable evidence on code completion usefulness."
    352     }
    353   ]
    354 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs