scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20687B)
      1 {
      2   "paper": {
      3     "title": "A Combined Approach of Program Analysis and Deep Learning for Code Completion",
      4     "authors": ["Yi Liu"],
      5     "year": 2024,
      6     "venue": "Scientific Journal of Technology",
      7     "doi": "10.54691/hkyc3a89"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The paper mentions 'public datasets for Python and JavaScript' but does not name the specific datasets, provide download links, or cite the data sources."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specifications, library versions, requirements files, or hardware details are provided."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No reproduction instructions, README, or scripts are mentioned. The methodology section describes the approach conceptually but does not provide enough detail to reproduce the experiments."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Table 1 reports only point estimates (e.g., '82.16% accuracy') with no confidence intervals, error bars, or uncertainty measures."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims 'our method significantly outperforms existing approaches' in the abstract but provides no statistical significance tests. The comparison in Table 1 appears to show two columns per language but no baselines are clearly labeled or tested for significance."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No effect sizes are reported. Results are presented as raw accuracy and MRR numbers without baseline context showing the magnitude of improvement."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is given for the dataset sizes. The paper says datasets were randomly split 70/15/15 but never states the total number of examples."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or results across multiple runs are reported. Only single point estimates appear in Table 1."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "Table 1 appears to have two columns per language but the columns are not labeled with baseline names. The abstract claims the method 'significantly outperforms existing approaches' but no explicit baseline comparisons with named prior methods are presented."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No baselines are clearly identified. The references cite work from 2009-2022 but no specific prior methods are compared against in the experiments."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The system has multiple components (program analysis, GGNN, Transformer, MLP) but no ablation study is conducted to determine the contribution of each component."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 1 reports both Accuracy and Mean Reciprocal Rank (MRR) for both Python and JavaScript datasets."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation is included. The paper claims the approach 'enhances development efficiency and code quality' but evaluation is entirely automated metrics (accuracy and MRR). Human evaluation of code completion quality would be relevant to these claims."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 3 describes a 70/15/15 split for training, validation, and test sets: 'Randomly select training, validation, and test sets from the Python and JavaScript datasets, with the training set comprising 70%, the validation set 15%, and the test set 15%.'"
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "Results are reported only as aggregate accuracy and MRR per language. No per-category breakdown (e.g., by code construct type, complexity level, or project type) is provided."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 4 discusses failure modes: 'the experiments observed a decline in model performance in more complex code structures, such as nested loops and deep function calls.' Section 5 also acknowledges 'limitations in the current model's performance when dealing with highly nested code and multilevel function calls.'"
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No negative results are reported. The paper mentions performance declines in complex structures qualitatively but does not show any quantitative negative results, failed approaches, or ablations that hurt performance."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "The abstract claims 'our method significantly outperforms existing approaches in terms of code completion accuracy and Mean Reciprocal Rank.' However, no clear baseline comparisons are presented in the results — Table 1 shows numbers but without labeled baselines, so the claim of outperformance is not verifiably supported."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal claims such as 'by integrating program analysis techniques and deep learning methods, our code completion system can effectively identify and complete code snippets, thereby enhancing development efficiency and code quality.' Without ablation studies isolating the contribution of program analysis vs. deep learning, these causal attributions are not justified."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title broadly claims 'A Combined Approach of Program Analysis and Deep Learning for Code Completion' without bounding to the tested setting. While the paper tests on Python and JavaScript, the conclusions in Section 5 speak generally about 'enhancing the accuracy and efficiency of code completion' without qualifying that results are limited to these two languages and specific unnamed datasets."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No alternative explanations for the results are discussed. The paper does not consider whether improvements might be due to dataset characteristics, increased model capacity, or other confounding factors."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "No specific model versions, library versions, or implementation details are provided. The paper describes GGNN and Transformer architecturally but does not specify which implementations or versions were used."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "This paper does not use prompting. It trains a neural network model (GGNN + Transformer) from scratch rather than prompting an LLM."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Only learning rate (0.001) and batch size (128) are mentioned along with the Adam optimizer. Critical hyperparameters like embedding dimension D, number of GGNN layers, number of Transformer heads/layers, dropout rate, number of training epochs, and L2 regularization coefficient are not reported."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. This is a standard neural network training and inference pipeline."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper mentions 'code is collected from open-source community websites like GitHub and Stack Overflow' and 'after cleansing' but does not describe the cleansing procedure, filtering criteria, how many examples were collected, or any details of the preprocessing pipeline."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no dedicated limitations or threats-to-validity section. Limitations are briefly mentioned in Section 5 (Conclusion) as a single sentence about nested code, but this does not constitute a substantive limitations section."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No threats to validity are discussed. The brief mention of performance decline on complex structures in Section 4 is an observation about results, not a discussion of threats to the study's validity."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No explicit scope boundaries are stated. The paper does not specify what the results do NOT show or what settings are excluded from the claims."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw data is made available. The paper references 'public datasets' but does not name them, provide links, or release processed data."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "Data collection is described only vaguely: 'code is collected from open-source community websites like GitHub and Stack Overflow.' No specifics about time period, selection criteria, number of repositories, or filtering are provided."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are involved. The data comes from public code repositories, which is a standard benchmark-style data source."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The data pipeline from raw code to training examples is not documented. The paper jumps from 'code is collected' to 'program analysis is performed' without detailing intermediate steps, filtering counts, or transformation procedures."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Acknowledgments section states: 'This work is supported by the Research Project of Hunan Provincial Education Department [Grant No.22C0600].'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The author's affiliation is clearly stated: 'School of Information, Hunan University of Humanities Science and Technology, Loudi, Hunan, 417000, China.'"
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The funder is the Hunan Provincial Education Department, a government education body with no apparent financial interest in the outcome of code completion research."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper trains its own model from scratch on code datasets rather than evaluating a pre-trained model's capability on a benchmark. Contamination in the LLM sense does not apply."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Not applicable — the paper trains a custom model from scratch on its own train/validation/test split rather than evaluating a pre-trained LLM on a benchmark, so LLM-style contamination does not apply. Standard train/test overlap is addressed by the 70/15/15 split."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Not applicable — no pre-trained model is evaluated on a public benchmark. The model is trained and tested on the paper's own data splits."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants are involved in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost, latency, or tokens-per-prediction are reported despite the method being proposed for use in IDEs where latency is critical."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No training time, GPU hours, hardware specifications, or computational budget are stated."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "The combined program analysis and deep learning method significantly outperforms existing approaches in code completion accuracy and MRR.",
    286       "evidence": "Abstract and Section 3 (Table 1) report accuracy of 82.16% and MRR of 87.02% for Python, and accuracy of 81.37% and MRR of 86.45% for JavaScript. However, no baseline methods are clearly identified or compared against.",
    287       "supported": "unsupported"
    288     },
    289     {
    290       "claim": "The approach enhances development efficiency and code quality by providing contextually relevant code snippets.",
    291       "evidence": "Section 3 states the results 'demonstrate that by integrating program analysis techniques and deep learning methods, our code completion system can effectively identify and complete code snippets, thereby enhancing development efficiency and code quality.' No user studies or efficiency measurements are provided.",
    292       "supported": "unsupported"
    293     },
    294     {
    295       "claim": "The system performs slightly better with Python code than with JavaScript.",
    296       "evidence": "Section 4 discusses this observation based on Table 1 numbers, where Python accuracy (82.16%) slightly exceeds JavaScript accuracy (81.37%). However, without statistical tests or variance measures, this small difference cannot be confirmed as meaningful.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "Model performance declines in more complex code structures such as nested loops and deep function calls.",
    301       "evidence": "Section 4 states: 'the experiments observed a decline in model performance in more complex code structures, such as nested loops and deep function calls.' No quantitative evidence for this observation is provided.",
    302       "supported": "weak"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval"],
    306   "key_findings": "The paper proposes combining program analysis (inter-procedural backward slicing to construct program graphs) with GGNN and Transformer models for code completion. On unnamed Python and JavaScript datasets, it reports accuracy of ~82% and ~81% and MRR of ~87% and ~86% respectively. However, the lack of clearly identified baselines, named datasets, ablation studies, and statistical analysis makes the results difficult to interpret or verify.",
    307   "red_flags": [
    308     {
    309       "flag": "No identifiable baselines",
    310       "detail": "The abstract claims the method 'significantly outperforms existing approaches' but Table 1 does not clearly label any baseline methods. The two columns per language may represent before/after but this is never explained, making the claim of outperformance unverifiable."
    311     },
    312     {
    313       "flag": "Unnamed datasets",
    314       "detail": "The paper refers to 'public datasets for Python and JavaScript' without naming the specific datasets, providing download links, or citing them. This makes the results completely unreproducible."
    315     },
    316     {
    317       "flag": "No uncertainty quantification",
    318       "detail": "All results are single point estimates with no error bars, confidence intervals, variance across runs, or significance tests despite claims of significant outperformance."
    319     },
    320     {
    321       "flag": "Missing critical experimental details",
    322       "detail": "Key details like dataset sizes, model architecture dimensions (embedding size, number of layers/heads), training epochs, and hardware are absent, making reproduction impossible."
    323     },
    324     {
    325       "flag": "Possible table formatting error",
    326       "detail": "Table 1 shows '8645%' for JavaScript MRR, which appears to be a formatting error (likely 86.45%). This raises questions about the care taken in preparing the results."
    327     },
    328     {
    329       "flag": "Venue quality concern",
    330       "detail": "Published in 'Scientific Journal of Technology' (ISSN: 2688-8645), which does not appear to be a well-known peer-reviewed venue in software engineering or machine learning. The paper is very short (~5 pages) and lacks the methodological depth expected for the claims made."
    331     }
    332   ],
    333   "cited_papers": [
    334     {
    335       "title": "On the naturalness of software",
    336       "authors": ["A. Hindle", "E. T. Barr", "M. Gabel"],
    337       "year": 2016,
    338       "relevance": "Foundational work on treating code as natural language, which motivates code completion using language models."
    339     },
    340     {
    341       "title": "The hidden cost of code completion: Understanding the impact of the recommendation-list length on its efficiency",
    342       "authors": ["X. Jin", "F. Servant"],
    343       "year": 2018,
    344       "relevance": "Studies the practical impact of code completion tools on developer productivity."
    345     },
    346     {
    347       "title": "Learning from examples to improve code completion systems",
    348       "authors": ["M. Bruch", "M. Monperrus", "M. Mezini"],
    349       "year": 2009,
    350       "relevance": "Early work on ML-based code completion systems, relevant as a baseline approach."
    351     },
    352     {
    353       "title": "A unified multi-task learning model for ast-level and token-level code completion",
    354       "authors": ["F. Liu", "G. Li", "B. Wei", "X. Xia", "Z. Fu", "Z. Jin"],
    355       "year": 2022,
    356       "relevance": "Recent work on AST-based code completion using multi-task learning, directly comparable to this paper's approach."
    357     },
    358     {
    359       "title": "Probabilistic model for code with decision trees",
    360       "authors": ["V. Raychev", "P. Bielik", "M. Vechev"],
    361       "year": 2016,
    362       "relevance": "Probabilistic approach to code modeling that represents a baseline category for code completion."
    363     }
    364   ]
    365 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs