scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23082B)
      1 {
      2   "paper": {
      3     "title": "COCOMIC: Code Completion By Jointly Modeling In-file and Cross-file Context",
      4     "authors": [
      5       "Yangruibo Ding",
      6       "Zijian Wang",
      7       "Wasi Uddin Ahmad",
      8       "Murali Krishna Ramanathan",
      9       "Ramesh Nallapati",
     10       "Parminder Bhatia",
     11       "Dan Roth",
     12       "Bing Xiang"
     13     ],
     14     "year": 2022,
     15     "venue": "arXiv",
     16     "arxiv_id": "2212.10007"
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "A GitHub repository is provided: https://github.com/amazon-science/cocomic (footnote 1 in Section 1)."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper states they release a dataset for statement-level code completion (contribution 3: 'We release a diverse and high-quality dataset on statement-level code completion'). The dataset is linked via the same GitHub repository."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions using Transformers (Wolf et al., 2020) and training on 8 Nvidia A100s (Appendix D), but does not provide a requirements.txt, Dockerfile, or detailed dependency list with versions."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions are provided in the paper. Appendix D mentions hardware and training time but does not give commands or scripts to reproduce the experiments."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results in Tables 1-5 and 10 are reported as point estimates without confidence intervals or error bars."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims COCOMIC 'significantly outperforms' baselines (e.g., Table 4 caption) but no statistical significance tests (p-values, t-tests, etc.) are reported."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Effect sizes are reported as relative percentage improvements with baseline context. For example, '33.94% relative increase in exact match' comparing COCOMIC (21.39%) to the finetuned CodeGen baseline (15.97%). Tables provide raw numbers enabling readers to compute effect sizes."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The test set contains 6,888 prompts (Section 5.1) but there is no justification for why this size is adequate, nor any power analysis."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Appendix D explicitly states: 'We perform one round of experiments only as it is very expensive to repeat the experiments many times.' No variance, standard deviation, or multi-run results are reported."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Multiple baselines are included: zero-shot CodeGen, finetuned CodeGen, and CodeGen with cross-file context prepended as plain text (Section 5.3, Table 1). Additional baseline variants are in Appendix E.2."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "CodeGen (Nijkamp et al., 2022) was contemporary to the paper's submission in December 2022. The concurrent work by Shrivastava et al. (2022) and Zhang et al. (2023) is discussed in related work."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Extensive ablation studies are provided: random vs. CCFINDER entities (Table 3), 1-hop vs. 2-hop retrieval (Table 3), [SUM] vs. mean pooling (Table 4), locale vs. MTL vs. no relations (Table 5), and additional baseline variants (Appendix E)."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Multiple metrics are used: Code Match EM, BLEU-4, Identifier Match EM, Identifier Precision, Identifier Recall, and Perplexity (Section 5.3, Table 1)."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No human evaluation is conducted. All evaluation is automated via exact match, BLEU-4, identifier match, and perplexity. Human evaluation of code quality or correctness would have strengthened the claims about practical utility."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The dataset is divided into 80%/10%/10% train/validation/test splits (Section 5.1). Additionally, the test set explicitly excludes projects used as dependencies by training projects to prevent information leakage."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "Results are reported only as aggregate metrics across the full test set. There is no breakdown by project size, complexity, number of cross-file dependencies, or programming pattern."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The Limitations section discusses failure cases: COCOMIC performs 5-7% lower than finetuned CodeGen when cross-file context is absent. Case studies in Appendix F show qualitative examples of both successes and baseline failures."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Negative results are reported: the MTL approach for encoding entity relations achieved 97.2% accuracy on the auxiliary task but 'hardly improves COCOMIC in code completion' (Section 6.4, Table 5). Also, COCOMIC degrades 5-7% without cross-file context (Limitations section)."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims '33.94% relative increase in exact match and a 28.69% relative increase in identifier matching' are directly supported by Table 1 (COCOMIC 21.39% vs. finetuned CodeGen 15.97% EM, and 31.26% vs. 24.29% ID EM)."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper makes causal claims about cross-file context improving code completion. These are justified through controlled ablation studies: comparing COCOMIC with/without cross-file context, random vs. CCFINDER entities, and different encoding strategies (Tables 1, 3, 4, 5)."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The Limitations section explicitly bounds generalization: 'Our work focuses on Python language' and acknowledges the technique is not tested on larger models ('our work does not substantiate that our proposed technique would boost the performance of language models of any size'). The paper also notes it is a 'proof-of-concept' for Python."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper does not discuss alternative explanations for the improvements. For example, it does not consider whether the gains come from additional training data exposure rather than the cross-file architecture, or whether the test set is biased toward cases where cross-file context is easy to retrieve."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The backbone model is specified as 'CodeGen-350M-Mono' (Section 5.2), a name that identifies an exact model variant, fixing both its size and its training-data configuration."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "COCOMIC is a fine-tuned model, not a prompting-based approach. The model is trained end-to-end with cross-file context encoded via the [SUM] mechanism, not through prompting an LLM."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Key hyperparameters are reported in Section 5.2: 'finetune the model for 5 epochs with max sequence length of 2,048 tokens and learning rate of 5e-5 with 5% warm-up steps then cosine annealing.' The retrieval uses k=2 hops, max 128 entities, each up to 128 tokens."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "COCOMIC is not an agentic system. It is a fine-tuned language model with a retrieval component (CCFINDER), not an agentic scaffold with tools, retry logic, or feedback mechanisms."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Data preprocessing is documented: projects filtered by size (5-5k Python files), train/val/test split (80/10/10), test set excludes dependency projects, prompts cut at cross-file context locations, samples filtered for AST parseability and local API calls, yielding 6,888 test prompts (Section 5.1, Appendix C)."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "There is a dedicated 'Limitations' section after Section 8 (Conclusion) with three substantive subsections: extension to other languages, performance without cross-file context, and impact on different model sizes."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "The limitations are specific to this study: Python-only scope, single model size (350M), degraded performance without cross-file context (5-7% drop), and the hypothesis that larger models would benefit similarly is explicitly not substantiated."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Scope boundaries are stated: Python only, project-level context only (not third-party packages), CodeGen-350M only, and the authors explicitly state 'our work does not substantiate that our proposed technique would boost the performance of language models of any size.'"
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The dataset is released via the GitHub repository (https://github.com/amazon-science/cocomic), and the data stems from the publicly available Python Package Index (PyPI)."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Data collection is described: projects were collected from PyPI, filtered for permissive licenses, and then filtered to remove those with too few (<=5) or too many (>=5k) Python files, resulting in 60,891 projects (Section 5.1)."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. The data source is a standard public repository (PyPI)."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The pipeline is documented: PyPI collection -> license filtering -> size filtering (60,891 projects) -> 80/10/10 split with dependency exclusion -> prompt creation at cross-file context locations -> AST parseability and local API filtering -> 6,888 test prompts. Filtering criteria are specified at each stage."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding source or acknowledgments section is present in the paper. Most authors are from AWS AI Labs, but there is no explicit funding disclosure."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: Yangruibo Ding from Columbia University, all other authors from AWS AI Labs (Amazon). Email addresses confirm the affiliations."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "Most authors are from AWS AI Labs (Amazon), which has a commercial interest in code completion tools. The funder (Amazon/AWS) is not independent of the outcome, as improved code completion directly benefits their products. This conflict is not disclosed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement or financial interest declaration is present in the paper. Authors at Amazon/AWS may hold equity or other financial interests related to code completion technology."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper uses CodeGen-350M-Mono as the backbone but does not state the training data cutoff date for this pretrained model."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": true,
    234         "justification": "The paper addresses train-test overlap by ensuring test set projects are not dependencies of training projects (Section 5.1: 'we only include projects that were not used as dependencies by any training projects in the test set'). However, this addresses their fine-tuning data, not the pretrained model's data."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The benchmark is built from PyPI, which is publicly available. The CodeGen pretrained model may have trained on PyPI packages, but this potential contamination between the pretrained model's training data and the benchmark is not discussed."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No inference cost or latency is reported. The paper does not mention how long CCFINDER takes to retrieve context or how much inference time the [SUM] encoding adds compared to standard CodeGen."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Appendix D states: 'We train our models on a machine with 8 Nvidia A100s. Each job takes around 50 hours (i.e., 400 GPU hours) to train.'"
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "COCOMIC improves over finetuned CodeGen by 33.94% relative increase in exact match for code completion.",
    295       "evidence": "Table 1 shows COCOMIC achieves 21.39% EM vs. finetuned CodeGen at 15.97% EM, a relative improvement of (21.39-15.97)/15.97 = 33.94%.",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "COCOMIC improves identifier matching by 28.69% relative increase over finetuned CodeGen.",
    300       "evidence": "Table 1 shows COCOMIC achieves 31.26% ID Match EM vs. finetuned CodeGen at 24.29%, a relative improvement of (31.26-24.29)/24.29 = 28.69%.",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "CCFINDER retrieves 27.07% more relevant identifiers than in-file context alone.",
    305       "evidence": "Table 2 shows in-file context covers 75.19% of identifiers, while adding cross-file context brings recall to 95.55%, a 27.07% relative increase (20.36 percentage points). This measures identifier coverage, not end-to-end model performance.",
    306       "supported": "strong"
    307     },
    308     {
    309       "claim": "The [SUM] token representation significantly outperforms mean pooling for cross-file context encoding.",
    310       "evidence": "Table 4 shows [SUM] achieves 21.39% Code EM vs. mean pooling at 16.78%, and 31.26% ID EM vs. 25.01%.",
    311       "supported": "moderate"
    312     },
    313     {
    314       "claim": "Adding cross-file context does not degrade perplexity when cross-file context is not required.",
    315       "evidence": "Table 1 shows COCOMIC achieves the lowest perplexity (2.69) across all settings, lower than all baselines including those without cross-file context.",
    316       "supported": "moderate"
    317     },
    318     {
    319       "claim": "Locales improve cross-file context representation while multi-task learning with edge prediction does not.",
    320       "evidence": "Table 5 shows locale-based COCOMIC (21.39% EM) outperforms no-relations (20.27%) and MTL (20.01%). MTL achieves 97.2% edge prediction accuracy but does not improve code completion.",
    321       "supported": "moderate"
    322     }
    323   ],
    324   "methodology_tags": [
    325     "benchmark-eval"
    326   ],
    327   "key_findings": "COCOMIC, a framework that jointly models in-file and cross-file context for code completion, achieves a 33.94% relative improvement in exact match over a finetuned CodeGen-350M baseline on a Python dataset from PyPI. The key innovation is compressing cross-file entities into single-token representations via a [SUM] token, enabling the model to incorporate up to 16,384 tokens of cross-file context within limited input length. Ablation studies show that the static retrieval tool CCFINDER retrieves 27.07% more relevant identifiers than in-file context alone, and that the [SUM] representation substantially outperforms mean pooling. However, all results are single-run on one model size (350M) and one language (Python), limiting generalizability.",
    328   "red_flags": [
    329     {
    330       "flag": "Single-run experiments",
    331       "detail": "Appendix D explicitly states 'We perform one round of experiments only as it is very expensive to repeat the experiments many times.' Without variance estimates, it is impossible to assess whether the reported improvements are stable or due to training randomness."
    332     },
    333     {
    334       "flag": "No significance tests",
    335       "detail": "Claims of 'significantly outperforms' (e.g., Table 4 caption) are made without any statistical significance tests. All reported differences are raw number comparisons."
    336     },
    337     {
    338       "flag": "Company evaluating its own product",
    339       "detail": "7 of 8 authors are from AWS AI Labs (Amazon). Amazon has a commercial interest in code completion tools. No conflict-of-interest statement is provided."
    340     },
    341     {
    342       "flag": "Potential pretrained model contamination",
    343       "detail": "The benchmark is built from PyPI (public Python packages). CodeGen was pretrained on GitHub code which likely includes many of these same packages. The paper addresses fine-tuning data leakage but not pretrained model contamination."
    344     }
    345   ],
    346   "cited_papers": [
    347     {
    348       "title": "Evaluating large language models trained on code",
    349       "authors": ["Mark Chen", "Jerry Tworek"],
    350       "year": 2021,
    351       "relevance": "Introduces Codex and HumanEval, foundational work on LLM-based code generation evaluation."
    352     },
    353     {
    354       "title": "CodeGen: An open large language model for code with multi-turn program synthesis",
    355       "authors": ["Erik Nijkamp", "Bo Pang"],
    356       "year": 2022,
    357       "relevance": "The backbone model used in COCOMIC; open-source code LLM for code generation."
    358     },
    359     {
    360       "title": "Grounded copilot: How programmers interact with code-generating models",
    361       "authors": ["Shraddha Barke", "Michael B James", "Nadia Polikarpova"],
    362       "year": 2022,
    363       "relevance": "Studies developer interaction with AI code generation tools, relevant to understanding productivity impact."
    364     },
    365     {
    366       "title": "Repository-level prompt generation for large language models of code",
    367       "authors": ["Disha Shrivastava", "Hugo Larochelle", "Daniel Tarlow"],
    368       "year": 2022,
    369       "relevance": "Proposes repository-level prompt engineering for code LLMs, a closely related approach to cross-file context."
    370     },
    371     {
    372       "title": "RepoCoder: Repository-level code completion through iterative retrieval and generation",
    373       "authors": ["Fengji Zhang", "Bei Chen"],
    374       "year": 2023,
    375       "arxiv_id": "2303.12570",
    376       "relevance": "Concurrent work on repository-level code completion using iterative retrieval, directly comparable approach."
    377     },
    378     {
    379       "title": "DocCoder: Generating code by retrieving and reading docs",
    380       "authors": ["Shuyan Zhou", "Uri Alon", "Frank F Xu"],
    381       "year": 2022,
    382       "relevance": "Retrieval-augmented code generation using API documentation, related approach to using external context for code completion."
    383     },
    384     {
    385       "title": "A systematic evaluation of large language models of code",
    386       "authors": ["Frank F Xu", "Uri Alon", "Graham Neubig"],
    387       "year": 2022,
    388       "relevance": "Systematic evaluation methodology for code LLMs, relevant to benchmarking and evaluation design."
    389     },
    390     {
    391       "title": "InCoder: A generative model for code infilling and synthesis",
    392       "authors": ["Daniel Fried", "Armen Aghajanyan"],
    393       "year": 2022,
    394       "relevance": "Code infilling model relevant to code completion approaches and evaluation."
    395     },
    396     {
    397       "title": "Competition-level code generation with AlphaCode",
    398       "authors": ["Yujia Li", "David Choi"],
    399       "year": 2022,
    400       "relevance": "Large-scale code generation system evaluated on competitive programming, relevant to code generation capability assessment."
    401     },
    402     {
    403       "title": "Survey of hallucination in natural language generation",
    404       "authors": ["Ziwei Ji", "Nayeon Lee"],
    405       "year": 2022,
    406       "relevance": "Discusses hallucination in generation models, which COCOMIC addresses by providing cross-file context to reduce code hallucination."
    407     }
    408   ]
    409 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs