scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19091B)
      1 {
      2   "paper": {
      3     "title": "CROSSCODEEVAL: A Diverse and Multilingual Benchmark for Cross-File Code Completion",
      4     "authors": ["Yangruibo Ding", "Zijian Wang", "Wasi Uddin Ahmad", "Hantian Ding", "Ming Tan", "Nihal Jain", "Murali Krishna Ramanathan", "Ramesh Nallapati", "Parminder Bhatia", "Dan Roth", "Bing Xiang"],
      5     "year": 2023,
      6     "venue": "NeurIPS 2023",
      7     "arxiv_id": "2310.11248"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The paper provides a project website (https://crosscodeeval.github.io) and the benchmark is publicly available."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The benchmark dataset is released via the project website. Built from permissively-licensed GitHub repositories in Python, Java, TypeScript, and C#."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions using the Transformers library but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions or README with commands are described in the paper. Experimental setup is described at a high level but not with reproducible scripts."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results in Tables 2 and 7 report only point estimates (e.g., EM, ES percentages) with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims cross-file context leads to 'significant improvements' but uses no statistical significance tests. Comparisons are based on raw number differences only."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports relative improvements over the in-file-only baseline (e.g., 'up to 3.0x and 4.5x better exact code match when including retrieved and retrieved with reference context respectively'). Tables show absolute scores in every context setting."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification for the number of benchmark examples per language or why this size is sufficient. Dataset statistics are reported (Table 1) but no power analysis or sample size rationale."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or spread measures reported across runs. All results are single-run point estimates."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Multiple baselines are included: in-file context only (standard baseline), retrieved cross-file context, and retrieval with reference context. Multiple models (CodeGen variants, StarCoder variants, GPT-3.5-turbo) are compared."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include StarCoder (2023) and GPT-3.5-turbo, which were state-of-the-art code LMs at the time of publication."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Appendix D contains additional evaluation results and ablations. Section 3.5 analyzes retrieval quality with different context settings (in-file only, retrieved, retrieval with reference)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Four metrics are used: code exact match (EM), code edit similarity (ES), identifier EM, and identifier F1 score (Section 3.2)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of model outputs is performed. All evaluation is automated via code match and identifier match metrics. Human evaluation could have assessed code quality beyond exact match."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The benchmark is constructed from repositories created after specific dates to avoid overlap with model training data. The evaluation set is separate from any training data."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down per programming language (Python, Java, TypeScript, C#) in Tables 2 and 7. Section 3.5 provides per-category retrieval quality analysis."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 3.5 analyzes cases where retrieval fails, with qualitative examples in Figures 9-10 and Appendix. The Limitations section discusses retrieval quality issues."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that the RG retrieval framework is 'not perfect' and identifies cases where it retrieves useless information. Footnote 8 notes they failed to execute the code from Zhang et al. (2023)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims that CrossCodeEval is 'extremely challenging' without cross-file context and shows 'clear improvements' with it. Tables 2 and 7 support both claims with detailed metrics."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The main causal claim is that adding cross-file context improves performance. This is tested via controlled experiments where the only variable changed is the presence/absence of cross-file context in the prompt. This is adequate for the claim."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper bounds claims to the four tested languages (Python, Java, TypeScript, C#) and the specific models tested. The Limitations section acknowledges zero-shot-only evaluation and retrieval quality constraints."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The Limitations section discusses memorization as an alternative explanation for performance, and the paper addresses potential data leakage by filtering repos by creation date. Section 3.5 discusses retrieval quality as a confound."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Models are specified by family and size (e.g., CodeGen-2B-mono, StarCoder-15.5B) but no specific checkpoint versions or snapshot dates are given. GPT-3.5-turbo is used without a version identifier."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Figure 3 shows the prompt construction with cross-file context template. The prompt format is described in detail in Section 3.3 and Appendix C with concrete examples."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 3.3 reports max sequence lengths (2048, 4096, 8192), max generation length (50), and greedy decoding. Appendix D.2 reports temperature=0.2 for nucleus sampling experiments."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The approach is straightforward prompt-based code completion with retrieval-augmented generation."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Sections 2.1-2.3 document the full pipeline: repository collection criteria, static analysis filtering, cross-file dependency identification, deduplication, and post-processing. Appendices A and B provide additional details."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Appendix E contains a dedicated 'Limitations' section covering zero-shot evaluation, cross-file context retrieval quality, and memorization concerns."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The Limitations section discusses specific threats: zero-shot-only evaluation limiting generalizability, RG framework's fixed context window causing retrieval failures, and potential memorization of evaluation data by code LMs."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 2.4 explicitly states the scope (4 languages, statement-level completion) and future extensions. The Limitations section states that evaluation is zero-shot only; no few-shot evaluation was performed."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The benchmark data is publicly available via the project website (https://crosscodeeval.github.io), allowing independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 2.1 describes collecting permissively licensed repos from GitHub created after specific dates, with star-count filtering and language restrictions."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. The data source is a benchmark constructed from public GitHub repositories."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Sections 2.1-2.3 document the full pipeline from repository collection to final benchmark examples, including filtering criteria at each stage. Table 1 provides final statistics."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section found in the paper. Most authors are from AWS AI Labs, but no funding disclosure is provided."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: Columbia University and AWS AI Labs."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "Most authors are AWS AI Labs employees. AWS/Amazon has commercial interest in code completion tools, and the funder (employer) is not independent of the outcome. No disclosure of this conflict."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement found in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper does not state the training data cutoff dates for the models evaluated (CodeGen, StarCoder, GPT-3.5-turbo). They mitigate contamination by filtering repos by creation date but don't state model cutoffs."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Section 2.1 explicitly addresses this: repos were selected with creation dates after training data cutoffs of existing code LMs. The Limitations section discusses memorization risks. Popular packages are excluded from annotation."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "The benchmark is newly created from repos with creation dates designed to post-date model training data. Section 2.1 describes temporal filtering to mitigate contamination. Limitations section acknowledges remaining risk."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost, API costs, or wall-clock time reported for running the benchmark evaluations across multiple models."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No mention of GPU hours, hardware used, or total computational budget for running experiments."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CrossCodeEval is extremely challenging when relevant cross-file context is absent, with the best model (StarCoder 15.5B) achieving only 8.82% code exact match in Python.",
    286       "evidence": "Table 2, Section 3.4: StarCoder 15.5B reports 8.82% code EM in Python with in-file context only.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Adding retrieved cross-file context improves performance by up to 3.0x (retrieved) and 4.5x (retrieval with reference) in code exact match.",
    291       "evidence": "Table 2, Section 3.4: StarCoder model shows these improvement factors across settings.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "The static-analysis-based approach effectively identifies examples that strictly require cross-file context for accurate completion.",
    296       "evidence": "Section 2.2-2.3: The approach uses tree-sitter parsing to identify cross-file identifier usage. Performance gap between in-file-only and cross-file-context settings validates this (Table 2).",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "The RG retrieval framework is imperfect, sometimes retrieving useless information.",
    301       "evidence": "Section 3.5 analyzes retrieval quality and identifies failure cases. The gap between 'retrieved' and 'retrieval with reference' settings quantifies this limitation.",
    302       "supported": "strong"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval"],
    306   "key_findings": "CrossCodeEval is a multilingual cross-file code completion benchmark in Python, Java, TypeScript, and C# that uses static analysis to ensure examples require cross-file context. State-of-the-art code LMs perform poorly without cross-file context (e.g., 8.82% EM for StarCoder on Python) but improve dramatically (up to 4.5x) when cross-file context is provided via retrieval. The benchmark shows that existing single-file benchmarks like HumanEval do not capture the cross-file dependencies that code completion in realistic multi-file settings requires.",
    307   "red_flags": [
    308     {
    309       "flag": "No uncertainty quantification",
    310       "detail": "All results are single-run point estimates with no error bars, confidence intervals, or variance across runs. This makes it impossible to assess whether observed differences are statistically meaningful."
    311     },
    312     {
    313       "flag": "Corporate conflict of interest not disclosed",
    314       "detail": "Most authors are AWS AI Labs employees. AWS has commercial interest in code completion tools. No competing interests statement is provided."
    315     }
    316   ],
    317   "cited_papers": [
    318     {
    319       "title": "Evaluating Large Language Models Trained on Code",
    320       "authors": ["Mark Chen"],
    321       "year": 2021,
    322       "arxiv_id": "2107.03374",
    323       "relevance": "Introduces HumanEval, a foundational code generation benchmark that CrossCodeEval critiques as insufficient for cross-file scenarios."
    324     },
    325     {
    326       "title": "Program Synthesis with Large Language Models",
    327       "authors": ["Jacob Austin"],
    328       "year": 2021,
    329       "relevance": "Introduces MBPP benchmark, another single-file code completion benchmark that CrossCodeEval aims to complement."
    330     },
    331     {
    332       "title": "StarCoder: may the source be with you!",
    333       "authors": ["Raymond Li"],
    334       "year": 2023,
    335       "arxiv_id": "2305.06161",
    336       "relevance": "Key baseline model evaluated in CrossCodeEval; represents state-of-the-art open code LMs."
    337     },
    338     {
    339       "title": "A Conversational Paradigm for Program Synthesis",
    340       "authors": ["Erik Nijkamp"],
    341       "year": 2023,
    342       "relevance": "CodeGen model family used as baseline in CrossCodeEval experiments."
    343     },
    344     {
    345       "title": "Repository-Level Prompt Generation for Large Language Models of Code",
    346       "authors": ["Disha Shrivastava"],
    347       "year": 2023,
    348       "relevance": "Proposes repository-level code generation framework; concurrent work to CrossCodeEval in the cross-file code completion space."
    349     },
    350     {
    351       "title": "RepoCoder: Repository-Level Code Completion Through Iterative Retrieval and Generation",
    352       "authors": ["Fengji Zhang"],
    353       "year": 2023,
    354       "relevance": "Concurrent work building repository-level code completion benchmark; the RG framework adopted in CrossCodeEval is inspired by this work."
    355     },
    356     {
    357       "title": "CodeBERTScore: Evaluating Code Generation with Pretrained Models of Code",
    358       "authors": ["Shuyan Zhou"],
    359       "year": 2023,
    360       "relevance": "Proposes alternative code evaluation metrics relevant to assessing code generation quality."
    361     },
    362     {
    363       "title": "InCoder: A Generative Model for Code Infilling and Synthesis",
    364       "authors": ["Daniel Fried"],
    365       "year": 2023,
    366       "relevance": "Code LM with fill-in-the-middle capability relevant to code completion benchmarking."
    367     }
    368   ]
    369 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs