scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21421B)
      1 {
      2   "paper": {
      3     "title": "ComplexCodeEval: A Benchmark for Evaluating Large Code Models on More Complex Code",
      4     "authors": ["Jia Feng", "Jiachen Liu", "Cuiyun Gao", "Chun Yong Chong", "Chaozheng Wang", "Shan Gao", "Xin Xia"],
      5     "year": 2024,
      6     "venue": "ASE '24",
      7     "arxiv_id": "2409.10280",
      8     "doi": "10.1145/3691620.3695552"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": true,
     15         "justification": "GitHub repository URL provided: https://github.com/ComplexCodeEval/ComplexCodeEval (reference [1]). The paper states 'ComplexCodeEval, its construction tools, and all experimental results have been open-sourced.'"
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The benchmark dataset is released at the same GitHub repository. The paper explicitly states the benchmark and construction tools are open-sourced."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The paper mentions 'two A100-80G GPUs' and greedy decoding settings, but no requirements.txt, Dockerfile, or detailed environment specification with library versions is provided in the paper."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No step-by-step reproduction instructions are provided in the paper. The experimental setup is described at a high level but lacks specific commands or scripts to replicate results."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Results are reported as point estimates only (e.g., CodeBLEU scores, F1 scores) with no confidence intervals or error bars."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper makes comparative claims (e.g., 'CodeLlama-34B achieves the highest CodeBLEU score') but uses no statistical significance tests. Comparisons are based solely on raw numbers."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper reports percentage improvements with baseline context, e.g., 'incorporating full contextual information increase the average CodeBLEU scores of all LCMs in Java and Python code generation by 70.73% and 31.90%' and absolute deltas in Table 6 for data leakage analysis."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper selects 84-100 samples per task per language (Section 4.5) without justifying why this number is sufficient or conducting a power analysis."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Standard deviation (σ) is reported in Table 3 across models for each metric, showing variance across the different LCMs evaluated."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper compares performance across 10 LCMs and also compares ComplexCodeEval results against existing benchmarks (CoderEval, CrossCodeEval, Method2Test, APIbench) in Table 8."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Models evaluated include contemporary LCMs: DeepSeek-Coder (2024), StarCoder2 (2024), CodeLlama (2023), and GPT-3.5-Turbo. These were state-of-the-art at time of writing."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "RQ2 systematically ablates contextual information (D+S, D+S+I, D+S+C, D+S+C+Dep) to measure the contribution of each context type in Tables 4 and 5."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Multiple metrics used: CodeBLEU, Edit Similarity, BLEU, Exact Match, F1, and Recall across different tasks."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "No human evaluation of model outputs is included. All evaluation is automated via CodeBLEU, BLEU, ES, F1, and Recall metrics. Human evaluation was only used for docstring quality during benchmark construction, not for evaluating system outputs."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The paper uses temporal splits based on model knowledge cutoff dates: 'specifically from data after September 15, 2023, based on the model's knowledge cut-off time' (Section 4.5)."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Results are broken down by task (code generation, completion, test case generation, API recommendation), by language (Java, Python), by model, and by context condition."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "No qualitative failure analysis or specific error examples are discussed. The paper reports aggregate metrics without examining why models fail on particular examples."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper reports cases where data leakage did not consistently improve performance (e.g., DSC-33B Python code generation CodeBLEU decreased by 0.21 on leaked data in Table 6), and notes inconsistent patterns across models."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Abstract claims about LCM limitations, context impact, and data leakage effects are all supported by experimental results in Sections 5.1-5.3 with specific numbers matching the abstract's claims."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper makes causal claims about context improving performance, tested via controlled ablation (adding context incrementally). The data leakage analysis uses temporal splits as a quasi-experimental design. These constitute adequate controlled single-variable manipulations."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The title claims evaluation of 'Large Code Models' generally, but testing is limited to Java and Python with 10 specific models. The paper does not explicitly bound generalizations to these languages and models in the abstract or conclusion."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper does not discuss alternative explanations for observed performance differences. For example, the data leakage analysis does not consider that code from different time periods may differ in complexity or style independent of leakage."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "Models are named by family and size (e.g., 'DeepSeek-Coder-33B', 'GPT-3.5-Turbo') but no specific version identifiers or snapshot dates are provided for the API models. Table 2 lists training cutoff dates but not model version identifiers."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The docstring generation prompt template is shown in Figure 4, but the actual prompts used for the four evaluation tasks (code generation, completion, test case generation, API recommendation) are not provided. The paper describes task inputs at a high level without showing exact prompt text."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Section 4.5 states: 'we set the temperature to 0, top_p to 1, and max_tokens to 1000 (with the API-recommended setting being 10).'"
    144       },
    145       "scaffolding_described": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No agentic scaffolding is used. Models are evaluated via direct prompting for each task."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 3 describes the full data collection and preprocessing pipeline in detail: library selection via SourceRank, repository filtering by stars (>99), AST-based API extraction, test case matching, deduplication, and docstring generation. Specific numbers are provided at each stage."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 6.3 'Threats To Validity' discusses threats in benchmark construction and empirical study."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section 6.3 discusses specific threats: timestamp fallback mechanism for missing git commit data, and computing resource constraints limiting model selection. These are specific to this study."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "The paper does not explicitly state what its results do NOT show. It acknowledges limited model coverage but does not state which languages, domains, or model types fall outside its claims."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The benchmark dataset and experimental results are open-sourced at the GitHub repository (reference [1])."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 3 provides detailed data collection procedures: library selection from Libraries.io, repository selection from GitHub (>99 stars), AST-based API extraction, with specific criteria and counts at each stage."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants. Data source is GitHub repositories selected by objective criteria (star count, library dependencies)."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The full pipeline is documented in Section 3 with counts: 85,000+ repositories → 9,169 Java and 27,178 Python after dependency filtering → 3,897 Java and 7,184 Python samples after extraction and deduplication."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Acknowledgment section lists funding: National Natural Science Foundation of China (No. 62472126), Natural Science Foundation of Guangdong Province, Shenzhen-Hong Kong Jointly Funded Project, and others."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Author affiliations are listed: three authors from Huawei (Chun Yong Chong, Shan Gao, Xin Xia), others from universities. The industry affiliation is worth noting, although no Huawei products are directly evaluated."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Funding is from government research foundations (NSFC, Guangdong Province, Shenzhen). These funders have no stake in which LCM performs better on the benchmark."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests or financial interests statement is present in the paper. Three authors are affiliated with Huawei, but no disclosure statement addresses potential conflicts."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Table 2 lists training data cutoff dates for all models: DeepSeek-Coder and CodeLlama at 2023-01-01, StarCoder2 at 2023-09-14, GPT-3.5-Turbo at 2021-10-01."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "RQ3 (Section 5.3) is entirely dedicated to analyzing the impact of data leakage. The paper uses temporal splits and Exact Match metrics to detect and quantify train/test overlap."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "The benchmark deliberately includes timestamps for each sample to enable contamination-aware evaluation. Section 3.2.4 describes the time-tagging approach, and RQ3 directly measures the effect of contamination on performance."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants in the study. The docstring quality check by two authors is part of benchmark construction, not a human subjects study."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "No inference costs, API costs, or wall-clock times are reported despite using GPT-3.5-Turbo (a paid API) and running experiments on A100 GPUs."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "Hardware is mentioned (two A100-80G GPUs) but total GPU hours, training time, or API spend are not reported."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "LCMs exhibit suboptimal performance in complex development scenarios, with the best CodeBLEU scores of 27.54 (Python) and 34.08 (Java) for code generation.",
    287       "evidence": "Table 3 shows performance of 10 LCMs across 4 tasks. CodeLlama-34B achieves highest CodeBLEU in code generation (Section 5.1).",
    288       "supported": "strong"
    289     },
    290     {
    291       "claim": "Rich contextual information greatly enhances LCM performance, with full context increasing average Java code generation CodeBLEU by 70.73%.",
    292       "evidence": "Table 4 shows systematic context ablation (D+S → D+S+I → D+S+C → D+S+C+Dep) with consistent improvements across all models (Section 5.2).",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "Data leakage leads to overestimation of LCM performance, with average CodeBLEU increases of 1.22 (Java) and 3.10 (Python) on leaked vs. non-leaked data.",
    297       "evidence": "Table 6 compares performance on pre-cutoff vs. post-cutoff data with Exact Match metrics confirming leakage presence (Section 5.3).",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "LLM-generated docstrings improve code generation performance, with average CodeBLEU increasing by 3.33 compared to original docstrings.",
    302       "evidence": "Table 7 compares original vs. generated docstrings across 9 models in Python code generation (Section 6.1).",
    303       "supported": "moderate"
    304     }
    305   ],
    306   "methodology_tags": ["benchmark-eval"],
    307   "key_findings": "ComplexCodeEval introduces a multi-task benchmark with 11,081 samples from 3,162 GitHub repositories covering code generation, completion, test case generation, and API recommendation in Java and Python. The benchmark incorporates timestamps to detect and mitigate data leakage. Key findings show that LCMs still struggle with complex code (best CodeBLEU of 34.08 for Java and 27.54 for Python code generation), that adding full contextual information dramatically improves performance (average CodeBLEU gains of 70.73% for Java and 31.90% for Python code generation), and that data leakage inflates evaluation scores by 1-7 points depending on task and model.",
    308   "red_flags": [
    309     {
    310       "flag": "No statistical significance tests",
    311       "detail": "All comparative claims between models and conditions are based on raw metric differences without any significance testing. With only 84-100 samples per condition, observed differences could be due to chance."
    312     },
    313     {
    314       "flag": "Small evaluation sample size",
    315       "detail": "Only 84-100 samples per task per language are used for evaluation (Section 4.5), which is small for making broad claims about LCM capabilities. No justification for this sample size is provided."
    316     },
    317     {
    318       "flag": "Huawei author affiliations without conflict disclosure",
    319       "detail": "Three of seven authors are affiliated with Huawei, but no competing interests statement is provided. While no Huawei products are directly evaluated, the absence of a disclosure statement is a gap."
    320     },
    321     {
    322       "flag": "Variance metric is across models not runs",
    323       "detail": "The σ column in Table 3 appears to measure standard deviation across different models, not across multiple runs of the same model. Since greedy decoding is used (temperature=0), there is only one run per model, so variance across experimental runs is not assessed."
    324     }
    325   ],
    326   "cited_papers": [
    327     {
    328       "title": "Evaluating large language models trained on code",
    329       "authors": ["Mark Chen"],
    330       "year": 2021,
    331       "arxiv_id": "2107.03374",
    332       "relevance": "Introduces HumanEval, a foundational benchmark for code generation evaluation that ComplexCodeEval aims to improve upon."
    333     },
    334     {
    335       "title": "Program synthesis with large language models",
    336       "authors": ["Jacob Austin"],
    337       "year": 2021,
    338       "arxiv_id": "2108.07732",
    339       "relevance": "Introduces MBPP benchmark for evaluating LLMs on code generation tasks."
    340     },
    341     {
    342       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming–The Rise of Code Intelligence",
    343       "authors": ["Daya Guo"],
    344       "year": 2024,
    345       "arxiv_id": "2401.14196",
    346       "relevance": "One of the main LCM families evaluated in ComplexCodeEval."
    347     },
    348     {
    349       "title": "Code llama: Open foundation models for code",
    350       "authors": ["Baptiste Roziere"],
    351       "year": 2023,
    352       "arxiv_id": "2308.12950",
    353       "relevance": "One of the main LCM families evaluated, achieving best performance on several tasks."
    354     },
    355     {
    356       "title": "StarCoder 2 and The Stack v2: The Next Generation",
    357       "authors": ["Anton Lozhkov"],
    358       "year": 2024,
    359       "arxiv_id": "2402.19173",
    360       "relevance": "One of the main open-source LCM families evaluated in the benchmark."
    361     },
    362     {
    363       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    364       "authors": ["Naman Jain"],
    365       "year": 2024,
    366       "arxiv_id": "2403.07974",
    367       "relevance": "Related contamination-free benchmark using temporal updates, which ComplexCodeEval's timestamp approach is inspired by."
    368     },
    369     {
    370       "title": "CrossCodeEval: A Diverse and Multilingual Benchmark for Cross-File Code Completion",
    371       "authors": ["Yangruibo Ding"],
    372       "year": 2023,
    373       "relevance": "Cross-file code completion benchmark used as comparison point in the evaluation."
    374     },
    375     {
    376       "title": "Codereval: A benchmark of pragmatic code generation with generative pre-trained models",
    377       "authors": ["Hao Yu"],
    378       "year": 2024,
    379       "relevance": "Repository-level code generation benchmark used as direct comparison in Table 8."
    380     },
    381     {
    382       "title": "ClassEval: A manually-crafted benchmark for evaluating LLMs on class-level code generation",
    383       "authors": ["Xueying Du"],
    384       "year": 2023,
    385       "arxiv_id": "2308.01861",
    386       "relevance": "Class-level code generation benchmark compared in Table 1 for benchmark characteristics."
    387     },
    388     {
    389       "title": "EvoCodeBench: An Evolving Code Generation Benchmark Aligned with Real-World Code Repositories",
    390       "authors": ["Jia Li"],
    391       "year": 2024,
    392       "arxiv_id": "2404.00599",
    393       "relevance": "Repository-level evolving benchmark addressing data leakage, closely related to ComplexCodeEval's goals."
    394     },
    395     {
    396       "title": "Competition-level code generation with AlphaCode",
    397       "authors": ["Yujia Li"],
    398       "year": 2022,
    399       "relevance": "Large-scale competitive code generation system relevant to LCM capability evaluation."
    400     }
    401   ]
    402 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs