scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20518B)
      1 {
      2   "paper": {
      3     "title": "CORECODEBENCH: Decoupling Code Intelligence via Fine-Grained Repository-Level Tasks",
      4     "authors": ["Lingyue Fu", "Hao Guan", "Bolun Zhang", "Haowei Yuan", "Yaoming Zhu", "Jun Xu", "Zongyu Wang", "Lin Qiu", "Xunliang Cai", "Xuezhi Cao", "Weiwen Liu", "Weinan Zhang", "Yong Yu"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2507.05281"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub link provided: https://github.com/AGI-Eval-Official/CoreCodeBench and HuggingFace collection for data."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Dataset released on HuggingFace: https://huggingface.co/collections/tubehhh/corecodebench."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No mention of requirements.txt, Dockerfile, or detailed environment setup with library versions in the paper."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper. The paper references appendices for implementation details but does not include a reproducibility guide."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The paper states 'Confidence intervals and results for additional LLMs are provided in Appendix L' (Section 4.3)."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper uses Spearman and Pearson correlations but does not perform significance tests when claiming one model outperforms another. Rankings are compared by raw numbers without statistical tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Performance differences are reported with baseline context (e.g., 'pass rate disparity exceeding 35%', AC@1 and AC Rate scores with absolute values from which differences can be computed)."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification for why 12 repositories were chosen, or why 9 (later 12) LLMs were evaluated. The Multi-BugFix subset has only 10 instances with no power analysis."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Results are reported as single-run greedy decoding numbers. No variance across runs or seeds is reported for the main results."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares 9 state-of-the-art LLMs against each other and correlates with 8 external benchmarks (Section 4.6)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Models evaluated include GPT-5.2, Claude-4.5-Opus, Gemini-3-Pro, Kimi-K2, DeepSeek-V3.2 — all contemporary frontier models."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Multiple ablations: generator backbone sensitivity (Appendix B, Spearman ρ=1.0), Information Gain filtering ablation (Appendix C, Table 5), mask length impact (Section 4.5), task quantity impact (Section 4.5)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Two metrics used: AC@1 (all tests pass) and AC Rate (proportion of fixed test cases), defined in Section 3."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Large-scale manual inspection on 360/511 development problems (70.5%) by experienced engineers evaluating readability, accuracy, and completeness, yielding 78.55% qualification rate with >95% inter-annotator agreement (Section 4.2)."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "For fine-tuning validation, a repository-level split (11 train, 1 test) is used to prevent data leakage (Section 4.2)."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results broken down by 6 task types (Dev, BugFix, TDD, Multi-Dev, Multi-BugFix, Multi-TDD) in Tables 2 and 3, plus difficulty subsets."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Multi-BugFix 0% AC@1 for all models is discussed as a failure mode. The paper analyzes that models 'strictly adhered to the prompt's sequence, ignoring dependency logic' (Section 4.3)."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that all models fail Multi-BugFix (0% AC@1), and discusses the 'performance collapse' and 'critical deficit in hierarchical reasoning' (Section 4.3)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims about 78.55% validity yield, capability misalignment, and ranking shifts are all supported by Tables 2-3, Figures 3-4, and Section 4.2."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims are primarily about ablations (mask length causes difficulty increase, IG filtering improves discrimination). These are supported by controlled single-variable manipulations in Sections 4.5 and Appendix C."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title claims 'Code Intelligence' broadly but results are exclusively on Python repositories. The Limitations section acknowledges this ('CoreCodeBench focuses exclusively on Python') but the title and abstract do not bound the claims to Python."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper considers that capability misalignment could be due to generator bias (Appendix B rules this out with ρ=1.0), and discusses that Multi-BugFix failure may stem from sequential prompting rather than fundamental inability (Section 4.3)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Models are referred to by marketing names only: 'GPT-5.2', 'Claude-4.5-Opus', 'Gemini-3-Pro', etc. No API versions, snapshot dates, or specific model IDs are provided."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper references 'detailed implementation settings in Appendix I' and 'prompts in Appendix G' but the extracted text does not include the full prompt text. Prompt templates with placeholders may be in the appendix but the actual fill values are not shown."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper states 'zero-shot setting using greedy decoding' (Section 4.1), which implies temperature=0. Implementation settings referenced in Appendix I."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. Models are evaluated in a single-turn zero-shot setting."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The full pipeline is documented: repository selection criteria (Section 2.1), context extraction, core code identification with masking, IG filtering (Appendix C retains 48.56%), and validity checking via unit tests."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "A dedicated 'Limitations' section appears after the Conclusion, discussing Python-only scope, unit test dependency, and Multi-BugFix size."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Specific threats discussed: dependency on high-coverage unit tests biases toward well-maintained projects, Multi-BugFix has only 10 instances with 'inherently lower statistical power', Python-only scope limits cross-lingual claims."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Limitations section explicitly states: 'CoreCodeBench focuses exclusively on Python; extending support to other languages remains a critical direction.' Also acknowledges repositories with sparse tests cannot be processed."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Dataset released on HuggingFace and code on GitHub, allowing independent verification of the benchmark tasks."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Repository selection criteria (activeness, test coverage >15%, >5000 LOC) detailed in Section 2.1, with full repository metadata in Table 4 (Appendix A)."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants recruited for the study. The human inspection uses 'experienced engineers' but this is benchmark validation, not a human subjects study. Data source is public PyPI repositories."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Full pipeline documented across Sections 2.1-2.3: repository selection → context extraction → atomic task generation → IG filtering (48.56% retained) → composite task generation. Counts provided at each stage."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section found in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations clearly stated: Shanghai Jiao Tong University and Meituan. Internship relationship disclosed ('Work done while interning at Meituan')."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "Meituan is a tech company that uses LLMs, and some authors are Meituan employees. Because the paper contains no funding disclosure, funder independence cannot be assessed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement found in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No training data cutoff dates stated for any of the 9 evaluated models."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "The paper explicitly addresses contamination as a core motivation. The pipeline applies 'fine-grained transformations that alter the code's surface form and structural context' to prevent LLMs from 'simply recalling exact solutions from pre-training data' (Section 2.2). The benchmark is designed to be contamination-resilient."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "Contamination resilience is a central design goal. The paper argues that code transformations (masking, bug injection, description generation) alter surface form enough to prevent memorization. However, the source repositories (e.g., transformers, langchain) are widely known and likely in training data — the paper's argument is that the tasks are novel transformations of this code."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants study. The manual inspection is benchmark quality validation, not a human subjects experiment."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost, API spend, or tokens consumed reported for the evaluations across 9 LLMs on 1,524 tasks."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total computational budget stated. The fine-tuning experiment and large-scale evaluations would require significant compute but this is not quantified."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CoreCodeBench achieves a 78.55% validity yield, significantly surpassing SWE-bench-Verified's 31.7% retention rate.",
    286       "evidence": "Section 4.2: Manual inspection of 360/511 development problems by experienced engineers with >95% inter-annotator agreement.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "Coding proficiency is non-monolithic: models exhibit significant capability misalignment across cognitive dimensions.",
    291       "evidence": "Tables 2-3 show distinct ranking shifts across Dev, BugFix, TDD tasks. Figure 4 shows IoU values below 0.7 even for top models. Kimi-K2 pass rate disparity exceeds 35%.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "All models fail Multi-BugFix (0% AC@1), exposing a critical deficit in hierarchical reasoning.",
    296       "evidence": "Table 3 shows 0.00% AC@1 for all 9 models on Multi-BugFix. Non-zero AC Rates indicate partial progress.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "CoreCodeBench captures evaluation dimensions distinct from existing benchmarks.",
    301       "evidence": "Section 4.6: Hierarchical clustering (Figure 7b) shows CoreCodeBench forms a separate cluster from 8 external benchmarks. Spearman correlations with external benchmarks are generally low.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "COREPIPE's evaluation results are invariant to generator backbone choice (Spearman ρ = 1.0).",
    306       "evidence": "Appendix B: Four generator backbones tested; model rankings remain identical across all configurations.",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "CoreCodeBench decomposes code intelligence into six task types (Dev, BugFix, TDD, and their multi-function composites) derived from the same code context, revealing that LLM coding proficiency is non-monolithic. Models show significant capability misalignment, with IoU overlap below 0.7 even for top models. All models achieve 0% AC@1 on Multi-BugFix, exposing a ceiling in multi-function debugging. The benchmark's controllable difficulty via mask length and dependency depth provides a mechanism to prevent performance saturation.",
    312   "red_flags": [
    313     {
    314       "flag": "Multi-BugFix extremely small sample",
    315       "detail": "The Multi-BugFix subset contains only 10 instances, making statistical conclusions about this category unreliable. The paper acknowledges this but still draws conclusions from these results."
    316     },
    317     {
    318       "flag": "No model version specificity",
    319       "detail": "All models referenced by marketing names only (GPT-5.2, Claude-4.5-Opus, etc.) without API versions or snapshot dates. Results may not be reproducible as model behavior changes across versions."
    320     },
    321     {
    322       "flag": "Single-run greedy evaluation",
    323       "detail": "All evaluations use single-run greedy decoding with no variance reporting. While greedy decoding is deterministic for a given model version, the single-run design prevents assessment of how sensitive the results are to sampling strategies."
    324     },
    325     {
    326       "flag": "Validity comparison with SWE-bench-Verified may be unfair",
    327       "detail": "The 78.55% vs 31.7% comparison conflates different quality standards and task types. SWE-bench-Verified filters real GitHub issues for solvability, a fundamentally different quality criterion than the readability/accuracy/completeness used here."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "Evaluating large language models trained on code",
    333       "authors": ["Mark Chen"],
    334       "year": 2021,
    335       "arxiv_id": "2107.03374",
    336       "relevance": "Foundational HumanEval benchmark for LLM code generation evaluation."
    337     },
    338     {
    339       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    340       "authors": ["Carlos E. Jimenez"],
    341       "year": 2024,
    342       "arxiv_id": "2310.06770",
    343       "relevance": "Major repository-level benchmark that CoreCodeBench positions itself against."
    344     },
    345     {
    346       "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions",
    347       "authors": ["Terry Yue Zhuo"],
    348       "year": 2025,
    349       "arxiv_id": "2406.15877",
    350       "relevance": "Contemporary code benchmark used in external validity comparison."
    351     },
    352     {
    353       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    354       "authors": ["Naman Jain"],
    355       "year": 2024,
    356       "arxiv_id": "2403.07974",
    357       "relevance": "Contamination-aware code benchmark used in external validity comparison."
    358     },
    359     {
    360       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    361       "authors": ["Jiawei Liu"],
    362       "year": 2023,
    363       "relevance": "HumanEvalPlus and MBPP-Plus benchmarks used in external validity analysis."
    364     },
    365     {
    366       "title": "TDD-bench verified: Can LLMs generate tests for issues before they get resolved?",
    367       "authors": ["Toufique Ahmed"],
    368       "year": 2024,
    369       "arxiv_id": "2412.02883",
    370       "relevance": "Test-driven development benchmark relevant to CoreCodeBench's TDD task type."
    371     },
    372     {
    373       "title": "Kimi K2: Open agentic intelligence",
    374       "authors": ["Kimi"],
    375       "year": 2025,
    376       "arxiv_id": "2507.20534",
    377       "relevance": "Open-weight agentic LLM evaluated in CoreCodeBench showing large capability misalignment."
    378     },
    379     {
    380       "title": "MultiPL-E: A scalable and extensible approach to benchmarking neural code generation",
    381       "authors": ["Federico Cassano"],
    382       "year": 2022,
    383       "arxiv_id": "2208.08227",
    384       "relevance": "Multi-language code benchmark used in external validity comparison."
    385     },
    386     {
    387       "title": "On the impacts of contexts on repository-level code generation",
    388       "authors": ["Nam Le Hai"],
    389       "year": 2025,
    390       "arxiv_id": "2406.11927",
    391       "relevance": "Studies context impact on repository-level code generation, directly relevant to CoreCodeBench's context-based task design."
    392     },
    393     {
    394       "title": "Qwen2.5-coder technical report",
    395       "authors": ["Binyuan Hui"],
    396       "year": 2024,
    397       "arxiv_id": "2409.12186",
    398       "relevance": "Technical report for Qwen coder models evaluated in the benchmark."
    399     }
    400   ]
    401 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs