scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (20354B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "Dynamic Benchmarking of Reasoning Capabilities in Code Large Language Models Under Data Contamination",
      6     "authors": [
      7       "Simin Chen",
      8       "Pranav Pusarla",
      9       "Baishakhi Ray"
     10     ],
     11     "year": 2025,
     12     "venue": "International Conference on Machine Learning",
     13     "arxiv_id": "2503.04149",
     14     "doi": "10.48550/arXiv.2503.04149"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The abstract's three main claims — effective assessment under contamination (§4.2), diverse problem generation (§4.4, Table 1), and robust/consistent benchmarking (§4.5, Fig. 6) — are all supported by corresponding experimental sections.",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The main causal claim ('data contamination creates a false sense of code reasoning capability') is supported by a controlled intervention design (§4.2) where the authors deliberately introduce contamination at varying levels and measure its effect on both static and dynamic benchmarks.",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The title claims results for 'Code Large Language Models' broadly, but experiments are limited to Python code generation on two benchmarks (HumanEval and MBPP). The contamination experiment uses only 3 small models (1B-3B parameters). No discussion of whether findings extend to other languages or larger models.",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper hypothesizes about the DEEPSEEK-CODER anomaly (§4.2) but does not systematically discuss alternative explanations for the main findings, such as whether benchmark difficulty differences could explain performance gaps, or whether the regression outlier detection could produce false positives.",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper measures Pass@1 on code generation benchmarks but repeatedly claims to measure 'reasoning capabilities.' Pass@1 on HumanEval/MBPP is a proxy for reasoning ability, but the paper does not acknowledge this gap or discuss what reasoning capability actually entails beyond benchmark performance.",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 6 (Conclusion) contains a substantive limitations paragraph identifying two specific limitations: (1) computational cost requiring large LLMs for high consistency rates, and (2) excessive information in generated prompts that may confuse readers.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "The limitations are specific to this study: the consistency rate drops from 95% to 83% with a smaller foundation LLM (§4.6), and generated questions sometimes contain excessive information. These are concrete, study-specific observations.",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The paper does not explicitly state what the results do NOT show. There is no discussion of limitations to specific languages (Python only), model sizes (only 1B-3B for contamination experiments), or task types (only NL-to-code generation).",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Acknowledgements section states: 'This work was supported in part by CCF 2313055, CCF 2107405, CAREER 2025082, and FAI: 2040961.'",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All authors are disclosed as being from Columbia University Department of Computer Science. They evaluate models from external companies (Meta, DeepSeek, Alibaba, Anthropic) and use Anthropic's Claude as their foundation model.",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Funding is from NSF grants (CCF, CAREER, FAI programs), which are government research grants independent of any LLM vendor's commercial interests.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial interests statement is present in the paper.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "'Reasoning capabilities' is central to the paper's title and claims but is never precisely defined or distinguished from memorization beyond qualitative framing; 'semantic equivalence' is used extensively but defined only through examples rather than formally.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 1 explicitly enumerates three contributions: novel problem characterization, new methodology design (DyCodeEval), and empirical findings; readers know exactly what the paper adds.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 2 systematically covers three research lines (benchmarking Code LLMs, contamination-free strategies, LLM-as-judge) and situates DyCodeEval explicitly against PPM, LiveCodeBench, DyVal, and ITD.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "benchmark-creation": {
    118       "construct_design": {
    119         "construct_validity_argued": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "Section 3.1 argues via metamorphic testing principles that modifying 'complexity-unrelated context description' while preserving 'complexity-related algorithmic abstraction' isolates reasoning from memorization; the theoretical grounding is explicit.",
    123           "source": "haiku"
    124         },
    125         "difficulty_distribution_characterized": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "The paper claims difficulty is preserved from seed datasets but provides no empirical analysis of the difficulty distribution of generated problems — no easy/medium/hard tiers and no difficulty measurements beyond aggregate Pass@1.",
    129           "source": "haiku"
    130         },
    131         "ceiling_floor_effects_checked": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "No explicit ceiling/floor analysis is performed on the generated benchmark; the stability experiment (Fig 6) shows consistent means but does not report whether score distributions cluster near 0 or 1.",
    135           "source": "haiku"
    136         },
    137         "human_baseline_included": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No human baseline is provided; the benchmark is evaluated only against LLMs with no measurement of human performance on either seed or generated problems.",
    141           "source": "haiku"
    142         },
    143         "scoring_rubric_justified": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "Section 5 introduces DyPass with explicit justification for its superiority over Pass@K for contamination detection; Tables 2–3 empirically validate the metric's discriminative ability.",
    147           "source": "haiku"
    148         }
    149       },
    150       "robustness": {
    151         "contamination_resistance_designed": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "Contamination resistance is the core design goal; dynamic generation with randomized scenario/context selection, theoretical collision bounds (Theorems 3.1–3.3), and a validation agent all explicitly target resistance to data leakage.",
    155           "source": "haiku"
    156         },
    157         "temporal_robustness_discussed": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "The paper does not discuss whether DyCodeEval could be gamed if future LLMs train on its outputs, whether the scenario pool will become exhausted, or whether there is a plan for benchmark updates over time.",
    161           "source": "haiku"
    162         },
    163         "failure_modes_discussed": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "Only engineering limitations are mentioned (compute cost, excessive information); no discussion of benchmark-level failure modes such as systematic validation agent errors, canonical solution mismatches after rewriting, or scenario pool exhaustion.",
    167           "source": "haiku"
    168         },
    169         "baseline_implementations_provided": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "Only a project webpage URL is mentioned; no explicit statement of code release, dataset availability, or reproducibility package is provided, making it unclear whether others can replicate reported numbers.",
    173           "source": "haiku"
    174         }
    175       },
    176       "documentation": {
    177         "dataset_documentation_complete": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "No data card or formal documentation of the DyCodeEval-generated benchmark is provided; Appendix B describes only the seed datasets (HumanEval, MBPP) and not the generated variants.",
    181           "source": "haiku"
    182         },
    183         "licensing_and_access_clear": {
    184           "applies": true,
    185           "answer": false,
    186           "justification": "No license or access terms are specified; only a project webpage URL is mentioned without terms of use or distribution rights.",
    187           "source": "haiku"
    188         },
    189         "intended_use_specified": {
    190           "applies": true,
    191           "answer": false,
    192           "justification": "The Impact Statement describes intended use in general terms but does not specify what should NOT be concluded from DyCodeEval results or explicit boundaries on valid use cases.",
    193           "source": "haiku"
    194         }
    195       }
    196     }
    197   },
    198   "claims": [
    199     {
    200       "claim": "Data contamination creates a false sense of code reasoning capability under static benchmarks, inflating Pass@1 scores",
    201       "evidence": "Section 4.2 shows models fine-tuned on leaked HumanEval or MBPP data achieve higher Pass@1 on contaminated benchmarks while performance on uncontaminated benchmarks remains stable across contamination levels",
    202       "supported": "strong"
    203     },
    204     {
    205       "claim": "DyCodeEval prevents contaminated models from achieving artificially inflated Pass@1 scores through dynamic generation",
    206       "evidence": "Fig 4 shows the dynamic benchmarking rows do not exhibit the Pass@1 inflation seen in static rows for all three contamination-tested models",
    207       "supported": "strong"
    208     },
    209     {
    210       "claim": "DyCodeEval generates more syntactically and semantically diverse problems than existing mutation-based approaches",
    211       "evidence": "Table 1 shows DyCodeEval achieves substantially lower BLEU-4 (0.27/0.18 internal, 0.17/0.02 external) and semantic similarity compared to all baselines including PPM and character/token mutations",
    212       "supported": "strong"
    213     },
    214     {
    215       "claim": "DyCodeEval produces stable benchmarking results despite randomness",
    216       "evidence": "Fig 6 shows 10-run experiment with minimal variance in Pass@1 scores, though numeric standard deviations are not explicitly tabulated",
    217       "supported": "moderate"
    218     },
    219     {
    220       "claim": "QWEN2.5-CODER-7B is potentially contaminated on both HumanEval and MBPP",
    221       "evidence": "Fig 5 shows QWEN2.5-CODER-7B consistently falls outside the 95% confidence interval of the in-the-wild model regression area on both datasets",
    222       "supported": "moderate"
    223     },
    224     {
    225       "claim": "Using CLAUDE-3.5-HAIKU as the foundation model reduces the problem consistency rate from 95% to 83%",
    226       "evidence": "Section 4.6 reports manual sampling assessment of generated problems from each model, but sample size and methodology are not fully detailed",
    227       "supported": "weak"
    228     }
    229   ],
    230   "methodology_tags": [
    231     "benchmark-eval",
    232     "theoretical"
    233   ],
    234   "key_findings": "DyCodeEval combats data contamination in Code LLM benchmarking by dynamically generating semantically equivalent yet contextually varied Python programming problems using four LLM agents (Scenario Proposer, Context Generator, Prompt Rewriter, Validator) grounded in metamorphic testing principles. Controlled contamination experiments show that models fine-tuned on leaked benchmark data achieve inflated Pass@1 scores on static benchmarks but not on DyCodeEval-generated variants. Generated problems are substantially more diverse than mutation-based baselines (BLEU-4 as low as 0.02 vs 0.57–1.00 for alternatives). QWEN2.5-CODER-7B was flagged as likely contaminated on both HumanEval and MBPP based on its outlier position in regression analysis of 15 in-the-wild models.",
    235   "red_flags": [
    236     {
    237       "flag": "No human baseline",
    238       "detail": "No human performance data is provided to validate that generated problems preserve difficulty; claims about complexity equivalence rely entirely on the LLM validator and a small human check of 30 pairs per dataset (60 total)."
    239     },
    240     {
    241       "flag": "Contamination study uses only tiny models",
    242       "detail": "The controlled contamination experiment (Section 4.2) uses only three small models (≤3B parameters); it is unclear if the findings generalize to larger frontier models or instruction-tuned variants."
    243     },
    244     {
    245       "flag": "Human verification sample too small",
    246       "detail": "Only 30 pairs per dataset (60 total) were reviewed by two graduate students; the 95% agreement rate is based on this small sample with no inter-rater reliability statistics reported."
    247     },
    248     {
    249       "flag": "Generator model dependency",
    250       "detail": "DyCodeEval relies on CLAUDE-3.5-SONNET for generation; future API changes or model updates would make exact reproduction impossible, and the same model is also evaluated as a benchmark subject (conflict of interest)."
    251     },
    252     {
    253       "flag": "No confirmed code or data release",
    254       "detail": "Only a project webpage URL is mentioned; no explicit code repository, dataset download, or API access is stated, making reproducibility of reported numbers unclear."
    255     },
    256     {
    257       "flag": "Python-only scope not declared",
    258       "detail": "All evaluation uses Python-only benchmarks (HumanEval: 164 problems, MBPP: 427 problems), but claims are made about 'Code LLMs' generally without explicitly bounding the scope."
    259     }
    260   ],
    261   "cited_papers": [
    262     {
    263       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    264       "relevance": "Primary seed dataset; foundational Python code generation benchmark used as input to DyCodeEval"
    265     },
    266     {
    267       "title": "Program Synthesis with Large Language Models (MBPP)",
    268       "relevance": "Second seed dataset; crowdsourced Python programming benchmark used for evaluation"
    269     },
    270     {
    271       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    272       "relevance": "Direct prior work on contamination-free benchmarking via temporal splits from online platforms; key baseline comparison"
    273     },
    274     {
    275       "title": "PPM: Automated Generation of Diverse Programming Problems for Benchmarking Code Generation Models",
    276       "relevance": "Direct baseline using manually-defined operators to generate problem variants; DyCodeEval outperforms on diversity metrics"
    277     },
    278     {
    279       "title": "DyVal: Dynamic Evaluation of Large Language Models for Reasoning Tasks",
    280       "relevance": "Related dynamic benchmarking approach using DAG structures; conceptual predecessor to DyCodeEval's dynamic evaluation framework"
    281     },
    282     {
    283       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation (EvalPlus)",
    284       "relevance": "Addresses HumanEval's limited test case coverage; motivates need for rigorous benchmark design"
    285     },
    286     {
    287       "title": "Recent Advances in Large Language Model Benchmarks Against Data Contamination: From Static to Dynamic Evaluation",
    288       "relevance": "Survey of contamination-free benchmarking landscape that directly contextualizes DyCodeEval's contribution"
    289     },
    290     {
    291       "title": "Generalization or Memorization: Data Contamination and Trustworthy Evaluation for Large Language Models",
    292       "relevance": "Evidence that contamination is already present in current LLMs; motivates DyCodeEval's approach"
    293     }
    294   ],
    295   "engagement_factors": {
    296     "practical_relevance": {
    297       "score": 2,
    298       "justification": "DyCodeEval provides a usable framework for contamination-resistant benchmarking, though setup requires an LLM agent pipeline."
    299     },
    300     "surprise_contrarian": {
    301       "score": 1,
    302       "justification": "Data contamination in benchmarks is a known concern; the specific finding about QWEN2.5-CODER-7B adds a mildly surprising result."
    303     },
    304     "fear_safety": {
    305       "score": 1,
    306       "justification": "Raises concerns about unreliable LLM evaluations due to contamination but does not demonstrate novel attacks or safety threats."
    307     },
    308     "drama_conflict": {
    309       "score": 2,
    310       "justification": "Publicly flagging QWEN2.5-CODER-7B as potentially contaminated and arguing that 'static benchmarks create a false sense of accuracy' has moderate controversy potential."
    311     },
    312     "demo_ability": {
    313       "score": 1,
    314       "justification": "A project webpage exists but no readily installable tool or live demo is provided in the paper."
    315     },
    316     "brand_recognition": {
    317       "score": 1,
    318       "justification": "Columbia University authors, published at ICML. Evaluates well-known models but is not from a major AI lab."
    319     }
    320   },
    321   "hn_data": {
    322     "threads": [
    323       {
    324         "hn_id": "45537698",
    325         "title": "Virtual Memory for Real-time RISC-V systems using hPMP",
    326         "points": 22,
    327         "comments": 4,
    328         "url": "https://news.ycombinator.com/item?id=45537698"
    329       },
    330       {
    331         "hn_id": "45115249",
    332         "title": "When Do Consumers Lose from Variable Electricity Pricing?",
    333         "points": 3,
    334         "comments": 0,
    335         "url": "https://news.ycombinator.com/item?id=45115249"
    336       },
    337       {
    338         "hn_id": "42966672",
    339         "title": "Develop AI Agents for System Engineering in Factorio",
    340         "points": 3,
    341         "comments": 0,
    342         "url": "https://news.ycombinator.com/item?id=42966672"
    343       },
    344       {
    345         "hn_id": "26591284",
    346         "title": "The Unreasonable Ineffectiveness of Mathematics in Biology",
    347         "points": 3,
    348         "comments": 0,
    349         "url": "https://news.ycombinator.com/item?id=26591284"
    350       },
    351       {
    352         "hn_id": "45165536",
    353         "title": "How to Hack Transformers: Steering LLMs via Prompts, States, and Weight Edits",
    354         "points": 2,
    355         "comments": 1,
    356         "url": "https://news.ycombinator.com/item?id=45165536"
    357       },
    358       {
    359         "hn_id": "44798220",
    360         "title": "An Efficient End-to-End Dynamic Activation Framework for On-Device DNN Training",
    361         "points": 1,
    362         "comments": 0,
    363         "url": "https://news.ycombinator.com/item?id=44798220"
    364       },
    365       {
    366         "hn_id": "44030713",
    367         "title": "Cosmos: Predictable and Cost-Effective Adaptation of LLMs",
    368         "points": 1,
    369         "comments": 0,
    370         "url": "https://news.ycombinator.com/item?id=44030713"
    371       }
    372     ],
    373     "top_points": 22,
    374     "total_points": 35,
    375     "total_comments": 5
    376   }
    377 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs