scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (20341B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "Dynamic Benchmarking of Reasoning Capabilities in Code Large Language Models Under Data Contamination",
      6     "authors": [
      7       "Simin Chen",
      8       "Pranav Pusarla",
      9       "Baishakhi Ray"
     10     ],
     11     "year": 2025,
     12     "venue": "International Conference on Machine Learning",
     13     "arxiv_id": "2503.04149",
     14     "doi": "10.48550/arXiv.2503.04149"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "All major abstract claims are supported: contamination detection via controlled fine-tuning experiments (§4.2), semantic diversity via BLEU-4 and cosine similarity metrics (§4.4), and stable results across 10 runs (§4.5).",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The controlled contamination experiment (§4.2) intentionally fine-tunes models on leaked benchmark data at 0–100% rates, providing a reasonably controlled basis for the causal claim that DyCodeEval is resistant to contamination-inflated scores.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper evaluates only Python NL-to-code generation on two datasets (HumanEval, MBPP) but makes unqualified claims about 'Code LLMs' broadly; no scope boundary is stated for other languages, code tasks like completion or repair, or non-English prompts.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper briefly hypothesizes about the DeepSeek-Coder anomaly but does not discuss whether performance degradation on DyCodeEval for 'potentially contaminated' models could be explained by domain shift from the scenario rewriting or prompt sensitivity rather than contamination.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper explicitly proposes DyPass as a better proxy for reasoning capability versus memorization (§5), distinguishing what Pass@K measures from what DyPass@K measures under contamination conditions.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "Limitations are embedded in the conclusion (Section 6) — computational cost and verbose output — but there is no dedicated limitations or threats-to-validity section separate from the conclusion.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The paper mentions only operational limitations (cost, verbose output) but does not discuss validity threats such as whether fine-tuning contamination simulation reflects real pretraining contamination, or whether the two seed datasets are representative.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No explicit scope boundaries are stated: the paper does not clarify that results are restricted to Python NL-to-code tasks, or that DyCodeEval may not generalize to multilingual settings, code completion, or repair tasks.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Funding is disclosed in the acknowledgements: NSF grants CCF 2313055, CCF 2107405, CAREER 2025082, and FAI: 2040961.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors are affiliated with Columbia University's Department of Computer Science, disclosed in the paper header.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The funders appear to be NSF/government grants independent of the evaluation outcome; no commercial funder with a financial stake in the benchmark results is disclosed.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) is present in the paper; only NSF grants are acknowledged.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "The central term 'reasoning capability' — which DyCodeEval claims to measure — is never formally defined; the paper conflates reasoning with non-memorization without operationalizing what distinguishes the two.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 1 explicitly lists three contributions: novel problem characterization of static benchmark limitations, the DyCodeEval 4-agent methodology, and empirical findings on contamination resistance and diversity.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 2 systematically reviews three lines of contamination-free benchmarking and explicitly positions DyCodeEval against PPM (manual effort) and LiveCodeBench (semantic imbalance), showing how it addresses their specific limitations.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "benchmark-creation": {
    118       "construct_design": {
    119         "construct_validity_argued": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "The paper grounds construct validity in metamorphic testing principles: modifying complexity-unrelated context preserves canonical solutions and algorithmic complexity, so the benchmark measures the same underlying capability — a principled if not deeply formalized argument.",
    123           "source": "haiku"
    124         },
    125         "difficulty_distribution_characterized": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "The paper claims complexity-equivalence between seed and generated problems but does not empirically characterize or verify the difficulty distribution (easy/medium/hard tiers) of the generated benchmark items.",
    129           "source": "haiku"
    130         },
    131         "ceiling_floor_effects_checked": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "No explicit analysis of ceiling or floor effects is conducted; some models in Fig. 5 score very high on static benchmarks, but the paper does not investigate whether the dynamic benchmark resolves or merely shifts these effects.",
    135           "source": "haiku"
    136         },
    137         "human_baseline_included": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No human baseline for solving benchmark problems is provided; the human verification step (Appendix D) checks consistency of generated problems but does not measure human solve rates.",
    141           "source": "haiku"
    142         },
    143         "scoring_rubric_justified": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "Pass@K is formally defined (Equation 1) and the new DyPass@K metric is introduced and explicitly justified as expanding the input space beyond Pass@K to better distinguish reasoning from memorization under contamination (§5).",
    147           "source": "haiku"
    148         }
    149       },
    150       "robustness": {
    151         "contamination_resistance_designed": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "Contamination resistance is the core design principle: a 50×50 scenario-context space, dynamic randomness, and formal collision probability bounds (Theorems 3.1–3.3) collectively make identical problem regeneration extremely unlikely.",
    155           "source": "haiku"
    156         },
    157         "temporal_robustness_discussed": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "The paper does not discuss whether DyCodeEval will remain useful as models improve or as the scenario pool becomes known; no plan for updating scenarios, expanding coverage, or managing benchmark drift over time is provided.",
    161           "source": "haiku"
    162         },
    163         "failure_modes_discussed": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "Only superficial failure modes are mentioned (computational cost, verbose prompts); deeper failure modes such as models learning to perform context-stripping, scenario pool exhaustion, or LLM-generated problems sharing statistical patterns are not discussed.",
    167           "source": "haiku"
    168         },
    169         "baseline_implementations_provided": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "A project webpage is referenced in the abstract but no explicit code release or reproduction package is described in the paper; full reproduction requires API access to Claude-3.5-Sonnet and reimplementing the generation pipeline from the appendix prompts.",
    173           "source": "haiku"
    174         }
    175       },
    176       "documentation": {
    177         "dataset_documentation_complete": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "The seed datasets are described in Appendix B and generation prompts in Appendix C, but the generated dataset itself has no data card, and the scenario pool collection methodology is only partially described.",
    181           "source": "haiku"
    182         },
    183         "licensing_and_access_clear": {
    184           "applies": true,
    185           "answer": false,
    186           "justification": "No licensing or access terms for DyCodeEval or its generated datasets are stated; the project webpage is referenced but terms of use are not specified in the paper.",
    187           "source": "haiku"
    188         },
    189         "intended_use_specified": {
    190           "applies": true,
    191           "answer": false,
    192           "justification": "The Impact Statement discusses societal benefits but does not specify what should NOT be concluded from DyCodeEval results, such as inapplicability to non-Python tasks or non-NL-to-code settings.",
    193           "source": "haiku"
    194         }
    195       }
    196     }
    197   },
    198   "claims": [
    199     {
    200       "claim": "Static benchmarks create inflated Pass@1 scores for contaminated models, misrepresenting true reasoning capability",
    201       "evidence": "Controlled fine-tuning with 0–100% leaked data shows steadily increasing Pass@1 on contaminated benchmarks while performance on other benchmarks remains stable (Fig. 4)",
    202       "supported": "strong"
    203     },
    204     {
    205       "claim": "DyCodeEval prevents contaminated models from achieving artificially inflated benchmark scores",
    206       "evidence": "In §4.2, models fine-tuned on leaked data show minimal or no improvement on DyCodeEval-generated problems compared to uncontaminated baselines, unlike on static benchmarks",
    207       "supported": "strong"
    208     },
    209     {
    210       "claim": "QWEN2.5-CODER-7B is potentially contaminated on both HumanEval and MBPP",
    211       "evidence": "In Fig. 5, QWEN2.5-CODER-7B consistently falls outside the 95% CI of the in-the-wild regression area for both seed datasets",
    212       "supported": "moderate"
    213     },
    214     {
    215       "claim": "DyCodeEval generates semantically diverse problems while maintaining stable benchmarking results across runs",
    216       "evidence": "Table 1 shows low BLEU-4 (0.27/0.18) and cosine similarity (0.74/0.73) versus baselines; Fig. 6 shows low standard deviation across 10 independent runs",
    217       "supported": "strong"
    218     },
    219     {
    220       "claim": "DyPass@K better detects contamination than Pass@K by exposing memorization",
    221       "evidence": "Table 2: contaminated models show Pass@K inflated to 0.82–0.89 while DyPass@K stays at 0.13–0.17, versus uncontaminated models where both metrics align",
    222       "supported": "strong"
    223     },
    224     {
    225       "claim": "The probability of generating identical problems across DyCodeEval runs is negligibly low",
    226       "evidence": "Theorems 3.1–3.3 provide formal probability bounds under uniform sampling from a 50×50 scenario-context space with mathematical proofs in Appendix A",
    227       "supported": "strong"
    228     }
    229   ],
    230   "methodology_tags": [
    231     "benchmark-eval",
    232     "theoretical"
    233   ],
    234   "key_findings": "DyCodeEval is a 4-agent pipeline (Scenario Proposer, Context Generator, Prompt Rewriter, Validator) that generates semantically diverse but algorithmically equivalent programming problems by modifying complexity-unrelated context while preserving canonical solutions, using metamorphic testing principles to resist data contamination. Controlled contamination experiments show that static benchmarks inflate Pass@1 for models fine-tuned on leaked data, while DyCodeEval remains resistant. In-the-wild evaluation across 12+ models identifies QWEN2.5-CODER-7B as a regression outlier suggesting potential contamination on both HumanEval and MBPP. The new DyPass metric more faithfully measures reasoning capability under contamination than Pass@K by varying the prompt context rather than sampling multiple solutions.",
    235   "red_flags": [
    236     {
    237       "flag": "Contamination simulation mismatch",
    238       "detail": "The paper simulates contamination by fine-tuning on leaked benchmark data, whereas real-world contamination typically enters during large-scale pretraining; models may respond very differently to these two forms of exposure, and this validity threat is not discussed."
    239     },
    240     {
    241       "flag": "Reasoning undefined",
    242       "detail": "The central claim — that DyCodeEval measures 'reasoning capability' rather than memorization — relies on an undefined distinction; 'reasoning' is never operationalized, leaving the core theoretical contribution unverifiable."
    243     },
    244     {
    245       "flag": "Scope overclaim",
    246       "detail": "Evaluated only on Python NL-to-code on two benchmarks, but conclusions are framed as applying to 'Code LLMs' broadly without bounding results to this specific task type and language."
    247     },
    248     {
    249       "flag": "No human baseline",
    250       "detail": "No human performance baseline on benchmark problems is provided; it is unknown whether scenario-rewritten variants introduce unintended difficulty beyond what the metamorphic testing argument predicts."
    251     },
    252     {
    253       "flag": "QWEN contamination claim is weak",
    254       "detail": "Labeling QWEN2.5-CODER-7B as 'potentially contaminated' based solely on regression outlier status is a weak statistical claim; alternative explanations such as training methodology differences are not considered."
    255     },
    256     {
    257       "flag": "Reproducibility barrier",
    258       "detail": "Full reproduction requires API access to Claude-3.5-Sonnet for problem generation; no explicit code release is described in the paper, and the generated datasets are not shared as a static artifact."
    259     }
    260   ],
    261   "cited_papers": [
    262     {
    263       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    264       "relevance": "Foundational code generation benchmark used as the primary seed dataset for DyCodeEval"
    265     },
    266     {
    267       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    268       "relevance": "Key prior work on contamination-aware temporal benchmarking that DyCodeEval explicitly improves upon"
    269     },
    270     {
    271       "title": "PPM: Automated Generation of Diverse Programming Problems for Benchmarking Code Generation Models",
    272       "relevance": "Direct predecessor whose limitations DyCodeEval sets out to overcome, namely its manual operator definition and semantic imbalance"
    273     },
    274     {
    275       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of LLMs for Code Generation (EvalPlus)",
    276       "relevance": "Prior work identifying HumanEval's limited test coverage, relevant to the benchmark rigor discussion"
    277     },
    278     {
    279       "title": "DyVal: Dynamic Evaluation of Large Language Models for Reasoning Tasks",
    280       "relevance": "Prior dynamic evaluation approach using DAG structures — related methodology and conceptual predecessor"
    281     },
    282     {
    283       "title": "ReCode: Robustness Evaluation of Code Generation Models",
    284       "relevance": "Used as comparison baseline for mutation-based diversity methods in Table 1"
    285     },
    286     {
    287       "title": "Program Synthesis with Large Language Models (MBPP)",
    288       "relevance": "Second seed dataset used across all DyCodeEval experiments"
    289     },
    290     {
    291       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    292       "relevance": "Representative code LLM benchmark cited in the broader benchmarking landscape survey"
    293     }
    294   ],
    295   "engagement_factors": {
    296     "practical_relevance": {
    297       "score": 3,
    298       "justification": "Directly addresses data contamination in LLM evaluation — a critical problem for any team benchmarking or comparing code LLMs in practice."
    299     },
    300     "surprise_contrarian": {
    301       "score": 2,
    302       "justification": "Provides suggestive evidence that QWEN2.5-CODER-7B may be contaminated on standard benchmarks, implicitly challenging published benchmark results."
    303     },
    304     "fear_safety": {
    305       "score": 1,
    306       "justification": "Contaminated benchmarks give false confidence in AI capabilities, with mild but real implications for deployment decisions — not primarily a safety paper."
    307     },
    308     "drama_conflict": {
    309       "score": 2,
    310       "justification": "Implicitly accuses QWEN2.5-CODER-7B of potential benchmark contamination, which is a commercially and reputationally charged claim against a specific named model."
    311     },
    312     "demo_ability": {
    313       "score": 2,
    314       "justification": "Project webpage is provided and all prompts are fully documented in the appendix; practitioners could reproduce the pipeline with API access to Claude-3.5-Sonnet."
    315     },
    316     "brand_recognition": {
    317       "score": 1,
    318       "justification": "Columbia University is a reputable institution but not a top AI lab; the paper uses Claude models but is not authored by Anthropic, Google, or Meta."
    319     }
    320   },
    321   "hn_data": {
    322     "threads": [
    323       {
    324         "hn_id": "45537698",
    325         "title": "Virtual Memory for Real-time RISC-V systems using hPMP",
    326         "points": 22,
    327         "comments": 4,
    328         "url": "https://news.ycombinator.com/item?id=45537698"
    329       },
    330       {
    331         "hn_id": "45115249",
    332         "title": "When Do Consumers Lose from Variable Electricity Pricing?",
    333         "points": 3,
    334         "comments": 0,
    335         "url": "https://news.ycombinator.com/item?id=45115249"
    336       },
    337       {
    338         "hn_id": "42966672",
    339         "title": "Develop AI Agents for System Engineering in Factorio",
    340         "points": 3,
    341         "comments": 0,
    342         "url": "https://news.ycombinator.com/item?id=42966672"
    343       },
    344       {
    345         "hn_id": "26591284",
    346         "title": "The Unreasonable Ineffectiveness of Mathematics in Biology",
    347         "points": 3,
    348         "comments": 0,
    349         "url": "https://news.ycombinator.com/item?id=26591284"
    350       },
    351       {
    352         "hn_id": "45165536",
    353         "title": "How to Hack Transformers: Steering LLMs via Prompts, States, and Weight Edits",
    354         "points": 2,
    355         "comments": 1,
    356         "url": "https://news.ycombinator.com/item?id=45165536"
    357       },
    358       {
    359         "hn_id": "44798220",
    360         "title": "An Efficient End-to-End Dynamic Activation Framework for On-Device DNN Training",
    361         "points": 1,
    362         "comments": 0,
    363         "url": "https://news.ycombinator.com/item?id=44798220"
    364       },
    365       {
    366         "hn_id": "44030713",
    367         "title": "Cosmos: Predictable and Cost-Effective Adaptation of LLMs",
    368         "points": 1,
    369         "comments": 0,
    370         "url": "https://news.ycombinator.com/item?id=44030713"
    371       }
    372     ],
    373     "top_points": 22,
    374     "total_points": 35,
    375     "total_comments": 5
    376   }
    377 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs