scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (21937B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "DynaCode: A Dynamic Complexity-Aware Code Benchmark for Evaluating Large Language Models in Code Generation",
      6     "authors": [
      7       "Wenhao Hu",
      8       "Jinhao Duan",
      9       "Chunchen Wei",
     10       "Li Zhang",
     11       "Yue Zhang",
     12       "Kaidi Xu"
     13     ],
     14     "year": 2025,
     15     "venue": "Annual Meeting of the Association for Computational Linguistics",
     16     "arxiv_id": "2503.10452",
     17     "doi": "10.48550/arXiv.2503.10452"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract's core claims — 189M unique problems, 16.8–45.7% average performance drop vs MBPP+, and contamination resistance — are all verified by Table 1 (combinatorial problem counts), Table 2 (performance results per model), and the fine-tuning experiment in Figure 5.",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper claims DynaCode 'limits memorization' and that performance drops are caused by complexity, but the fine-tuning experiment cannot separate 'harder to memorize' from 'harder in general' — both predict smaller gains on DynaCode after fine-tuning. No control isolates these confounds.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Claims like 'LLMs are good at sequential execution' and 'LLMs struggle with problem understanding as complexity increases' are stated as general conclusions about LLMs, but the error analysis covers only GPT-3.5-Turbo and the call-graph analysis covers only 4 of 12 models.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "Performance drops with complexity could be attributable to longer prompt length, larger number of required functions, or type-alignment constraints in call-graph construction — none of these alternative explanations are discussed.",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "Pass@1 measures whether generated code passes execution-based test cases; the paper consistently uses this to claim evaluation of 'code generation capability,' which is a reasonable direct measurement rather than a proxy.",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "A dedicated Limitations section appears at the end of the paper, discussing maximum node count of 5 in call graphs and plans for future extension to more complex structures.",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The limitations section only notes 'maximum node count of 5' without discussing validity threats such as: cyclomatic complexity as a difficulty proxy, quality of base problems from MBPP+/LeetCode, Python-only scope, or type-alignment sampling bias. Too thin to count as specific threats-to-validity.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper does not explicitly state scope boundaries — e.g., that results apply only to Python code generation, only to function-level tasks, or that findings should not be generalized to repository-level or multi-file generation tasks.",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No acknowledgment or funding section appears anywhere in the paper, so no funding is disclosed.",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Authors' affiliations are clearly stated: University of Electronic Science and Technology of China (Hu, Wei) and Drexel University (Duan, L. Zhang, Y. Zhang, Xu).",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No funding is disclosed, so this criterion does not apply.",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests statement, no patent or equity declarations appear anywhere in the paper.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Key terms are formally defined: cyclomatic complexity with equation (1), call graph as a directed acyclic graph with specified properties, 'unit' and 'level' with explicit threshold formulas (equations 2–4), and 'complexity matrix' (equation 5).",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Three explicit contributions are enumerated in a bullet list at the end of the introduction: a dynamic evaluation strategy, complexity-aware metrics combining code and graph complexity, and the DynaCode benchmark with multi-LLM evaluation.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 2 covers related work on dynamic evaluation (DyVal, NPHardEval, DARG, Benchmark Self-Evolving) and coding benchmarks (HumanEval, MBPP, BigCodeBench, SWE-Bench), explicitly positioning DynaCode against each and explaining its differences.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "benchmark-creation": {
    121       "construct_design": {
    122         "construct_validity_argued": {
    123           "applies": true,
    124           "answer": true,
    125           "justification": "The paper argues that cyclomatic complexity captures control-flow branching that LLMs demonstrably struggle with (citing Jiang et al., 2025; Beger and Dutta, 2025) and that call-graph structures test inter-function dependency handling — both are linked to identified LLM failure modes.",
    126           "source": "haiku"
    127         },
    128         "difficulty_distribution_characterized": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "Table 1 shows problem distribution across 4 complexity units and 4 graph levels. Figure 10 validates that Pass@1 scores degrade systematically with both unit and level complexity across all 12 models, and Figure 8 shows examples of increasing problem complexity.",
    132           "source": "haiku"
    133         },
    134         "ceiling_floor_effects_checked": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "The paper does not explicitly check for ceiling or floor effects. codegemma-7b-it scores 2.9% on DynaCode (near floor), and several weak models drop below 10% on harder units — potential floor effects for weak models are not discussed.",
    138           "source": "haiku"
    139         },
    140         "human_baseline_included": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No human performance baseline is provided anywhere in the paper. There is no discussion of how a human programmer would perform on DynaCode problems at any complexity level.",
    144           "source": "haiku"
    145         },
    146         "scoring_rubric_justified": {
    147           "applies": true,
    148           "answer": true,
    149           "justification": "Pass@1 is justified by reference to prior work (Chen et al., 2021) and its use enables consistent comparison with MBPP and MBPP+. Pass@3 results are also reported in Appendix D for robustness validation.",
    150           "source": "haiku"
    151         }
    152       },
    153       "robustness": {
    154         "contamination_resistance_designed": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "Contamination resistance is a central design goal: 189M unique combinations make memorization impractical, new LeetCode problems are actively sourced for continuous refresh, and the fine-tuning ablation (Figure 5) empirically tests and supports contamination resistance.",
    158           "source": "haiku"
    159         },
    160         "temporal_robustness_discussed": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "The Limitations section acknowledges that advanced LLMs may eventually handle current call-graph structures and mentions plans to extend to more complex structures. The dynamic generation mechanism (continually adding new base problems from the web) provides inherent temporal adaptability.",
    164           "source": "haiku"
    165         },
    166         "failure_modes_discussed": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "The paper does not discuss failure modes of the benchmark itself — e.g., type-alignment constraints that may bias which problems can be combined, Monkeytype annotation quality issues, or cases where dynamically combined prompts become semantically incoherent.",
    170           "source": "haiku"
    171         },
    172         "baseline_implementations_provided": {
    173           "applies": true,
    174           "answer": true,
    175           "justification": "Benchmark and evaluation code are publicly available at https://github.com/HWH-2000/DynaCode, as stated in the abstract and confirmed by the detailed appendix examples.",
    176           "source": "haiku"
    177         }
    178       },
    179       "documentation": {
    180         "dataset_documentation_complete": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Section 3 and appendices document sources (MBPP+, LeetCode), preprocessing pipeline (Monkeytype type annotation, bad-generation filtering), benchmark statistics (Tables 1, 5, 6), example prompts (Table 8), and all 16 call-graph structures (Figure 9).",
    184           "source": "haiku"
    185         },
    186         "licensing_and_access_clear": {
    187           "applies": true,
    188           "answer": false,
    189           "justification": "No licensing information is provided in the paper. The use of LeetCode problems raises unaddressed copyright concerns. Only a GitHub link is provided with no stated license terms for the benchmark or its components.",
    190           "source": "haiku"
    191         },
    192         "intended_use_specified": {
    193           "applies": true,
    194           "answer": false,
    195           "justification": "The paper states DynaCode is for 'evaluating LLMs on code generation tasks' but does not specify what should NOT be concluded — e.g., that high DynaCode scores don't imply real-world coding capability or multi-file task competence.",
    196           "source": "haiku"
    197         }
    198       }
    199     }
    200   },
    201   "claims": [
    202     {
    203       "claim": "DynaCode generates up to 189 million unique nested code problems across 4 complexity units and 16 call-graph structures",
    204       "evidence": "Table 1 shows combinatorial counts per unit/level; Table 5 in the appendix provides per-graph-type breakdowns summing to 189,263,141 total problems",
    205       "supported": "strong"
    206     },
    207     {
    208       "claim": "Evaluated LLMs show an average performance drop of 16.8 to 45.7 percentage points on DynaCode compared to MBPP+",
    209       "evidence": "Table 2: GPT-4o drops from 72.2% (MBPP+) to 55.4% (DynaCode) = 16.8pp; Meta-Llama-3.1-8B drops from 55.6% to 9.9% = 45.7pp, matching the stated range exactly",
    210       "supported": "strong"
    211     },
    212     {
    213       "claim": "DynaCode resists memorization: fine-tuned models show large gains on MBPP+ but minimal gains on DynaCode",
    214       "evidence": "Figure 5: GPT-3.5-Turbo fine-tuned on MBPP+ jumps 69.7%→88.6% on MBPP+ but only 32.6%→36.0% on DynaCode; Meta-Llama-3.1-8B jumps 55.6%→98.1% on MBPP+ but only 10.6%→23.6% on DynaCode",
    215       "supported": "moderate"
    216     },
    217     {
    218       "claim": "LLMs perform significantly better on sequential call graphs (G1–G4, G8) than multi-branch complex call graphs (G9–G16)",
    219       "evidence": "Figure 6 shows consistently higher Pass@1 on sequential graphs vs. complex branches across all 4 tested models (GPT-4o, GPT-3.5-Turbo, WizardLM-2-8x22B, Meta-Llama-3.1-405B)",
    220       "supported": "strong"
    221     },
    222     {
    223       "claim": "Problem Understanding error rate increases from 64.1% (Unit 1) to 88.8% (Unit 4) as complexity rises",
    224       "evidence": "Table 3 reports error categorization for GPT-3.5-Turbo only, based on 100 questions per call graph; generalizing to 'LLMs' is unsupported by this single-model analysis",
    225       "supported": "moderate"
    226     },
    227     {
    228       "claim": "Models with known data contamination (Meta-Llama-3-8B-Instruct) show disproportionately large performance drops on DynaCode",
    229       "evidence": "Figure 1 shows Meta-Llama-3-8B drops from 64.6% (MBPP) to 8.4% (DynaCode) while DeepSeek-V3 drops from 87.6% to 52.1%; only 2 models shown, contamination status asserted not proven",
    230       "supported": "weak"
    231     }
    232   ],
    233   "methodology_tags": [
    234     "benchmark-eval"
    235   ],
    236   "key_findings": "DynaCode generates 189M unique Python code generation problems by combining base problems from MBPP+ and LeetCode via directed acyclic call-graph structures, classifying problems by cyclomatic complexity (4 units) and graph topology (4 levels). Evaluated on 12 LLMs, all models show systematic performance degradation as both code and graph complexity increase, with average drops of 16.8–45.7pp versus MBPP+. Fine-tuning experiments provide evidence that DynaCode is more resistant to memorization than static benchmarks, as models that achieve near-perfect scores on MBPP+ after fine-tuning show only marginal improvements on DynaCode. Error analysis on GPT-3.5-Turbo reveals that Problem Understanding failures (not syntax or context errors) become dominant at higher complexity levels, growing from 64.1% to 88.8% of errors across units.",
    237   "red_flags": [
    238     {
    239       "flag": "No human baseline",
    240       "detail": "No human programmer performance is reported at any complexity level, making it impossible to assess whether DynaCode's harder units discriminate meaningful capability differences or represent poorly constructed tasks beyond reasonable human ability."
    241     },
    242     {
    243       "flag": "Causal contamination claim confounded",
    244       "detail": "The fine-tuning experiment cannot separate 'DynaCode resists memorization' from 'DynaCode is harder regardless of memorization' — both hypotheses predict identical patterns of smaller gains after fine-tuning on DynaCode vs. MBPP+."
    245     },
    246     {
    247       "flag": "Error analysis on single model only",
    248       "detail": "Table 3's error distribution analysis covers only GPT-3.5-Turbo; generalizing conclusions about error types across 'LLMs' broadly from one model is unsupported."
    249     },
    250     {
    251       "flag": "LeetCode licensing unaddressed",
    252       "detail": "The benchmark incorporates LeetCode problems and their official test cases without any discussion of copyright, licensing, or terms of use — this could limit legal reproducibility and distribution of the dataset."
    253     },
    254     {
    255       "flag": "Python-only scope never stated",
    256       "detail": "All benchmark problems are in Python, but this critical scope limitation is never explicitly called out; conclusions about LLM code generation capabilities cannot be assumed to transfer to other programming languages."
    257     },
    258     {
    259       "flag": "Type-alignment constraint biases problem selection",
    260       "detail": "Call-graph construction requires output types of parent functions to match input types of children — this constraint silently filters the combinatorial space and may systematically favor functions with common return types, biasing difficulty distribution."
    261     }
    262   ],
    263   "cited_papers": [
    264     {
    265       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    266       "relevance": "Foundational code generation benchmark and source of Pass@k metric adopted in DynaCode; primary baseline for comparison"
    267     },
    268     {
    269       "title": "Program Synthesis with Large Language Models (MBPP)",
    270       "relevance": "Primary source dataset for DynaCode unit functions; main static benchmark comparison throughout"
    271     },
    272     {
    273       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation (EvalPlus/MBPP+)",
    274       "relevance": "MBPP+ (EvalPlus-processed) is the direct unit function source for DynaCode; provides enhanced test cases"
    275     },
    276     {
    277       "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
    278       "relevance": "Contemporary benchmark representing state-of-the-art that DynaCode is positioned against in Table 6"
    279     },
    280     {
    281       "title": "DyVal: Graph-Informed Dynamic Evaluation of Large Language Models",
    282       "relevance": "Prior graph-based dynamic evaluation work; DynaCode extends this approach to the code generation domain"
    283     },
    284     {
    285       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    286       "relevance": "Alternative contamination-resistant benchmark that uses temporal splits rather than dynamic generation"
    287     },
    288     {
    289       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    290       "relevance": "Representative real-world code benchmark discussed as complementary, targeting repository-level tasks vs. DynaCode's function-level focus"
    291     },
    292     {
    293       "title": "DyVal 2: Dynamic Evaluation of Large Language Models by Meta Probing Agents",
    294       "relevance": "Related LLM-agent-driven dynamic evaluation approach that DynaCode explicitly distinguishes from by avoiding LLM-as-evaluator instability"
    295     }
    296   ],
    297   "engagement_factors": {
    298     "practical_relevance": {
    299       "score": 2,
    300       "justification": "Practitioners evaluating LLMs for code generation tasks can use DynaCode to stress-test models under controlled complexity, though the Python-only scope and pipeline setup overhead limit immediate broad applicability."
    301     },
    302     "surprise_contrarian": {
    303       "score": 2,
    304       "justification": "The magnitude of performance drops (up to 45.7pp below MBPP+ scores) is striking and challenges confidence in existing benchmark scores as reliable indicators of real-world coding capability."
    305     },
    306     "fear_safety": {
    307       "score": 0,
    308       "justification": "No AI safety or risk concerns are raised; the paper is purely about benchmark methodology and evaluation."
    309     },
    310     "drama_conflict": {
    311       "score": 1,
    312       "justification": "Implicit challenge to the validity of widely-used static benchmarks (HumanEval, MBPP) as reliable evaluation tools, framed constructively rather than controversially."
    313     },
    314     "demo_ability": {
    315       "score": 2,
    316       "justification": "Code is publicly available on GitHub and the dynamic generation pipeline allows others to generate benchmarks and run evaluations immediately on their own models."
    317     },
    318     "brand_recognition": {
    319       "score": 1,
    320       "justification": "Authors from Drexel University and UESTC — credible academic institutions but no major AI lab (OpenAI, Google, Meta, DeepMind) affiliation to drive recognition."
    321     }
    322   },
    323   "hn_data": {
    324     "threads": [
    325       {
    326         "hn_id": "44592304",
    327         "title": "Mixture-of-Recursions: Learning Adaptive Token-Level Computation",
    328         "points": 3,
    329         "comments": 0,
    330         "url": "https://news.ycombinator.com/item?id=44592304"
    331       },
    332       {
    333         "hn_id": "43288456",
    334         "title": "Computation-Aware ControlNet with Dynamic Router for Text-to-Image Generation",
    335         "points": 3,
    336         "comments": 0,
    337         "url": "https://news.ycombinator.com/item?id=43288456"
    338       },
    339       {
    340         "hn_id": "45328070",
    341         "title": "Why Johnny Cant Use Agents: Aspirations vs. Realities with AI Agents",
    342         "points": 2,
    343         "comments": 0,
    344         "url": "https://news.ycombinator.com/item?id=45328070"
    345       },
    346       {
    347         "hn_id": "35263649",
    348         "title": "A comprehensive capacity analysis of GPT-3 and GPT-3.5 models",
    349         "points": 2,
    350         "comments": 0,
    351         "url": "https://news.ycombinator.com/item?id=35263649"
    352       },
    353       {
    354         "hn_id": "26536350",
    355         "title": "Dynamic Kernel Matching for Non-Conforming Data: A Study of T-Cell Receptors",
    356         "points": 2,
    357         "comments": 0,
    358         "url": "https://news.ycombinator.com/item?id=26536350"
    359       },
    360       {
    361         "hn_id": "45467729",
    362         "title": "AegisShield: Democratizing Cyber Threat Modeling with Generative AI",
    363         "points": 1,
    364         "comments": 0,
    365         "url": "https://news.ycombinator.com/item?id=45467729"
    366       },
    367       {
    368         "hn_id": "44634645",
    369         "title": "Mixture-of-Recursions: Learning Dynamic Recursive Depths",
    370         "points": 1,
    371         "comments": 0,
    372         "url": "https://news.ycombinator.com/item?id=44634645"
    373       },
    374       {
    375         "hn_id": "44579442",
    376         "title": "Mixture-of-Recursions",
    377         "points": 1,
    378         "comments": 0,
    379         "url": "https://news.ycombinator.com/item?id=44579442"
    380       },
    381       {
    382         "hn_id": "44008034",
    383         "title": "Emotion-Sensitive Explanation Model",
    384         "points": 1,
    385         "comments": 0,
    386         "url": "https://news.ycombinator.com/item?id=44008034"
    387       },
    388       {
    389         "hn_id": "43349900",
    390         "title": "FlexControl: Dynamic Block Activation for Diffusion Models",
    391         "points": 1,
    392         "comments": 0,
    393         "url": "https://news.ycombinator.com/item?id=43349900"
    394       }
    395     ],
    396     "top_points": 3,
    397     "total_points": 17,
    398     "total_comments": 0
    399   }
    400 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs