scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (21285B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "DynaCode: A Dynamic Complexity-Aware Code Benchmark for Evaluating Large Language Models in Code Generation",
      6     "authors": [
      7       "Wenhao Hu",
      8       "Jinhao Duan",
      9       "Chunchen Wei",
     10       "Li Zhang",
     11       "Yue Zhang",
     12       "Kaidi Xu"
     13     ],
     14     "year": 2025,
     15     "venue": "Annual Meeting of the Association for Computational Linguistics",
     16     "arxiv_id": "2503.10452",
     17     "doi": "10.48550/arXiv.2503.10452"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract claims 'average performance drop of 16.8% to 45.7% compared to MBPP+' — Table 2 supports this across models. '189 million unique nested code problems' is supported by Table 1 (total 189,263,141). 'Performance progressively decreasing as complexity increases' is shown in Figure 3 and Table 2.",
     25         "source": "opus"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper claims 'DynaCode's dynamic evaluation strategy effectively mitigates data contamination' and 'LLMs struggle with parallel function dependencies.' These are causal claims. The fine-tuning experiment provides some evidence for the contamination claim but has confounds (different training set sizes, different data distributions). The complexity claims don't control for prompt length, which increases with call-graph complexity.",
     31         "source": "opus"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The title claims 'Evaluating Large Language Models in Code Generation' broadly, but the benchmark tests only Python, only function-level composition from MBPP+, with a maximum of 5 nodes in call graphs. The paper does not bound its claims to these constraints. The Limitations section mentions the 5-node limit but does not address the language or task-type restrictions.",
     37         "source": "opus"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper does not discuss alternative explanations for the observed performance drops. For example, longer prompts from nested problems could independently cause degradation. Performance differences across call-graph types could be due to prompt structure rather than genuine code understanding differences. No confounds are discussed.",
     43         "source": "opus"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper measures Pass@1 on code generation tasks and frames results in terms of Pass@1 scores and code generation capability. The claims largely match the granularity of the measurements — they don't inflate Pass@1 into broader 'programming ability' claims.",
     49         "source": "opus"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "A dedicated 'Limitations' section appears after the Conclusion, discussing the maximum node count of 5 in call graphs and future extensions to more complex structures.",
     57         "source": "opus"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The Limitations section only mentions the 5-node call-graph constraint. It does not discuss specific threats like prompt-length confounds, reliance on MBPP+ as the sole base problem source, or potential biases in cyclomatic complexity as the sole complexity metric.",
     63         "source": "opus"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper does not explicitly state what the results do NOT show. It doesn't note that results are limited to Python, to function-level composition, to a specific set of base problems, or that the complexity metric captures only one dimension of real-world code difficulty.",
     69         "source": "opus"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding source, acknowledgments section, or grant information is mentioned anywhere in the paper.",
     77         "source": "opus"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations are clearly listed: University of Electronic Science and Technology of China and Drexel University. The authors are not affiliated with any of the model vendors being evaluated.",
     83         "source": "opus"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No funding information is disclosed, so independence cannot be verified. The authors are academic researchers not affiliated with model vendors, but without a funding disclosure, this criterion is not satisfied.",
     89         "source": "opus"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests or financial interest declaration is present in the paper.",
     95         "source": "opus"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Cyclomatic complexity is defined with the formal formula (Eq. 1), call graphs are formally defined as directed acyclic graphs with specified properties, complexity units and levels are defined via threshold intervals (Eq. 2, 4), and the call-graph complexity metric M is explicitly stated (Eq. 3).",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 1 explicitly lists three contributions: the dynamic evaluation strategy, the complexity-aware metric combining code and call-graph complexity, and the DynaCode benchmark itself with multi-LLM evaluation.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 2 covers both dynamic evaluation methods (DyVal, NPHardEval, DyVal2, DARG) and static code benchmarks (HumanEval, MBPP, EvalPlus, BigCodeBench, SWE-Bench), explicitly contrasting DynaCode's approach against each category.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "benchmark-creation": {
    121       "construct_design": {
    122         "construct_validity_argued": {
    123           "applies": true,
    124           "answer": true,
    125           "justification": "The paper argues that cyclomatic complexity is chosen specifically because 'LLMs often struggle with code generation tasks involving complex control flow and branching logic,' and that call-graph nesting captures inter-function dependencies absent from isolated-function benchmarks.",
    126           "source": "haiku"
    127         },
    128         "difficulty_distribution_characterized": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "Table 1 reports problem counts across 4 complexity units and 4 graph levels; Table 2 and Figure 3 empirically confirm monotonic performance degradation across units, validating that the difficulty tiers separate as intended.",
    132           "source": "haiku"
    133         },
    134         "ceiling_floor_effects_checked": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "codegemma-7b-it averages 2.9% on DynaCode (near floor) and the paper does not flag or discuss this floor effect; no ceiling analysis is performed for top models.",
    138           "source": "haiku"
    139         },
    140         "human_baseline_included": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No human baseline performance is reported for any DynaCode problem set, leaving it unclear whether benchmark items are solvable by competent programmers and what a ceiling should look like.",
    144           "source": "haiku"
    145         },
    146         "scoring_rubric_justified": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Pass@1 is adopted with the sole justification of 'following previous work (Chen et al., 2021)'; no argument is made for why Pass@1 is appropriate for multi-function nested code or how partial credit for individual subfunctions should be handled.",
    150           "source": "haiku"
    151         }
    152       },
    153       "robustness": {
    154         "contamination_resistance_designed": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "Dynamic generation of ~189M combinations and the demonstrated fine-tuning ablation (Figure 5) directly show that memorizing unit functions provides little benefit on nested variants, constituting an explicit contamination-resistance mechanism.",
    158           "source": "haiku"
    159         },
    160         "temporal_robustness_discussed": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "The paper notes active sourcing of new LeetCode problems to refresh the unit pool and explicitly states in the Limitations section that call-graph structures will be extended as models improve, constituting a rudimentary update plan.",
    164           "source": "haiku"
    165         },
    166         "failure_modes_discussed": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "The only failure mode acknowledged is the 5-node cap on call graphs; the paper does not discuss failure modes such as models gaming call-graph patterns, prompt length confounds, or the benchmark's unsuitability for multi-file or non-Python tasks.",
    170           "source": "haiku"
    171         },
    172         "baseline_implementations_provided": {
    173           "applies": true,
    174           "answer": true,
    175           "justification": "The abstract and Section 5 state that 'our benchmark and evaluation code are available at https://github.com/HWH-2000/DynaCode,' enabling reproduction of reported numbers.",
    176           "source": "haiku"
    177         }
    178       },
    179       "documentation": {
    180         "dataset_documentation_complete": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "The paper documents source data (MBPP+, LeetCode), construction methodology (complexity classification, call-graph construction, test-case generation), filtering steps (bad generation removal), and detailed statistics in Tables 1 and 5.",
    184           "source": "haiku"
    185         },
    186         "licensing_and_access_clear": {
    187           "applies": true,
    188           "answer": false,
    189           "justification": "A GitHub URL is provided but no license for DynaCode itself is stated; the LeetCode problems incorporated have restrictive terms of service that the paper does not address, leaving the legal status of redistribution unclear.",
    190           "source": "haiku"
    191         },
    192         "intended_use_specified": {
    193           "applies": true,
    194           "answer": false,
    195           "justification": "The paper does not state what DynaCode is NOT appropriate for (e.g., evaluating multi-file completion, non-Python languages, agents), nor does it warn against over-interpreting results from models already trained on MBPP+.",
    196           "source": "haiku"
    197         }
    198       }
    199     }
    200   },
    201   "claims": [
    202     {
    203       "claim": "DynaCode can generate up to 189 million unique nested code problems across 4 complexity units and 16 call-graph structures.",
    204       "evidence": "Table 1 and Table 5 provide detailed combinatorial counts summing to 189,263,141 problems.",
    205       "supported": "strong"
    206     },
    207     {
    208       "claim": "All 12 evaluated LLMs show average performance drops of 16.8% to 45.7% on DynaCode compared to MBPP+.",
    209       "evidence": "Table 2 reports Pass@1 scores for all 12 models on both MBPP+ and DynaCode; the ranges are directly computable from the data.",
    210       "supported": "strong"
    211     },
    212     {
    213       "claim": "Fine-tuning on MBPP+ causes large gains on MBPP+ but minimal gains on DynaCode, demonstrating DynaCode's contamination resistance.",
    214       "evidence": "Figure 5 shows GPT-3.5-Turbo improves from 69.7% to 88.6% on MBPP+ but only 32.6% to 36.0% on DynaCode after fine-tuning.",
    215       "supported": "moderate"
    216     },
    217     {
    218       "claim": "LLMs perform significantly better on sequential call graphs (G1–G4, G8) than on multi-branch structures (G9–G16).",
    219       "evidence": "Figure 6 shows this pattern for only 4 of the 12 evaluated models.",
    220       "supported": "moderate"
    221     },
    222     {
    223       "claim": "Problem Understanding errors in GPT-3.5-Turbo increase monotonically from 64.1% in Unit 1 to 88.8% in Unit 4 as complexity rises.",
    224       "evidence": "Table 3 reports the error distribution across 4 units for GPT-3.5-Turbo only; the section heading nevertheless generalizes the findings to 'LLMs'.",
    225       "supported": "weak"
    226     }
    227   ],
    228   "methodology_tags": [
    229     "benchmark-eval"
    230   ],
    231   "key_findings": "DynaCode dynamically generates ~189M nested Python code problems by combining existing unit functions via call-graph structures with 4 cyclomatic complexity tiers and 16 graph topologies, achieving massive diversity that resists memorization. Evaluation of 12 LLMs shows consistent performance degradation as complexity increases, with average drops of 16.8–45.7% versus the static MBPP+ benchmark. A controlled fine-tuning experiment demonstrates that memorizing unit functions provides minimal benefit on full DynaCode problems, empirically validating contamination resistance. LLMs systematically favor sequential call graphs over multi-branch structures, with error analysis on GPT-3.5-Turbo showing problem-understanding failures dominating at higher complexity levels.",
    232   "red_flags": [
    233     {
    234       "flag": "No human baseline",
    235       "detail": "No human performance data is provided for any complexity level; it is unknown whether the hardest problems are solvable by competent programmers or whether low model scores reflect task validity issues."
    236     },
    237     {
    238       "flag": "Floor effects uninvestigated",
    239       "detail": "codegemma-7b-it averages 2.9% on DynaCode and several models approach floor-level performance, but the paper does not flag or analyze floor effects."
    240     },
    241     {
    242       "flag": "Error analysis single-model overgeneralization",
    243       "detail": "Table 3 covers only GPT-3.5-Turbo, but the Section 4.3 headings are written as universal claims about 'LLMs' without noting that the findings come from a single model."
    244     },
    245     {
    246       "flag": "Call-graph complexity metric unjustified",
    247       "detail": "The metric M = Lmax × B × |E| (product of path length, branch count, and edge count) is proposed without theoretical or empirical justification for why the multiplicative form is appropriate versus, e.g., additive or weighted alternatives."
    248     },
    249     {
    250       "flag": "LeetCode ToS not addressed",
    251       "detail": "LeetCode problems incorporated into DynaCode have restrictive terms of service; the paper does not address whether these problems can be redistributed as part of the benchmark."
    252     },
    253     {
    254       "flag": "No funding disclosure",
    255       "detail": "The paper contains no acknowledgments section or funding disclosure despite multiple institutional affiliations."
    256     },
    257     {
    258       "flag": "Thin limitations section",
    259       "detail": "The Limitations section is a single paragraph mentioning only the 5-node cap on call graphs, leaving major scope limitations (Python-only, function-level composition, single-attempt evaluation) unacknowledged."
    260     }
    261   ],
    262   "cited_papers": [
    263     {
    264       "title": "Evaluating large language models trained on code (HumanEval)",
    265       "relevance": "Foundational code benchmark and Pass@k metric that DynaCode builds upon and compares against"
    266     },
    267     {
    268       "title": "Program synthesis with large language models (MBPP)",
    269       "relevance": "Primary source of unit functions used in DynaCode; the MBPP+ variant is the main comparison baseline"
    270     },
    271     {
    272       "title": "Is your code generated by ChatGPT really correct? rigorous evaluation of LLMs for code generation (EvalPlus)",
    273       "relevance": "Produces MBPP+ (the enhanced unit function set) and the augmented test case methodology adopted here"
    274     },
    275     {
    276       "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions",
    277       "relevance": "Contemporary code benchmark positioned as a point of comparison in Table 6 on problem scale"
    278     },
    279     {
    280       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    281       "relevance": "Real-world code benchmark situating DynaCode within the landscape of increasingly complex code evaluation"
    282     },
    283     {
    284       "title": "DyVal: Graph-informed dynamic evaluation of large language models",
    285       "relevance": "Direct methodological precedent for graph-based dynamic evaluation; DynaCode extends this idea to code generation"
    286     },
    287     {
    288       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    289       "relevance": "Another contamination-aware code benchmark; directly relevant for situating DynaCode's anti-contamination claims"
    290     },
    291     {
    292       "title": "EvoEval: Evolving coding benchmarks via LLM",
    293       "relevance": "Dynamic benchmark creation via LLM transformation; contrasted with DynaCode's automated structural approach"
    294     }
    295   ],
    296   "engagement_factors": {
    297     "practical_relevance": {
    298       "score": 2,
    299       "justification": "Released benchmark tool that practitioners can use to evaluate LLMs on code generation, though limited to Python function composition."
    300     },
    301     "surprise_contrarian": {
    302       "score": 1,
    303       "justification": "Confirms the expected finding that LLMs struggle with increased complexity; the magnitude of performance drops (up to 45.7pp) is noteworthy but not fundamentally surprising."
    304     },
    305     "fear_safety": {
    306       "score": 0,
    307       "justification": "No AI safety or security concerns raised; purely a benchmark evaluation paper."
    308     },
    309     "drama_conflict": {
    310       "score": 1,
    311       "justification": "Shows popular benchmarks like MBPP are contaminated and unreliable, but data contamination in LLM benchmarks is already well-established."
    312     },
    313     "demo_ability": {
    314       "score": 2,
    315       "justification": "GitHub repository released with benchmark generation and evaluation code; users could run it on their own models."
    316     },
    317     "brand_recognition": {
    318       "score": 1,
    319       "justification": "Evaluates well-known models (GPT-4o, DeepSeek-V3, Llama-3) but paper is from UESTC/Drexel, not a major AI lab."
    320     }
    321   },
    322   "hn_data": {
    323     "threads": [
    324       {
    325         "hn_id": "44592304",
    326         "title": "Mixture-of-Recursions: Learning Adaptive Token-Level Computation",
    327         "points": 3,
    328         "comments": 0,
    329         "url": "https://news.ycombinator.com/item?id=44592304"
    330       },
    331       {
    332         "hn_id": "43288456",
    333         "title": "Computation-Aware ControlNet with Dynamic Router for Text-to-Image Generation",
    334         "points": 3,
    335         "comments": 0,
    336         "url": "https://news.ycombinator.com/item?id=43288456"
    337       },
    338       {
    339         "hn_id": "45328070",
    340         "title": "Why Johnny Cant Use Agents: Aspirations vs. Realities with AI Agents",
    341         "points": 2,
    342         "comments": 0,
    343         "url": "https://news.ycombinator.com/item?id=45328070"
    344       },
    345       {
    346         "hn_id": "35263649",
    347         "title": "A comprehensive capacity analysis of GPT-3 and GPT-3.5 models",
    348         "points": 2,
    349         "comments": 0,
    350         "url": "https://news.ycombinator.com/item?id=35263649"
    351       },
    352       {
    353         "hn_id": "26536350",
    354         "title": "Dynamic Kernel Matching for Non-Conforming Data: A Study of T-Cell Receptors",
    355         "points": 2,
    356         "comments": 0,
    357         "url": "https://news.ycombinator.com/item?id=26536350"
    358       },
    359       {
    360         "hn_id": "45467729",
    361         "title": "AegisShield: Democratizing Cyber Threat Modeling with Generative AI",
    362         "points": 1,
    363         "comments": 0,
    364         "url": "https://news.ycombinator.com/item?id=45467729"
    365       },
    366       {
    367         "hn_id": "44634645",
    368         "title": "Mixture-of-Recursions: Learning Dynamic Recursive Depths",
    369         "points": 1,
    370         "comments": 0,
    371         "url": "https://news.ycombinator.com/item?id=44634645"
    372       },
    373       {
    374         "hn_id": "44579442",
    375         "title": "Mixture-of-Recursions",
    376         "points": 1,
    377         "comments": 0,
    378         "url": "https://news.ycombinator.com/item?id=44579442"
    379       },
    380       {
    381         "hn_id": "44008034",
    382         "title": "Emotion-Sensitive Explanation Model",
    383         "points": 1,
    384         "comments": 0,
    385         "url": "https://news.ycombinator.com/item?id=44008034"
    386       },
    387       {
    388         "hn_id": "43349900",
    389         "title": "FlexControl: Dynamic Block Activation for Diffusion Models",
    390         "points": 1,
    391         "comments": 0,
    392         "url": "https://news.ycombinator.com/item?id=43349900"
    393       }
    394     ],
    395     "top_points": 3,
    396     "total_points": 17,
    397     "total_comments": 0
    398   }
    399 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs