scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (23173B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "DOMAINEVAL: An Auto-Constructed Benchmark for Multi-Domain Code Generation",
      6     "authors": [
      7       "Qiming Zhu",
      8       "Jialun Cao",
      9       "Yaojie Lu",
     10       "Hongyu Lin",
     11       "Xianpei Han",
     12       "Le Sun",
     13       "Shing-Chi Cheung"
     14     ],
     15     "year": 2024,
     16     "venue": "AAAI Conference on Artificial Intelligence",
     17     "arxiv_id": "2408.13204",
     18     "doi": "10.48550/arXiv.2408.13204"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "All abstract claims are directly supported by experimental results in Table 1. The 68.94% gap (Llama-2-13b), domain distribution, and Pass@1→Pass@5 dynamics are explicitly reported.",
     26         "source": "haiku"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The claim 'generating more samples can increase overall performance' conflates correlation with causation. Pass@1 (greedy decoding) vs Pass@5 (sampling with temperature 0.2) are different metrics, so improvement may stem from sampling strategy, not sample count alone. No controlled ablation isolates sample quantity as the causal variable.",
     32         "source": "haiku"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Claims generalize from 12 tested models to 'LLMs' broadly (e.g., 'LLMs are generally good at computation'). No discussion of whether findings apply to other model families, instruction-tuning approaches, or code lengths outside the restricted 3-100 line range.",
     38         "source": "haiku"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "Two case studies explain specific failures (RSA square root overflow, Linux locale-sensitive command parsing) but no systematic discussion of why cryptography/system domains are hard. Paper attributes failures to 'lack of domain knowledge' without ruling out training data distribution, context length, or task complexity confounds.",
     44         "source": "haiku"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Pass@k (functional correctness via test suite execution) is a standard, well-justified proxy for code generation capability. The paper does not claim broader outcomes like 'production-readiness' or 'real-world utility.'",
     50         "source": "haiku"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No dedicated limitations or threats-to-validity section exists. The conclusion lists future work directions but not actual limitations of the benchmark, evaluation setup, or generalizability constraints.",
     58         "source": "haiku"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No specific threats are discussed. The paper describes security mitigations (banned keywords) but not threats like LLM instruction-generation bias (Qwen2-72B used), test suite comprehensiveness, domain classification reliability, or selection bias in GitHub repos.",
     64         "source": "haiku"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "Scope boundaries are implicit (Python only, 3-100 lines, 6 specific domains, 12 instruction-tuned models, GitHub repos with ≥100 stars) but never formally stated. No explicit statement of what the benchmark does NOT measure or which model/language families are out-of-scope.",
     70         "source": "haiku"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No funding source is disclosed. The paper neither states a funder nor declares itself unfunded independent work.",
     78         "source": "haiku"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "All author affiliations are clearly listed (Chinese Academy of Sciences, Hong Kong University of Science and Technology). No apparent conflicts with evaluated LLM companies.",
     84         "source": "haiku"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "No funder disclosed; independence cannot be assessed. Assumes unfunded if not stated.",
     90         "source": "haiku"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests or financial interests statement is provided. The leaderboard platform (domaineval.github.io) is mentioned with no disclosure of financial stakes.",
     96         "source": "haiku"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "Key terms like 'domain-specific code' are demonstrated via examples (Figure 1) and implicitly defined by the six domains, but not formally defined. 'Pass@k' is cited but not defined in the paper. 'Fully automated pipeline' is explained procedurally but not formally defined.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Three contributions are explicitly listed in the abstract: (1) the DOMAINEVAL dataset, (2) the fully automated test-guided construction pipeline, and (3) identification of LLM limitations in domain-specific code generation.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The related work section engages substantively with 10+ prior benchmarks (HumanEval, MBPP, APPS, CoderEval, etc.), explicitly contrasting DOMAINEVAL's design (fully automated, multi-domain, real code from GitHub) vs. prior approaches (manual curation, single-domain, synthetic tasks, API-centric).",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "benchmark-creation": {
    122       "construct_design": {
    123         "construct_validity_argued": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "The paper selects six domains referencing prior work (Zhuo et al. 2024) but does not argue why these domains collectively measure domain-specific code generation capability. A design goal (aligning with 'real-world code requirements') is stated, but no construct-validity justification is given.",
    127           "source": "haiku"
    128         },
    129         "difficulty_distribution_characterized": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "Figure 5 shows line count distribution by domain (4-198 lines, avg 55.69) and the paper restricts to 3-100 lines as 'appropriate difficulty,' but no explicit difficulty tiers (easy/medium/hard) are defined, measured, or validated via item-response analysis.",
    133           "source": "haiku"
    134         },
    135         "ceiling_floor_effects_checked": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "Table 1 shows ceiling effects (computation: 82-91% for top models) and floor effects (Llama-2-13b at 12% on cryptography), but these are observed, not discussed. No analysis of whether these ceiling/floor effects limit the benchmark's discriminability or whether a benchmark redesign is warranted.",
    139           "source": "haiku"
    140         },
    141         "human_baseline_included": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "No human baseline is provided. The paper reports only LLM performance without human reference data, making it impossible to assess whether 82% in computation or 33% in cryptography represents good or poor model capability.",
    145           "source": "haiku"
    146         },
    147         "scoring_rubric_justified": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Pass@k is cited as standard (Chen et al. 2021) but not justified relative to alternatives (e.g., code similarity, syntax, runtime safety metrics). Import auto-completion in evaluation is a scoring decision ('tolerable flaw') that is not debated—missing imports may be a legitimate failure mode.",
    151           "source": "haiku"
    152         }
    153       },
    154       "robustness": {
    155         "contamination_resistance_designed": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "The paper claims continuous updates via the automated pipeline resist data contamination (citing Cao et al. 2024b) but provides no explicit contamination-resistance design (e.g., temporal splits, canary strings, dynamic generation). GitHub repos are selected by star count with no date cutoff, risking overlap with pre-2024 training data.",
    159           "source": "haiku"
    160         },
    161         "temporal_robustness_discussed": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "No discussion of benchmark longevity, versioning strategy, gaming risk ('will LLMs memorize this in 6 months?'), or maintenance plan. The pipeline's scalability for future updates is mentioned, but no temporal-robustness strategy is described.",
    165           "source": "haiku"
    166         },
    167         "failure_modes_discussed": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "Two case studies (Figures 7-8) explain LLM failures, not benchmark failure modes. The paper does not discuss what the benchmark does NOT measure, biases in domain selection, gaps in test coverage, or how the benchmark could be misused.",
    171           "source": "haiku"
    172         },
    173         "baseline_implementations_provided": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "The leaderboard URL is provided, but the paper does not explicitly state whether code for the benchmark construction pipeline, evaluation harness, or baseline models is published or reproducible.",
    177           "source": "haiku"
    178         }
    179       },
    180       "documentation": {
    181         "dataset_documentation_complete": {
    182           "applies": true,
    183           "answer": false,
    184           "justification": "Source repositories are listed by domain (Figure 2) but not version-pinned or linked. Collection methodology (test-method matching) is explained procedurally but lacks data quality metrics (e.g., how many candidates filtered at each step?). No formal data card provided.",
    185           "source": "haiku"
    186         },
    187         "licensing_and_access_clear": {
    188           "applies": true,
    189           "answer": false,
    190           "justification": "The leaderboard (domaineval.github.io) is referenced but no license (MIT, Apache, CC-BY, etc.) is specified. Access terms are not stated—is the benchmark downloadable, or leaderboard-only evaluation?",
    191           "source": "haiku"
    192         },
    193         "intended_use_specified": {
    194           "applies": true,
    195           "answer": false,
    196           "justification": "General intended use is clear ('evaluate LLMs' domain-specific coding capabilities'), but specific constraints are not formally stated. No guidance on what conclusions should NOT be drawn (e.g., 'not for evaluating system design' or 'not applicable to non-instruction-tuned models').",
    197           "source": "haiku"
    198         }
    199       }
    200     }
    201   },
    202   "claims": [
    203     {
    204       "claim": "LLMs are generally good at computation tasks (average 82.44% Pass@1) while falling short on cryptography (33.08%) and system coding (37.50%) domains.",
    205       "evidence": "Table 1, Pass@1 column: computation domain macro-average 82.44%, cryptography 33.08%, system 37.50% across 12 models.",
    206       "supported": "strong"
    207     },
    208     {
    209       "claim": "The performance gap between domains can be as much as 68.94% (80.94% - 12.0%) in Llama-2-13b-chat.",
    210       "evidence": "Table 1, Pass@1: Llama-2-13b computation 80.94%, cryptography 12.0%.",
    211       "supported": "strong"
    212     },
    213     {
    214       "claim": "Generating more samples increases overall LLM performance (Pass@1 53.42% → Pass@5 59.60%).",
    215       "evidence": "Table 1, average rows: Pass@1 macro-average 53.42%, Pass@5 macro-average 59.60%.",
    216       "supported": "strong"
    217     },
    218     {
    219       "claim": "Domain bias may increase with more samples; CodeLlama-13b-instruct standard deviation increases from 19.90 to 20.55.",
    220       "evidence": "Table 1, Std column: CodeLlama-13b Pass@1 Std 19.90, Pass@5 Std 20.55.",
    221       "supported": "moderate"
    222     },
    223     {
    224       "claim": "The fully automated pipeline provides contamination resistance by enabling continuous benchmark updates.",
    225       "evidence": "Abstract and introduction claim updates via pipeline maintain 'integrity and novelty' citing Cao et al. 2024b; no empirical demonstration provided.",
    226       "supported": "weak"
    227     },
    228     {
    229       "claim": "CodeLlama fine-tuning achieves 11.25% average improvement over Llama-2-13b (57.74% - 46.49% Pass@5) but domain gaps persist.",
    230       "evidence": "Table 1, Pass@5: CodeLlama-13b 57.74%, Llama-2-13b 46.49%; Std values still substantial (20.55 vs 24.10).",
    231       "supported": "strong"
    232     }
    233   ],
    234   "methodology_tags": [
    235     "benchmark-eval",
    236     "observational"
    237   ],
    238   "key_findings": "DOMAINEVAL—a 2,454-problem multi-domain code generation benchmark across six Python domains—reveals stark LLM performance disparities: computation achieves 82.44% Pass@1 (ceiling effects at 90%+), while cryptography and system domains score 33.08% and 37.50% respectively, with gaps exceeding 68% in some models. Increasing sample count (Pass@1 to Pass@5) improves aggregate performance by ~6%, but paradoxically increases domain bias variability in some models (CodeLlama std increases from 19.90 to 20.55). Two case studies highlight specific failure modes—LLMs mishandle large-integer square roots in RSA attack tasks and fail to account for locale-dependent Linux command output—suggesting domain-specific knowledge gaps rather than general capability limitations.",
    239   "red_flags": [
    240     {
    241       "flag": "No human baseline",
    242       "detail": "Without human performance reference, it is unclear whether 82% on computation or 33% on cryptography reflects LLM capability or artifacts of benchmark construction (e.g., instruction phrasing, test case design)."
    243     },
    244     {
    245       "flag": "Ceiling effects in computation domain",
    246       "detail": "Computation tasks show ceiling effects (82-91% for top models), limiting discriminative power for ranking models in this domain. Benchmark redesign may be needed."
    247     },
    248     {
    249       "flag": "Contamination resistance unvalidated",
    250       "detail": "Claims about resisting data contamination via continuous updates are speculative. GitHub repos with ≥100 stars lack date stamps, risking overlap with pre-2024 training data. No temporal validation provided."
    251     },
    252     {
    253       "flag": "No formal limitations section",
    254       "detail": "Absence of explicit scope boundaries, threats to validity, or generalizability caveats. Readers cannot assess applicability to non-instruction-tuned models, other languages, or different code lengths."
    255     },
    256     {
    257       "flag": "LLM-generated instructions introduce bias",
    258       "detail": "Instructions are generated by Qwen2-72B-Instruct, which may systematically bias task phrasing toward certain model families, favoring the Qwen and related model series."
    259     },
    260     {
    261       "flag": "Import auto-completion in evaluation",
    262       "detail": "Missing imports are auto-completed during evaluation ('tolerable flaw'), changing the true failure modes. Models that forget imports are not penalized, inflating scores for careless implementations."
    263     },
    264     {
    265       "flag": "Domain selection not justified",
    266       "detail": "Six domains are chosen by reference to prior work (Zhuo et al. 2024) but no argument is made for why these six domains collectively measure domain-specific capability or whether other domains should be included."
    267     },
    268     {
    269       "flag": "Limited model diversity",
    270       "detail": "12 models tested are predominantly instruction-tuned variants (GPT, Qwen, CodeLlama, DeepSeek). Generalization to non-instruction-tuned, multilingual, or non-English models is unclear."
    271     },
    272     {
    273       "flag": "No disclosure of funding source",
    274       "detail": "Funding source is not disclosed, raising questions about independence and whether institutional interests (e.g., promoting Chinese Academy of Sciences research) influence benchmark design."
    275     }
    276   ],
    277   "cited_papers": [
    278     {
    279       "title": "Evaluating Large Language Models Trained on Code",
    280       "relevance": "Foundational HumanEval benchmark for code generation evaluation; establishes Pass@k metric used in DOMAINEVAL."
    281     },
    282     {
    283       "title": "Program Synthesis with Large Language Models",
    284       "relevance": "MBPP benchmark; prior code generation benchmark that DOMAINEVAL extends to multi-domain scenarios."
    285     },
    286     {
    287       "title": "Measuring Coding Challenge Competence With APPS",
    288       "relevance": "APPS benchmark sourced from competitive programming; prior dataset for code generation evaluation."
    289     },
    290     {
    291       "title": "Concerned with Data Contamination? Assessing Countermeasures in Code Language Model",
    292       "relevance": "Addresses data contamination threat in LLM benchmarks; cited to justify DOMAINEVAL's continuous-update contamination resistance claim."
    293     },
    294     {
    295       "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
    296       "relevance": "Related multi-domain benchmark exploring LLM capability across domains via API calls; differentiates from DOMAINEVAL's focus on direct implementation."
    297     },
    298     {
    299       "title": "ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on Class-level Code Generation",
    300       "relevance": "Class-level code generation benchmark; shows progression from function-level (HumanEval) to larger code structures."
    301     },
    302     {
    303       "title": "CodeBenchGen: Creating Scalable Execution-based Code Generation Benchmarks",
    304       "relevance": "Automated benchmark construction pipeline; directly related to DOMAINEVAL's fully-automated construction approach."
    305     },
    306     {
    307       "title": "RepoCoder: Repository-Level Code Completion Through Iterative Retrieval and Generation",
    308       "relevance": "Repository-level code generation; contrasts with DOMAINEVAL's function-level scope to clarify contribution boundaries."
    309     }
    310   ],
    311   "engagement_factors": {
    312     "practical_relevance": {
    313       "score": 2,
    314       "justification": "Benchmark covers real Python packages (numpy, pandas, cryptography), so practitioners can see LLM capability on familiar libraries. Leaderboard enables direct tool comparison. However, no discussion of production integration or how results translate to development workflows."
    315     },
    316     "surprise_contrarian": {
    317       "score": 2,
    318       "justification": "LLMs excelling at computation vs. struggling with cryptography aligns with intuition (training data bias toward tutorials). The observation that more samples increase domain bias contradicts scaling intuitions, but is based on one model's std deviation change and lacks systematic evidence."
    319     },
    320     "fear_safety": {
    321       "score": 1,
    322       "justification": "Cryptography domain is security-adjacent, and failures in RSA and key derivation are concerning. However, the paper frames this as a capability gap, not a safety risk. No discussion of whether weak cryptography implementation should be flagged as dangerous."
    323     },
    324     "demo_ability": {
    325       "score": 2,
    326       "justification": "The leaderboard (domaineval.github.io) allows live model submission and comparison. Practitioners can run their own models on the benchmark via the site. However, the paper does not describe how to download the benchmark locally or integrate it into development tools."
    327     },
    328     "drama_conflict": {
    329       "score": 1,
    330       "justification": "No controversial claims or conflict angles. Straightforward benchmark paper with findings presented as technical observations rather than provocative conclusions."
    331     },
    332     "brand_recognition": {
    333       "score": 1,
    334       "justification": "Authors are from Chinese Academy of Sciences (ISCAS) and Hong Kong University of Science and Technology—respected but not tier-1 labs (OpenAI, DeepMind, Meta AI). No prominent researchers named that would drive social media amplification."
    335     }
    336   },
    337   "hn_data": {
    338     "threads": [
    339       {
    340         "hn_id": "39831754",
    341         "title": "GPT-4V(ision) Unsuitable for Clinical Care and Education: An Evaluation",
    342         "points": 75,
    343         "comments": 52,
    344         "url": "https://news.ycombinator.com/item?id=39831754"
    345       },
    346       {
    347         "hn_id": "41663273",
    348         "title": "Unsafe Impedance: Safe Languages and Safe by Design Software",
    349         "points": 7,
    350         "comments": 1,
    351         "url": "https://news.ycombinator.com/item?id=41663273"
    352       },
    353       {
    354         "hn_id": "40135927",
    355         "title": "OpenAI: Training LLMs to Prioritize Privileged Instructions",
    356         "points": 3,
    357         "comments": 0,
    358         "url": "https://news.ycombinator.com/item?id=40135927"
    359       },
    360       {
    361         "hn_id": "41418082",
    362         "title": "Data Exposure from LLM Apps: An In-Depth Investigation of OpenAI's GPTs",
    363         "points": 2,
    364         "comments": 0,
    365         "url": "https://news.ycombinator.com/item?id=41418082"
    366       },
    367       {
    368         "hn_id": "41408373",
    369         "title": "Data Exposure from LLM Apps: An In-Depth Investigation of OpenAI's GPTs",
    370         "points": 2,
    371         "comments": 0,
    372         "url": "https://news.ycombinator.com/item?id=41408373"
    373       },
    374       {
    375         "hn_id": "39139543",
    376         "title": "Exploring Parent's Needs for Children-Centered AI to Support Preschoolers",
    377         "points": 2,
    378         "comments": 1,
    379         "url": "https://news.ycombinator.com/item?id=39139543"
    380       },
    381       {
    382         "hn_id": "37345839",
    383         "title": "Relighting Neural Radiance Fields with Shadow and Highlight Hints",
    384         "points": 2,
    385         "comments": 0,
    386         "url": "https://news.ycombinator.com/item?id=37345839"
    387       },
    388       {
    389         "hn_id": "41227450",
    390         "title": "Τ-Bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains",
    391         "points": 1,
    392         "comments": 0,
    393         "url": "https://news.ycombinator.com/item?id=41227450"
    394       },
    395       {
    396         "hn_id": "40965488",
    397         "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions",
    398         "points": 1,
    399         "comments": 0,
    400         "url": "https://news.ycombinator.com/item?id=40965488"
    401       },
    402       {
    403         "hn_id": "40157957",
    404         "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions",
    405         "points": 1,
    406         "comments": 0,
    407         "url": "https://news.ycombinator.com/item?id=40157957"
    408       }
    409     ],
    410     "top_points": 75,
    411     "total_points": 96,
    412     "total_comments": 54
    413   }
    414 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs