scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (23042B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "DOMAINEVAL: An Auto-Constructed Benchmark for Multi-Domain Code Generation",
      6     "authors": [
      7       "Qiming Zhu",
      8       "Jialun Cao",
      9       "Yaojie Lu",
     10       "Hongyu Lin",
     11       "Xianpei Han",
     12       "Le Sun",
     13       "Shing-Chi Cheung"
     14     ],
     15     "year": 2024,
     16     "venue": "AAAI Conference on Artificial Intelligence",
     17     "arxiv_id": "2408.13204",
     18     "doi": "10.48550/arXiv.2408.13204"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Abstract claims about domain performance gaps (82.44% computation vs 33.08% cryptography), the 68.94% gap in Llama-2-13b-chat, and increasing bias with more samples are all directly supported by Table 1.",
     26         "source": "opus"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The paper claims 'fine-tuning can bring about overall improvement, while the domain gaps still exist' based on comparing Llama-2-13b-chat vs CodeLlama-13b-Instruct. This is a causal claim, but confounds (different instruction tuning, different training data beyond code) are not addressed.",
     32         "source": "opus"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper's title claims 'Multi-Domain Code Generation' broadly, but all experiments are Python-only. The paper does not caveat that results may not generalize to other programming languages.",
     38         "source": "opus"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No discussion of alternative explanations for domain performance differences. For example, whether differences stem from training data distribution, domain-specific complexity, or test difficulty is not explored.",
     44         "source": "opus"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper measures Pass@k (functional correctness via test execution) and frames results in terms of 'code generation capability.' Pass@k directly measures functional correctness, and the claims match the granularity of measurements.",
     50         "source": "opus"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The paper has no limitations, threats to validity, or similar section. The conclusion mentions 'future research directions' but does not discuss limitations of the current work.",
     58         "source": "opus"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No threats to validity are discussed. There is no analysis of potential biases in the benchmark construction, domain classification, or evaluation methodology.",
     64         "source": "opus"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The paper does not state what the results do NOT show. It does not acknowledge that results are limited to Python, to function-level code generation, or to the specific repositories selected.",
     70         "source": "opus"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No funding sources, grants, or sponsorships are mentioned anywhere in the paper.",
     78         "source": "opus"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Author affiliations are clearly listed: Chinese Academy of Sciences (Institute of Software) and Hong Kong University of Science and Technology. None of the authors are affiliated with the companies whose models are evaluated.",
     84         "source": "opus"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No funding is disclosed, so independence of funder cannot be assessed.",
     90         "source": "opus"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests or financial interests statement is present in the paper.",
     96         "source": "opus"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Key terms ('domain-specific code generation', 'Pass@k', 'macro-average') are either defined in context, exemplified (Figure 1), or referenced as standard prior work (Chen et al. 2021). Sufficient precision for readers.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Abstract explicitly lists three contributions: (1) benchmark dataset with six domains, (2) automated pipeline, (3) findings on LLM limitations. Each is restated in conclusion.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Related Work (2 pages) systematically compares DOMAINEVAL against HumanEval/MBPP (common vs domain tasks), CoderEval/ClassEval (automation level), and Zhuo et al. 2024 (API usage vs implementation). Shows clear positioning.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "benchmark-creation": {
    122       "construct_design": {
    123         "construct_validity_argued": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "Paper assumes functional correctness on computation/cryptography/system tasks measures LLM capability in those domains, but never argues *why* (e.g., 'implementing RSA measures cryptographic understanding because X'). Validity is implicit, not justified.",
    127           "source": "haiku"
    128         },
    129         "difficulty_distribution_characterized": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "Figure 5 shows line-count distribution (4-198 lines, avg 55.69). Pass@1 results reveal difficulty gradient (computation 82.44% vs cryptography 33.08%). Constraints (3-100 lines) are stated. Not explicit tiers, but distribution is characterized.",
    133           "source": "haiku"
    134         },
    135         "ceiling_floor_effects_checked": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "Table 1 reveals ceiling effects (computation >75% for most models) and moderate floors (cryptography 33.08%). Paper reports results but does not discuss ceiling/floor as a limitation or validity threat.",
    139           "source": "haiku"
    140         },
    141         "human_baseline_included": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "No human evaluation, no human performance data, no validation that benchmark difficulty is calibrated appropriately. Cannot compare LLM performance to human baseline.",
    145           "source": "haiku"
    146         },
    147         "scoring_rubric_justified": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Pass@k borrowed from Chen et al. 2021 without domain-specific justification. Handling missing imports via automated completion is justified pragmatically ('tolerable flaw') but not principled. No discussion of why Pass@k is optimal for domain evaluation.",
    151           "source": "haiku"
    152         }
    153       },
    154       "robustness": {
    155         "contamination_resistance_designed": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "Abstract claims pipeline 'fortifies DOMAINEVAL against data contamination threat' but provides no mechanism: no temporal splits, canaries, or versioning strategy described. Claim is aspirational, not engineered.",
    159           "source": "haiku"
    160         },
    161         "temporal_robustness_discussed": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Paper states pipeline enables 'exceptional scalability, capable of incorporating the ever-evolving code corpus.' Plans for continuous updates mentioned. Lacks detail on managing obsolescence or model overfitting to benchmark.",
    165           "source": "haiku"
    166         },
    167         "failure_modes_discussed": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "Case studies (Figs 7-8) show LLM failure modes, not benchmark failure modes. No discussion of what the benchmark cannot measure (e.g., code maintainability, security, real-world complexity).",
    171           "source": "haiku"
    172         },
    173         "baseline_implementations_provided": {
    174           "applies": true,
    175           "answer": true,
    176           "justification": "Leaderboard available at domaineval.github.io; evaluation procedure detailed in Section 'Experiment Setup' and 'Evaluation Process'; prompt template provided (Figure 6); sufficient for reproduction. Code availability not explicitly stated but procedural clarity is high.",
    177           "source": "haiku"
    178         }
    179       },
    180       "documentation": {
    181         "dataset_documentation_complete": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Collection methodology detailed (Domain Repository Collection, Test-Method Matching, Instruction Generation). Source description (GitHub >100 stars), preprocessing steps (filtering criteria), and dataset statistics (2454 subjects, 5892 tests) all provided.",
    185           "source": "haiku"
    186         },
    187         "licensing_and_access_clear": {
    188           "applies": true,
    189           "answer": false,
    190           "justification": "Leaderboard URL given but licensing of benchmark data, commercial use rights, and derivative work permissions are not specified. Unclear if code/test data are available under open license.",
    191           "source": "haiku"
    192         },
    193         "intended_use_specified": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Paper states benchmark 'designed to evaluate LLMs' coding capabilities thoroughly' and can be used for 'custom domain benchmarks.' Caveats (e.g., Pass@k doesn't measure maintainability) not discussed.",
    197           "source": "haiku"
    198         }
    199       }
    200     }
    201   },
    202   "claims": [
    203     {
    204       "claim": "LLMs are generally good at computation tasks while falling short on cryptography and system coding tasks",
    205       "evidence": "Table 1: Computation 82.44% Pass@1, Cryptography 33.08%, System 37.50% (macro-averages across 12 models)",
    206       "supported": "strong"
    207     },
    208     {
    209       "claim": "Performance gap between domains can exceed 68%",
    210       "evidence": "Table 1: Llama-2-13b-chat shows 80.94% (Computation) - 12.0% (Cryptography) = 68.94% gap",
    211       "supported": "strong"
    212     },
    213     {
    214       "claim": "Generating more samples increases overall performance",
    215       "evidence": "Table 1: Pass@1 average 53.42% vs Pass@5 average 59.60% (6.18pp improvement across all models and domains)",
    216       "supported": "strong"
    217     },
    218     {
    219       "claim": "Generating more samples increases domain bias",
    220       "evidence": "Section 'Impact of Generated Samples': Standard deviation (bias measure) decreases slightly on average (18.33→17.72) but CodeLlama-13b shows increase (19.90→20.55), indicating bias may increase for certain models",
    221       "supported": "moderate"
    222     },
    223     {
    224       "claim": "GPT-4o-mini exhibits the most stable performance across domains",
    225       "evidence": "Table 1: GPT-4o-mini has lowest standard deviation in Pass@5 (14.75) compared to 15.45-24.10 for other models",
    226       "supported": "strong"
    227     },
    228     {
    229       "claim": "Fine-tuning improves overall performance but domain gaps persist",
    230       "evidence": "Section 'LLMs Biases': CodeLlama-13b (fine-tuned from Llama-2-13b) achieves 11.25% improvement overall but domain gaps remain unresolved",
    231       "supported": "strong"
    232     },
    233     {
    234       "claim": "DOMAINEVAL provides an automated, scalable pipeline for benchmark construction",
    235       "evidence": "Section 'Benchmark Construction': Describes fully automated three-step pipeline (repository collection, test-method matching, instruction generation) applied to construct 2454 subjects",
    236       "supported": "strong"
    237     },
    238     {
    239       "claim": "Code from different domains requires different types of knowledge and skills",
    240       "evidence": "Figure 1 and case studies show computation involves mathematical operations, cryptography requires algorithm knowledge (RSA attacks), system tasks require OS understanding; all show different error patterns in Figure 7-8",
    241       "supported": "strong"
    242     }
    243   ],
    244   "methodology_tags": [
    245     "benchmark-eval",
    246     "empirical"
    247   ],
    248   "key_findings": "DOMAINEVAL reveals pronounced domain biases in LLM code generation: computation tasks average 82.44% Pass@1 while cryptography and system domains average 33.08% and 37.50% respectively, with individual models showing gaps exceeding 68%. GPT-4o-mini and Qwen2-72B-Instruct lead in overall performance with 67.13% and 64.25% Pass@5. Increasing sampling from 1 to 5 uniformly improves performance (+6.18pp average) but paradoxically increases domain bias in some models (CodeLlama-13b), suggesting models amplify their weaknesses rather than fixing them with more attempts.",
    249   "red_flags": [
    250     {
    251       "flag": "No human baseline",
    252       "detail": "Benchmark difficulty not validated against human performance. Unknown if 82% Pass@1 on computation is 'easy' (should be >95%) or 'appropriately hard' (should be 60-80%)."
    253     },
    254     {
    255       "flag": "No limitations section",
    256       "detail": "Paper omits any systematic discussion of threats to validity, benchmark design limitations, or scope constraints (e.g., Python-only, function-level tasks, the specific repositories selected)."
    257     },
    258     {
    259       "flag": "Instructions generated by LLM",
    260       "detail": "Uses Qwen2-72B to generate task descriptions rather than human-written instructions. Quality variation and consistency not discussed; could introduce artifact-specific biases."
    261     },
    262     {
    263       "flag": "GitHub selection bias",
    264       "detail": "Repositories selected for >100 stars may skew toward popular, well-maintained code. Small/niche domain projects underrepresented."
    265     },
    266     {
    267       "flag": "Contamination resistance claimed without mechanism",
    268       "detail": "Abstract claims automated pipeline 'fortifies DOMAINEVAL against data contamination threat' but provides no concrete mechanism (temporal splits, canary strings, versioning) beyond vague 'continuous updates'."
    269     },
    270     {
    271       "flag": "Ceiling effects in computation domain unaddressed",
    272       "detail": "Computation domain scores >75% for nearly all models, suggesting task difficulty may be too low to discriminate performance. Not flagged as a limitation."
    273     },
    274     {
    275       "flag": "Missing imports corrected automatically",
    276       "detail": "Paper automatically adds missing import statements during evaluation, treating omitted imports as a 'tolerable flaw.' Raises fairness questions: are models being graded on import knowledge or just logic? Correction not standard in prior benchmarks."
    277     },
    278     {
    279       "flag": "Licensing and access unclear",
    280       "detail": "Paper mentions leaderboard availability but does not specify whether benchmark data (2454 subjects, 5892 tests) is released, under what license, or with what usage restrictions."
    281     },
    282     {
    283       "flag": "Construct validity assumed not argued",
    284       "detail": "Paper assumes functional correctness on domain-specific code reveals 'domain capability' but doesn't justify why (e.g., why RSA attack code measures cryptographic understanding)."
    285     },
    286     {
    287       "flag": "Alternative explanations not explored",
    288       "detail": "Why is computation intrinsically easier? Is it because training data emphasizes math? Because cryptography is rarer? Root causes of domain bias asserted but not analyzed."
    289     }
    290   ],
    291   "cited_papers": [
    292     {
    293       "title": "Evaluating Large Language Models Trained on Code",
    294       "authors": "Chen et al.",
    295       "year": 2021,
    296       "venue": "arXiv",
    297       "relevance": "Foundational HumanEval benchmark; establishes Pass@k metric and function-level code generation evaluation paradigm that DOMAINEVAL extends to multi-domain setting"
    298     },
    299     {
    300       "title": "Program Synthesis with Large Language Models",
    301       "authors": "Austin et al.",
    302       "year": 2021,
    303       "venue": "arXiv",
    304       "relevance": "MBPP benchmark for programming tasks; one of the primary 'common task' baselines that DOMAINEVAL contrasts against by adding domain diversity"
    305     },
    306     {
    307       "title": "Measuring Coding Challenge Competence With APPS",
    308       "authors": "Hendrycks et al.",
    309       "year": 2021,
    310       "venue": "NeurIPS",
    311       "relevance": "APPS algorithm competition dataset; related benchmark emphasizing complexity but not domain specialization that DOMAINEVAL addresses"
    312     },
    313     {
    314       "title": "CoderEval: A Benchmark of Pragmatic Code Generation with Generative Pre-trained Models",
    315       "authors": "Yu et al.",
    316       "year": 2024,
    317       "venue": "ICSE",
    318       "relevance": "Recent pragmatic code benchmark using GitHub; DOMAINEVAL explicitly compares against CoderEval's approach to real-world code sourcing"
    319     },
    320     {
    321       "title": "ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on Class-level Code Generation",
    322       "authors": "Du et al.",
    323       "year": 2023,
    324       "venue": "arXiv",
    325       "relevance": "Class-level code generation benchmark; demonstrates escalation from function to class granularity, which DOMAINEVAL complements with domain orthogonalization"
    326     },
    327     {
    328       "title": "Concerned with Data Contamination? Assessing Countermeasures in Code Language Model",
    329       "authors": "Cao et al.",
    330       "year": 2024,
    331       "relevance": "Data contamination threat in code LMs; directly cited in DOMAINEVAL as motivation for automated pipeline's claimed contamination resistance"
    332     },
    333     {
    334       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    335       "authors": "Guo et al.",
    336       "year": 2024,
    337       "relevance": "Code-specific LLM series; DeepSeek-Coder included in DOMAINEVAL evaluation alongside GPT and Llama models"
    338     },
    339     {
    340       "title": "CERT: Continual Pre-Training on Sketches for Library-Oriented Code Generation",
    341       "authors": "Zan et al.",
    342       "year": 2022,
    343       "relevance": "Library-oriented domain-specific code generation; prior work on domain-tailored evaluation that DOMAINEVAL systematically extends"
    344     },
    345     {
    346       "title": "MultiPL-E: A Scalable and Extensible Approach to Benchmarking Neural Code Generation",
    347       "authors": "Cassano et al.",
    348       "year": 2022,
    349       "relevance": "Multi-language code benchmark via translation; DOMAINEVAL contrasts its multi-domain approach against MultiPL-E's multi-language orthogonalization"
    350     },
    351     {
    352       "title": "CodeBenchGen: Creating Scalable Execution-based Code Generation Benchmarks",
    353       "authors": "Xie et al.",
    354       "year": 2024,
    355       "relevance": "Scalable automated benchmark construction using LLM; directly relevant precedent for DOMAINEVAL's automated pipeline approach"
    356     }
    357   ],
    358   "engagement_factors": {
    359     "practical_relevance": {
    360       "score": 2,
    361       "justification": "The benchmark helps practitioners understand LLM strengths/weaknesses across programming domains, useful for tool selection decisions."
    362     },
    363     "surprise_contrarian": {
    364       "score": 1,
    365       "justification": "Finding that LLMs struggle with cryptography and system code is somewhat expected given training data distributions, though the magnitude of the gap (68.94%) is notable."
    366     },
    367     "fear_safety": {
    368       "score": 0,
    369       "justification": "No safety or security concerns raised by the findings."
    370     },
    371     "drama_conflict": {
    372       "score": 0,
    373       "justification": "No controversy or conflict with prior claims; the paper positions itself as complementary to existing work."
    374     },
    375     "demo_ability": {
    376       "score": 1,
    377       "justification": "A leaderboard website exists at https://domaineval.github.io/ but no interactive demo or pip-installable tool is provided."
    378     },
    379     "brand_recognition": {
    380       "score": 1,
    381       "justification": "Evaluates well-known models (GPT-4o-mini, DeepSeek-Coder) but authors are from academic institutions, not major AI labs."
    382     }
    383   },
    384   "hn_data": {
    385     "threads": [
    386       {
    387         "hn_id": "39831754",
    388         "title": "GPT-4V(ision) Unsuitable for Clinical Care and Education: An Evaluation",
    389         "points": 75,
    390         "comments": 52,
    391         "url": "https://news.ycombinator.com/item?id=39831754"
    392       },
    393       {
    394         "hn_id": "41663273",
    395         "title": "Unsafe Impedance: Safe Languages and Safe by Design Software",
    396         "points": 7,
    397         "comments": 1,
    398         "url": "https://news.ycombinator.com/item?id=41663273"
    399       },
    400       {
    401         "hn_id": "40135927",
    402         "title": "OpenAI: Training LLMs to Prioritize Privileged Instructions",
    403         "points": 3,
    404         "comments": 0,
    405         "url": "https://news.ycombinator.com/item?id=40135927"
    406       },
    407       {
    408         "hn_id": "41418082",
    409         "title": "Data Exposure from LLM Apps: An In-Depth Investigation of OpenAI's GPTs",
    410         "points": 2,
    411         "comments": 0,
    412         "url": "https://news.ycombinator.com/item?id=41418082"
    413       },
    414       {
    415         "hn_id": "41408373",
    416         "title": "Data Exposure from LLM Apps: An In-Depth Investigation of OpenAI's GPTs",
    417         "points": 2,
    418         "comments": 0,
    419         "url": "https://news.ycombinator.com/item?id=41408373"
    420       },
    421       {
    422         "hn_id": "39139543",
    423         "title": "Exploring Parent's Needs for Children-Centered AI to Support Preschoolers",
    424         "points": 2,
    425         "comments": 1,
    426         "url": "https://news.ycombinator.com/item?id=39139543"
    427       },
    428       {
    429         "hn_id": "37345839",
    430         "title": "Relighting Neural Radiance Fields with Shadow and Highlight Hints",
    431         "points": 2,
    432         "comments": 0,
    433         "url": "https://news.ycombinator.com/item?id=37345839"
    434       },
    435       {
    436         "hn_id": "41227450",
    437         "title": "Τ-Bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains",
    438         "points": 1,
    439         "comments": 0,
    440         "url": "https://news.ycombinator.com/item?id=41227450"
    441       },
    442       {
    443         "hn_id": "40965488",
    444         "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions",
    445         "points": 1,
    446         "comments": 0,
    447         "url": "https://news.ycombinator.com/item?id=40965488"
    448       },
    449       {
    450         "hn_id": "40157957",
    451         "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions",
    452         "points": 1,
    453         "comments": 0,
    454         "url": "https://news.ycombinator.com/item?id=40157957"
    455       }
    456     ],
    457     "top_points": 75,
    458     "total_points": 96,
    459     "total_comments": 54
    460   }
    461 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs