scan-v4.json (23042B)
1 { 2 "scan_version": 4, 3 "paper_type": "benchmark-creation", 4 "paper": { 5 "title": "DOMAINEVAL: An Auto-Constructed Benchmark for Multi-Domain Code Generation", 6 "authors": [ 7 "Qiming Zhu", 8 "Jialun Cao", 9 "Yaojie Lu", 10 "Hongyu Lin", 11 "Xianpei Han", 12 "Le Sun", 13 "Shing-Chi Cheung" 14 ], 15 "year": 2024, 16 "venue": "AAAI Conference on Artificial Intelligence", 17 "arxiv_id": "2408.13204", 18 "doi": "10.48550/arXiv.2408.13204" 19 }, 20 "checklist": { 21 "claims_and_evidence": { 22 "abstract_claims_supported": { 23 "applies": true, 24 "answer": true, 25 "justification": "Abstract claims about domain performance gaps (82.44% computation vs 33.08% cryptography), the 68.94% gap in Llama-2-13b-chat, and increasing bias with more samples are all directly supported by Table 1.", 26 "source": "opus" 27 }, 28 "causal_claims_justified": { 29 "applies": true, 30 "answer": false, 31 "justification": "The paper claims 'fine-tuning can bring about overall improvement, while the domain gaps still exist' based on comparing Llama-2-13b-chat vs CodeLlama-13b-Instruct. This is a causal claim, but confounds (different instruction tuning, different training data beyond code) are not addressed.", 32 "source": "opus" 33 }, 34 "generalization_bounded": { 35 "applies": true, 36 "answer": false, 37 "justification": "The paper's title claims 'Multi-Domain Code Generation' broadly, but all experiments are Python-only. The paper does not caveat that results may not generalize to other programming languages.", 38 "source": "opus" 39 }, 40 "alternative_explanations_discussed": { 41 "applies": true, 42 "answer": false, 43 "justification": "No discussion of alternative explanations for domain performance differences. 
For example, whether differences stem from training data distribution, domain-specific complexity, or test difficulty is not explored.", 44 "source": "opus" 45 }, 46 "proxy_outcome_distinction": { 47 "applies": true, 48 "answer": true, 49 "justification": "The paper measures Pass@k (functional correctness via test execution) and frames results in terms of 'code generation capability.' Pass@k directly measures functional correctness, and the claims match the granularity of measurements.", 50 "source": "opus" 51 } 52 }, 53 "limitations_and_scope": { 54 "limitations_section_present": { 55 "applies": true, 56 "answer": false, 57 "justification": "The paper has no limitations, threats to validity, or similar section. The conclusion mentions 'future research directions' but does not discuss limitations of the current work.", 58 "source": "opus" 59 }, 60 "threats_to_validity_specific": { 61 "applies": true, 62 "answer": false, 63 "justification": "No threats to validity are discussed. There is no analysis of potential biases in the benchmark construction, domain classification, or evaluation methodology.", 64 "source": "opus" 65 }, 66 "scope_boundaries_stated": { 67 "applies": true, 68 "answer": false, 69 "justification": "The paper does not state what the results do NOT show. It does not acknowledge that results are limited to Python, to function-level code generation, or to the specific repositories selected.", 70 "source": "opus" 71 } 72 }, 73 "conflicts_of_interest": { 74 "funding_disclosed": { 75 "applies": true, 76 "answer": false, 77 "justification": "No funding sources, grants, or sponsorships are mentioned anywhere in the paper.", 78 "source": "opus" 79 }, 80 "affiliations_disclosed": { 81 "applies": true, 82 "answer": true, 83 "justification": "Author affiliations are clearly listed: Chinese Academy of Sciences (Institute of Software) and Hong Kong University of Science and Technology. 
None of the authors are affiliated with the companies whose models are evaluated.", 84 "source": "opus" 85 }, 86 "funder_independent_of_outcome": { 87 "applies": true, 88 "answer": false, 89 "justification": "No funding is disclosed, so independence of funder cannot be assessed.", 90 "source": "opus" 91 }, 92 "financial_interests_declared": { 93 "applies": true, 94 "answer": false, 95 "justification": "No competing interests or financial interests statement is present in the paper.", 96 "source": "opus" 97 } 98 }, 99 "scope_and_framing": { 100 "key_terms_defined": { 101 "applies": true, 102 "answer": true, 103 "justification": "Key terms ('domain-specific code generation', 'Pass@k', 'macro-average') are either defined in context, exemplified (Figure 1), or referenced as standard prior work (Chen et al. 2021). Sufficient precision for readers.", 104 "source": "haiku" 105 }, 106 "intended_contribution_clear": { 107 "applies": true, 108 "answer": true, 109 "justification": "Abstract explicitly lists three contributions: (1) benchmark dataset with six domains, (2) automated pipeline, (3) findings on LLM limitations. Each is restated in conclusion.", 110 "source": "haiku" 111 }, 112 "engagement_with_prior_work": { 113 "applies": true, 114 "answer": true, 115 "justification": "Related Work (2 pages) systematically compares DOMAINEVAL against HumanEval/MBPP (common vs domain tasks), CoderEval/ClassEval (automation level), and Zhuo et al. 2024 (API usage vs implementation). Shows clear positioning.", 116 "source": "haiku" 117 } 118 } 119 }, 120 "type_checklist": { 121 "benchmark-creation": { 122 "construct_design": { 123 "construct_validity_argued": { 124 "applies": true, 125 "answer": false, 126 "justification": "Paper assumes functional correctness on computation/cryptography/system tasks measures LLM capability in those domains, but never argues *why* (e.g., 'implementing RSA measures cryptographic understanding because X'). 
Validity is implicit, not justified.", 127 "source": "haiku" 128 }, 129 "difficulty_distribution_characterized": { 130 "applies": true, 131 "answer": true, 132 "justification": "Figure 5 shows line-count distribution (4-198 lines, avg 55.69). Pass@1 results reveal difficulty gradient (computation 82.44% vs cryptography 33.08%). Constraints (3-100 lines) are stated. Not explicit tiers, but distribution is characterized.", 133 "source": "haiku" 134 }, 135 "ceiling_floor_effects_checked": { 136 "applies": true, 137 "answer": false, 138 "justification": "Table 1 reveals ceiling effects (computation >75% for most models) and moderate floors (cryptography 33.08%). Paper reports results but does not discuss ceiling/floor as a limitation or validity threat.", 139 "source": "haiku" 140 }, 141 "human_baseline_included": { 142 "applies": true, 143 "answer": false, 144 "justification": "No human evaluation, no human performance data, no validation that benchmark difficulty is calibrated appropriately. Cannot compare LLM performance to human baseline.", 145 "source": "haiku" 146 }, 147 "scoring_rubric_justified": { 148 "applies": true, 149 "answer": false, 150 "justification": "Pass@k borrowed from Chen et al. 2021 without domain-specific justification. Handling missing imports via automated completion is justified pragmatically ('tolerable flaw') but not principled. No discussion of why Pass@k is optimal for domain evaluation.", 151 "source": "haiku" 152 } 153 }, 154 "robustness": { 155 "contamination_resistance_designed": { 156 "applies": true, 157 "answer": false, 158 "justification": "Abstract claims pipeline 'fortifies DOMAINEVAL against data contamination threat' but provides no mechanism: no temporal splits, canaries, or versioning strategy described. 
Claim is aspirational, not engineered.", 159 "source": "haiku" 160 }, 161 "temporal_robustness_discussed": { 162 "applies": true, 163 "answer": true, 164 "justification": "Paper states pipeline enables 'exceptional scalability, capable of incorporating the ever-evolving code corpus.' Plans for continuous updates mentioned. Lacks detail on managing obsolescence or model overfitting to benchmark.", 165 "source": "haiku" 166 }, 167 "failure_modes_discussed": { 168 "applies": true, 169 "answer": false, 170 "justification": "Case studies (Figs 7-8) show LLM failure modes, not benchmark failure modes. No discussion of what the benchmark cannot measure (e.g., code maintainability, security, real-world complexity).", 171 "source": "haiku" 172 }, 173 "baseline_implementations_provided": { 174 "applies": true, 175 "answer": true, 176 "justification": "Leaderboard available at domaineval.github.io; evaluation procedure detailed in Section 'Experiment Setup' and 'Evaluation Process'; prompt template provided (Figure 6); sufficient for reproduction. Code availability not explicitly stated but procedural clarity is high.", 177 "source": "haiku" 178 } 179 }, 180 "documentation": { 181 "dataset_documentation_complete": { 182 "applies": true, 183 "answer": true, 184 "justification": "Collection methodology detailed (Domain Repository Collection, Test-Method Matching, Instruction Generation). Source description (GitHub >100 stars), preprocessing steps (filtering criteria), and dataset statistics (2454 subjects, 5892 tests) all provided.", 185 "source": "haiku" 186 }, 187 "licensing_and_access_clear": { 188 "applies": true, 189 "answer": false, 190 "justification": "Leaderboard URL given but licensing of benchmark data, commercial use rights, and derivative work permissions are not specified. 
Unclear if code/test data are available under open license.", 191 "source": "haiku" 192 }, 193 "intended_use_specified": { 194 "applies": true, 195 "answer": true, 196 "justification": "Paper states benchmark 'designed to evaluate LLMs' coding capabilities thoroughly' and can be used for 'custom domain benchmarks.' Caveats (e.g., Pass@k doesn't measure maintainability) not discussed.", 197 "source": "haiku" 198 } 199 } 200 } 201 }, 202 "claims": [ 203 { 204 "claim": "LLMs are generally good at computation tasks while falling short on cryptography and system coding tasks", 205 "evidence": "Table 1: Computation 82.44% Pass@1, Cryptography 33.08%, System 37.50% (macro-averages across 12 models)", 206 "supported": "strong" 207 }, 208 { 209 "claim": "Performance gap between domains can exceed 68%", 210 "evidence": "Table 1: Llama-2-13b-chat shows 80.94% (Computation) - 12.0% (Cryptography) = 68.94% gap", 211 "supported": "strong" 212 }, 213 { 214 "claim": "Generating more samples increases overall performance", 215 "evidence": "Table 1: Pass@1 average 53.42% vs Pass@5 average 59.60% (6.18pp improvement across all models and domains)", 216 "supported": "strong" 217 }, 218 { 219 "claim": "Generating more samples increases domain bias", 220 "evidence": "Section 'Impact of Generated Samples': Standard deviation (bias measure) decreases slightly on average (18.33→17.72) but CodeLlama-13b shows increase (19.90→20.55), indicating bias may increase for certain models", 221 "supported": "moderate" 222 }, 223 { 224 "claim": "GPT-4o-mini exhibits the most stable performance across domains", 225 "evidence": "Table 1: GPT-4o-mini has lowest standard deviation in Pass@5 (14.75) compared to 15.45-24.10 for other models", 226 "supported": "strong" 227 }, 228 { 229 "claim": "Fine-tuning improves overall performance but domain gaps persist", 230 "evidence": "Section 'LLMs Biases': CodeLlama-13b (fine-tuned from Llama-2-13b) achieves 11.25% improvement overall but domain gaps remain 
unresolved", 231 "supported": "strong" 232 }, 233 { 234 "claim": "DOMAINEVAL provides an automated, scalable pipeline for benchmark construction", 235 "evidence": "Section 'Benchmark Construction': Describes fully automated three-step pipeline (repository collection, test-method matching, instruction generation) applied to construct 2454 subjects", 236 "supported": "strong" 237 }, 238 { 239 "claim": "Code from different domains requires different types of knowledge and skills", 240 "evidence": "Figure 1 and case studies show computation involves mathematical operations, cryptography requires algorithm knowledge (RSA attacks), system tasks require OS understanding; all show different error patterns in Figure 7-8", 241 "supported": "strong" 242 } 243 ], 244 "methodology_tags": [ 245 "benchmark-eval", 246 "empirical" 247 ], 248 "key_findings": "DOMAINEVAL reveals pronounced domain biases in LLM code generation: computation tasks average 82.44% Pass@1 while cryptography and system domains average 33.08% and 37.50% respectively, with individual models showing gaps exceeding 68%. GPT-4o-mini and Qwen2-72B-Instruct lead in overall performance with 67.13% and 64.25% Pass@5. Increasing sampling from 1 to 5 uniformly improves performance (+6.18pp average) but paradoxically increases domain bias in some models (CodeLlama-13b), suggesting models amplify their weaknesses rather than fixing them with more attempts.", 249 "red_flags": [ 250 { 251 "flag": "No human baseline", 252 "detail": "Benchmark difficulty not validated against human performance. Unknown if 82% Pass@1 on computation is 'easy' (should be >95%) or 'appropriately hard' (should be 60-80%)." 253 }, 254 { 255 "flag": "No limitations section", 256 "detail": "Paper omits systematic discussion of threats to validity, benchmark design limitations, or scope constraints beyond stating basic boundaries." 
257 }, 258 { 259 "flag": "Instructions generated by LLM", 260 "detail": "Uses Qwen2-72B to generate task descriptions rather than human-written instructions. Quality variation and consistency not discussed; could introduce artifact-specific biases." 261 }, 262 { 263 "flag": "GitHub selection bias", 264 "detail": "Repositories selected for >100 stars may skew toward popular, well-maintained code. Small/niche domain projects underrepresented." 265 }, 266 { 267 "flag": "Contamination resistance claimed without mechanism", 268 "detail": "Abstract claims automated pipeline 'fortifies DOMAINEVAL against data contamination threat' but provides no concrete mechanism (temporal splits, canary strings, versioning) beyond vague 'continuous updates'." 269 }, 270 { 271 "flag": "Ceiling effects in computation domain unaddressed", 272 "detail": "Computation domain scores >75% for nearly all models, suggesting task difficulty may be too low to discriminate performance. Not flagged as a limitation." 273 }, 274 { 275 "flag": "Missing imports corrected automatically", 276 "detail": "Paper adds missing import statements during evaluation, treating omitted imports as a 'tolerable flaw.' Raises fairness questions: are models being graded on import knowledge or just logic? Correction not standard in prior benchmarks." 277 }, 278 { 279 "flag": "Licensing and access unclear", 280 "detail": "Paper mentions leaderboard availability but does not specify whether benchmark data (2454 subjects, 5892 tests) is released, under what license, or with what usage restrictions." 281 }, 282 { 283 "flag": "Construct validity assumed not argued", 284 "detail": "Paper assumes functional correctness on domain-specific code reveals 'domain capability' but doesn't justify why (e.g., why RSA attack code measures cryptographic understanding)." 285 }, 286 { 287 "flag": "Alternative explanations not explored", 288 "detail": "Why is computation intrinsically easier? Is it because training data emphasizes math? 
Because cryptography is rarer? Root causes of domain bias asserted but not analyzed." 289 } 290 ], 291 "cited_papers": [ 292 { 293 "title": "Evaluating Large Language Models Trained on Code", 294 "authors": "Chen et al.", 295 "year": 2021, 296 "venue": "NeurIPS", 297 "relevance": "Foundational HumanEval benchmark; establishes Pass@k metric and function-level code generation evaluation paradigm that DOMAINEVAL extends to multi-domain setting" 298 }, 299 { 300 "title": "Program Synthesis with Large Language Models", 301 "authors": "Austin et al.", 302 "year": 2021, 303 "venue": "NeurIPS", 304 "relevance": "MBPP benchmark for programming tasks; one of the primary 'common task' baselines that DOMAINEVAL contrasts against by adding domain diversity" 305 }, 306 { 307 "title": "Measuring Coding Challenge Competence With APPS", 308 "authors": "Hendrycks et al.", 309 "year": 2021, 310 "venue": "NeurIPS", 311 "relevance": "APPS algorithm competition dataset; related benchmark emphasizing complexity but not domain specialization that DOMAINEVAL addresses" 312 }, 313 { 314 "title": "CoderEval: A Benchmark of Pragmatic Code Generation with Generative Pre-trained Models", 315 "authors": "Yu et al.", 316 "year": 2024, 317 "venue": "ICML", 318 "relevance": "Recent pragmatic code benchmark using GitHub; DOMAINEVAL explicitly compares against CoderEval's approach to real-world code sourcing" 319 }, 320 { 321 "title": "ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on Class-level Code Generation", 322 "authors": "Du et al.", 323 "year": 2023, 324 "venue": "ICLR", 325 "relevance": "Class-level code generation benchmark; demonstrates escalation from function to class granularity, which DOMAINEVAL complements with domain orthogonalization" 326 }, 327 { 328 "title": "Concerned with Data Contamination? 
Assessing Countermeasures in Code Language Model", 329 "authors": "Cao et al.", 330 "year": 2024, 331 "relevance": "Data contamination threat in code LMs; directly cited in DOMAINEVAL as motivation for automated pipeline's claimed contamination resistance" 332 }, 333 { 334 "title": "DeepSeek-Coder: When the Large Language Model Meets Programming", 335 "authors": "Guo et al.", 336 "year": 2024, 337 "relevance": "Code-specific LLM series; DeepSeek-Coder included in DOMAINEVAL evaluation alongside GPT and Llama models" 338 }, 339 { 340 "title": "CERT: Continual Pre-Training on Sketches for Library-Oriented Code Generation", 341 "authors": "Zan et al.", 342 "year": 2022, 343 "relevance": "Library-oriented domain-specific code generation; prior work on domain-tailored evaluation that DOMAINEVAL systematically extends" 344 }, 345 { 346 "title": "MultiPL-E: A Scalable and Extensible Approach to Benchmarking Neural Code Generation", 347 "authors": "Cassano et al.", 348 "year": 2022, 349 "relevance": "Multi-language code benchmark via translation; DOMAINEVAL contrasts its multi-domain approach against MultiPL-E's multi-language orthogonalization" 350 }, 351 { 352 "title": "CodeBenchGen: Creating Scalable Execution-based Code Generation Benchmarks", 353 "authors": "Xie et al.", 354 "year": 2024, 355 "relevance": "Scalable automated benchmark construction using LLM; directly relevant precedent for DOMAINEVAL's automated pipeline approach" 356 } 357 ], 358 "engagement_factors": { 359 "practical_relevance": { 360 "score": 2, 361 "justification": "The benchmark helps practitioners understand LLM strengths/weaknesses across programming domains, useful for tool selection decisions." 362 }, 363 "surprise_contrarian": { 364 "score": 1, 365 "justification": "Finding that LLMs struggle with cryptography and system code is somewhat expected given training data distributions, though the magnitude of the gap (68.94%) is notable." 
366 }, 367 "fear_safety": { 368 "score": 0, 369 "justification": "No safety or security concerns raised by the findings." 370 }, 371 "drama_conflict": { 372 "score": 0, 373 "justification": "No controversy or conflict with prior claims; the paper positions itself as complementary to existing work." 374 }, 375 "demo_ability": { 376 "score": 1, 377 "justification": "A leaderboard website exists at https://domaineval.github.io/ but no interactive demo or pip-installable tool is provided." 378 }, 379 "brand_recognition": { 380 "score": 1, 381 "justification": "Evaluates well-known models (GPT-4o-mini, DeepSeek-Coder) but authors are from academic institutions, not major AI labs." 382 } 383 }, 384 "hn_data": { 385 "threads": [ 386 { 387 "hn_id": "39831754", 388 "title": "GPT-4V(ision) Unsuitable for Clinical Care and Education: An Evaluation", 389 "points": 75, 390 "comments": 52, 391 "url": "https://news.ycombinator.com/item?id=39831754" 392 }, 393 { 394 "hn_id": "41663273", 395 "title": "Unsafe Impedance: Safe Languages and Safe by Design Software", 396 "points": 7, 397 "comments": 1, 398 "url": "https://news.ycombinator.com/item?id=41663273" 399 }, 400 { 401 "hn_id": "40135927", 402 "title": "OpenAI: Training LLMs to Prioritize Privileged Instructions", 403 "points": 3, 404 "comments": 0, 405 "url": "https://news.ycombinator.com/item?id=40135927" 406 }, 407 { 408 "hn_id": "41418082", 409 "title": "Data Exposure from LLM Apps: An In-Depth Investigation of OpenAI's GPTs", 410 "points": 2, 411 "comments": 0, 412 "url": "https://news.ycombinator.com/item?id=41418082" 413 }, 414 { 415 "hn_id": "41408373", 416 "title": "Data Exposure from LLM Apps: An In-Depth Investigation of OpenAI's GPTs", 417 "points": 2, 418 "comments": 0, 419 "url": "https://news.ycombinator.com/item?id=41408373" 420 }, 421 { 422 "hn_id": "39139543", 423 "title": "Exploring Parent's Needs for Children-Centered AI to Support Preschoolers", 424 "points": 2, 425 "comments": 1, 426 "url": 
"https://news.ycombinator.com/item?id=39139543" 427 }, 428 { 429 "hn_id": "37345839", 430 "title": "Relighting Neural Radiance Fields with Shadow and Highlight Hints", 431 "points": 2, 432 "comments": 0, 433 "url": "https://news.ycombinator.com/item?id=37345839" 434 }, 435 { 436 "hn_id": "41227450", 437 "title": "Τ-Bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains", 438 "points": 1, 439 "comments": 0, 440 "url": "https://news.ycombinator.com/item?id=41227450" 441 }, 442 { 443 "hn_id": "40965488", 444 "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions", 445 "points": 1, 446 "comments": 0, 447 "url": "https://news.ycombinator.com/item?id=40965488" 448 }, 449 { 450 "hn_id": "40157957", 451 "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions", 452 "points": 1, 453 "comments": 0, 454 "url": "https://news.ycombinator.com/item?id=40157957" 455 } 456 ], 457 "top_points": 75, 458 "total_points": 96, 459 "total_comments": 54 460 } 461 }