scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30053B)
      1 {
      2   "paper": {
      3     "title": "DOMAINEVAL: An Auto-Constructed Benchmark for Multi-Domain Code Generation",
      4     "authors": [
      5       "Qiming Zhu",
      6       "Jialun Cao",
      7       "Yaojie Lu",
      8       "Hongyu Lin",
      9       "Xianpei Han",
     10       "Le Sun",
     11       "Shing-Chi Cheung"
     12     ],
     13     "year": 2024,
     14     "venue": "AAAI Conference on Artificial Intelligence 2025",
     15     "arxiv_id": "2408.13204",
     16     "doi": "10.48550/arXiv.2408.13204"
     17   },
     18   "scan_version": 3,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval"],
     21   "key_findings": "DOMAINEVAL is a multi-domain code generation benchmark with 2454 subjects across six domains (computation, network, visualization, basic, system, cryptography), constructed via a fully automated pipeline from GitHub repositories. Evaluation of 12 LLMs shows strong computation performance (82.44% avg Pass@1) but poor cryptography (33.08%) and system (37.50%) performance, with gaps of up to 68.94 percentage points within a single model (Llama-2-13b-chat). Generating more samples (Pass@5) improves overall performance but does not reduce domain bias. GPT-4o-mini achieves the highest and most stable performance across domains.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The paper provides a leaderboard URL (https://domaineval.github.io/) but no explicit source code repository URL (e.g., GitHub link) for the benchmark construction pipeline or evaluation code."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The paper provides a leaderboard at https://domaineval.github.io/, which is a GitHub Pages site for the benchmark. As a published AAAI benchmark paper with a public leaderboard, the benchmark data is made available for submission and evaluation."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper mentions 'a basic Python environment and necessary packages' and 'torch.bfloat16 when loading LLMs' but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions, README with commands, or scripts for replicating experiments are described in the paper."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "Results in Table 1 report only point estimates (e.g., '82.44% Pass@1'). The 'Std' column measures standard deviation across domains (not across experimental runs), and no confidence intervals or error bars are provided."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper claims performance differences between domains and models (e.g., 'LLMs are generally good at computation tasks while falling short on cryptography') but uses no statistical significance tests to support these comparative claims."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "The paper reports absolute performance values with context, e.g., 'The performance gap can be as much as 68.94% (80.94% - 12.0%) Pass@1 in Llama-2-13b-chat' and provides full baseline context in Table 1."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The benchmark has 2454 subjects total but with severe domain imbalance (1705 computation vs 100 for system and cryptography). No justification for these sample sizes or discussion of whether 100 subjects per domain provides sufficient statistical power."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "For Pass@1, greedy decoding is deterministic (no variance to report). For Pass@5, temperature sampling is used but no variance across runs or seeds is reported. The 'Std' column measures cross-domain variation, not experimental variance."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "The paper evaluates 12 LLMs including GPT-4o-mini, GPT-3.5-turbo, DeepSeek-Coder series, Llama-2, CodeLlama series, Qwen2, Phi-3, and CodeQwen1.5, providing extensive cross-model comparison."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The evaluated models include contemporary models as of 2024: GPT-4o-mini, DeepSeek-Coder-V2-Lite-Instruct, Qwen2-72B, and Phi-3, alongside older models for comparison."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "The benchmark construction pipeline has multiple components (test-method matching, three selection criteria, instruction generation) but no ablation study examines the contribution of each component to benchmark quality."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "The paper reports both Pass@1 (greedy decoding) and Pass@5 (sampling), along with macro-average and standard deviation across domains."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No human evaluation of generated code quality is performed. Evaluation is entirely automated via test case execution (Pass@k)."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "All models are evaluated zero-shot with no tuning on the benchmark data. The entire benchmark serves as a held-out test set since no development or validation split is used for model selection."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Table 1 provides complete per-domain breakdowns across all six domains (computation, network, visualization, basic, system, cryptography) for every model and metric."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The Case Study section (Figures 7-8) discusses two specific failure cases: GPT-4o-mini's incorrect use of float exponentiation for large numbers in cryptography, and incorrect parsing of the 'free' command output in the system domain."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper reports that LLMs perform poorly on cryptography (33.08%) and system (37.50%) domains, and that 'the domain bias may even increase' with more samples (CodeLlama-13b-Instruct deviation increased from 19.90 to 20.55)."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Abstract claims about domain performance gaps (82.44% computation vs 33.08% cryptography), the 68.94% gap in Llama-2-13b-chat, and increasing bias with more samples are all directly supported by Table 1."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The paper claims 'fine-tuning can bring about overall improvement, while the domain gaps still exist' based on comparing Llama-2-13b-chat vs CodeLlama-13b-Instruct. This is a causal claim, but confounds (different instruction tuning, different training data beyond code) are not addressed."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper's title claims 'Multi-Domain Code Generation' broadly, but all experiments are Python-only. The paper does not caveat that results may not generalize to other programming languages."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "No discussion of alternative explanations for domain performance differences. For example, whether differences stem from training data distribution, domain-specific complexity, or test difficulty is not explored."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper measures Pass@k (functional correctness via test execution) and frames results in terms of 'code generation capability.' Pass@k directly measures functional correctness, and the claims match the granularity of measurements."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "Open-source models are specified with detail (e.g., 'DeepSeek-Coder-33b-instruct', 'Qwen2-72B-Instruct-GPTQ-Int4'), but closed-source models are listed only as 'GPT-3.5-turbo' and 'GPT-4o-mini' without API version dates or snapshot identifiers."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Figure 4 provides the full instruction generation prompt template, and Figure 6 provides the full code generation task prompt, both with actual text used in experiments."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "The paper states: 'For Pass@1 metric, we use greedy decoding, i.e. set temperature to 0.0. For Pass@5 metric, we opt for the minimum sample size N = 5 and maintain temperature at 0.2 and top-p at 0.95. We use torch.bfloat16 when loading LLMs.'"
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No agentic scaffolding is used. The evaluation is direct prompt-to-completion code generation."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The three-step pipeline is documented in detail: Step I (domain repository collection from 100+ star GitHub repos), Step II (test-method matching via AST parsing and selection via executable/significant/appropriate difficulty criteria), Step III (LLM-based instruction generation)."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The paper has no limitations, threats to validity, or similar section. The conclusion mentions 'future research directions' but does not discuss limitations of the current work."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No threats to validity are discussed. There is no analysis of potential biases in the benchmark construction, domain classification, or evaluation methodology."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "The paper does not state what the results do NOT show. It does not acknowledge that results are limited to Python, to function-level code generation, or to the specific repositories selected."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "The leaderboard URL is provided but the paper does not provide a direct download link for the raw benchmark data (subjects, test cases, reference solutions) or intermediate pipeline outputs."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The data collection is described in detail: GitHub repositories with 100+ stars classified into six domains, AST-based test-method matching, filtering via three criteria (executable, significant, appropriate difficulty with 3-100 line limits), and LLM-based instruction generation."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Repository selection criteria are described: 'GitHub repositories... particularly those with at least 100 stars, as these are considered high-quality code data.' Domain assignment uses 'the repository's topic labels and README files.' Figure 2 lists all 91 repositories used."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The full pipeline from collection to final dataset is documented with three explicit steps (Figure 2), including filtering criteria and the final count of 2454 subjects with 5892 test cases across six domains."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding sources, grants, or sponsorships are mentioned anywhere in the paper."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are clearly listed: Chinese Academy of Sciences (Institute of Software) and Hong Kong University of Science and Technology. None of the authors are affiliated with the companies whose models are evaluated."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No funding is disclosed, so independence of funder cannot be assessed."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement is present in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No training data cutoff dates are stated for any of the 12 evaluated models, despite evaluating them on benchmark data sourced from public GitHub repositories."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "The paper mentions that the pipeline 'fortifies DOMAINEVAL against the data contamination threat' through continuous updates, but does not analyze whether the current benchmark data from well-known repos (numpy, pandas, scikit-learn, tensorflow) overlaps with any model's training data."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "The benchmark draws from extremely popular Python repositories (numpy, pandas, scikit-learn, tensorflow, django, matplotlib, etc.) that are almost certainly in every evaluated LLM's training data. No contamination analysis is performed for the current evaluation, only a conceptual claim about future pipeline updates."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants. This is a benchmark construction and automated model evaluation study."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in the study."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in the study."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in the study."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in the study."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in the study."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in the study."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No inference costs, API costs, tokens consumed, or wall-clock time are reported for evaluating 12 models on 2454 subjects."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "No total computational budget (GPU hours, API spend, hardware used) is stated for either benchmark construction or model evaluation."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Pass@1 uses greedy decoding (deterministic, no seed sensitivity). For Pass@5 with temperature=0.2, no seed sensitivity analysis is reported across different random seeds."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "The paper explicitly states N=1 for Pass@1 (greedy) and N=5 for Pass@5 (sampling), which fully specifies the number of runs per evaluation setting."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No discussion of whether alternative evaluation settings (different temperatures, prompts, or decoding strategies) were tried. The chosen settings follow prior work but no search budget is stated."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": true,
    319         "justification": "The evaluation configuration follows established protocol from prior work (Zhuo et al. 2024): greedy decoding for Pass@1, temperature=0.2 and top-p=0.95 for Pass@5. The choices are referenced and standard."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The paper makes numerous comparisons across 12 models and 6 domains but uses no statistical tests at all, so no multiple comparison correction is applied."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "Qwen2-72B-Instruct-GPTQ-Int4 is used to generate task instructions AND is one of the 12 evaluated models. This circular construction-evaluation dependency, which could bias Qwen2's performance, is not discussed."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "Models ranging from 6.7B to 72B parameters are compared without discussing compute requirements. No performance-per-compute analysis is provided."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "The paper does not discuss whether Pass@k on code extracted from GitHub repositories actually measures 'domain-specific code generation capability' as claimed, or whether the 6-domain classification scheme has construct validity."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": false,
    343         "answer": false,
    344         "justification": "No scaffolding is used. All models are evaluated via direct prompt-to-completion code generation."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "The benchmark uses code from established repositories (numpy, pandas, scikit-learn, etc.) that have been public for years before any evaluated model's training cutoff. This temporal leakage risk is not analyzed."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "The evaluation provides import statements and class context as input (mentioned in 'Evaluation Process' section), which could leak information about expected solutions. This is not discussed as a potential leakage concern."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "Multiple subjects are drawn from the same repositories (e.g., 1705 computation subjects from 15 repos), creating dependencies between test examples. This non-independence is not discussed."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No concrete leakage detection method (canary strings, n-gram overlap, membership inference) is applied. The paper only mentions the pipeline's potential for future updates as a conceptual defense against contamination."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "LLMs are generally good at computation tasks (82.44% avg Pass@1) while falling short on cryptography (33.08%) and system (37.50%) domains.",
    373       "evidence": "Table 1 shows average Pass@1 across 12 LLMs for each domain, with computation consistently highest and cryptography consistently lowest.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "The performance gap across domains can be as much as 68.94% within a single LLM (Llama-2-13b-chat: 80.94% computation vs 12.0% cryptography).",
    378       "evidence": "Table 1, Llama-2-13b-chat row shows 80.94% Pass@1 on computation and 12.0% on cryptography, a 68.94 percentage point gap.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Generating more samples increases overall performance but domain bias may even increase.",
    383       "evidence": "Table 1 comparison of Pass@1 (avg 53.42%, std 18.33) vs Pass@5 (avg 59.60%, std 17.72). CodeLlama-13b-Instruct std increased from 19.90 to 20.55.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "GPT-4o-mini exhibits the highest and most stable performance across domains.",
    388       "evidence": "Table 1 shows GPT-4o-mini achieves 67.13% Pass@5 (highest average) with 14.75 standard deviation across domains (lowest variance).",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Fine-tuning (CodeLlama from Llama-2) brings overall improvement but domain gaps persist.",
    393       "evidence": "Table 1 shows CodeLlama-13b-Instruct achieves 57.74% Pass@5 vs Llama-2-13b-chat at 46.49% (an 11.25 percentage-point improvement), but the cross-domain standard deviation remains high (20.55 vs 24.10, respectively).",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "The automated pipeline enables scalable benchmark construction that can mitigate data contamination through continuous code updates.",
    398       "evidence": "The pipeline is described in detail (Section 'Benchmark Construction') but no empirical evidence demonstrates that updates actually mitigate contamination.",
    399       "supported": "weak"
    400     }
    401   ],
    402   "red_flags": [
    403     {
    404       "flag": "Severely imbalanced domain sizes",
    405       "detail": "Computation has 1705 subjects (69.5% of all 2454) while system and cryptography each have only 100. This 17:1 imbalance means the benchmark is dominated by one domain. While macro-averaging addresses this for overall scores, statistical power differs dramatically across domains."
    406     },
    407     {
    408       "flag": "No contamination analysis despite high-risk data sources",
    409       "detail": "The benchmark draws from some of the most popular Python repositories in existence (numpy, pandas, scikit-learn, tensorflow, django, matplotlib). These repositories and their test suites are almost certainly in every evaluated LLM's training data. The paper's only defense — 'the pipeline can be updated' — is aspirational, not actual."
    410     },
    411     {
    412       "flag": "Circular construction-evaluation dependency",
    413       "detail": "Qwen2-72B-Instruct-GPTQ-Int4 generates all task instructions (Figure 4) and is also one of the 12 evaluated models. This creates a potential self-favoring bias: the model may perform better on instructions it generated because they align with its own understanding and phrasing. This conflict is not disclosed or discussed."
    414     },
    415     {
    416       "flag": "No limitations section",
    417       "detail": "The paper contains no limitations, threats to validity, or scope boundaries discussion. For a benchmark paper proposing a novel evaluation methodology, this is a significant omission."
    418     },
    419     {
    420       "flag": "Python-only with broad domain claims",
    421       "detail": "All 2454 subjects are Python-only, yet the paper makes general claims about 'multi-domain code generation' capabilities without acknowledging this language restriction. Domain-specific challenges may differ significantly in other languages (e.g., system programming in C/Rust)."
    422     }
    423   ],
    424   "cited_papers": [
    425     {
    426       "title": "Evaluating Large Language Models Trained on Code",
    427       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    428       "year": 2021,
    429       "relevance": "Introduces HumanEval and the Pass@k metric, which DOMAINEVAL directly builds upon for evaluation."
    430     },
    431     {
    432       "title": "Program Synthesis with Large Language Models",
    433       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    434       "year": 2021,
    435       "relevance": "Introduces MBPP, another foundational code generation benchmark that DOMAINEVAL aims to complement."
    436     },
    437     {
    438       "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
    439       "authors": ["Terry Yue Zhuo", "Minh Chien Vu", "Jenny Chim"],
    440       "year": 2024,
    441       "relevance": "Closely related multi-domain code benchmark focusing on API usage; DOMAINEVAL positions itself as complementary by testing code implementation rather than API invocation."
    442     },
    443     {
    444       "title": "MultiPL-E: A Scalable and Extensible Approach to Benchmarking Neural Code Generation",
    445       "authors": ["Federico Cassano", "John Gouwar", "Daniel Nguyen"],
    446       "year": 2022,
    447       "arxiv_id": "2208.08227",
    448       "relevance": "Multi-language code generation benchmark addressing linguistic diversity, a different dimension from DOMAINEVAL's domain diversity."
    449     },
    450     {
    451       "title": "ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on Class-level Code Generation",
    452       "authors": ["Xueying Du", "Mingwei Liu", "Kaixin Wang"],
    453       "year": 2023,
    454       "relevance": "Class-level code generation benchmark, representing the code-scale diversity dimension that complements DOMAINEVAL's domain diversity."
    455     },
    456     {
    457       "title": "CoderEval: A Benchmark of Pragmatic Code Generation with Generative Pre-trained Models",
    458       "authors": ["Hao Yu", "Bo Shen", "Dezhi Ran"],
    459       "year": 2024,
    460       "relevance": "Pragmatic code generation benchmark from real-world code, sharing DOMAINEVAL's goal of testing non-standalone functions."
    461     },
    462     {
    463       "title": "DevEval: A Manually-Annotated Code Generation Benchmark Aligned with Real-World Code Repositories",
    464       "authors": ["Jia Li", "Ge Li", "Yunfei Zhao"],
    465       "year": 2024,
    466       "relevance": "Repository-aligned code generation benchmark requiring significant human effort, contrasting with DOMAINEVAL's automated pipeline."
    467     },
    468     {
    469       "title": "CodeBenchGen: Creating Scalable Execution-based Code Generation Benchmarks",
    470       "authors": ["Yiqing Xie", "Alex Xie", "Divyanshu Sheth"],
    471       "year": 2024,
    472       "relevance": "Automated benchmark construction approach using LLMs to curate CodeSearchNet, directly comparable to DOMAINEVAL's automated pipeline."
    473     },
    474     {
    475       "title": "Concerned with Data Contamination? Assessing Countermeasures in Code Language Model",
    476       "authors": ["Jialun Cao", "Wuqi Zhang", "Shing-Chi Cheung"],
    477       "year": 2024,
    478       "arxiv_id": "2403.16898",
    479       "relevance": "Directly addresses data contamination in code LLMs, the key threat to DOMAINEVAL's validity that the paper cites but does not empirically address."
    480     },
    481     {
    482       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming–The Rise of Code Intelligence",
    483       "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang"],
    484       "year": 2024,
    485       "relevance": "One of the evaluated model families in DOMAINEVAL, representative of specialized code LLMs."
    486     },
    487     {
    488       "title": "Execution-Based Evaluation for Open-Domain Code Generation",
    489       "authors": ["Zhiruo Wang", "Shuyan Zhou", "Daniel Fried"],
    490       "year": 2023,
    491       "relevance": "Execution-based code evaluation benchmark (ODEX) for open-domain code generation, sharing DOMAINEVAL's goal of testing beyond algorithmic problems."
    492     },
    493     {
    494       "title": "McEval: Massively Multilingual Code Evaluation",
    495       "authors": ["Linzheng Chai", "Shukai Liu", "Jian Yang"],
    496       "year": 2024,
    497       "relevance": "Multilingual code evaluation benchmark addressing linguistic diversity dimension complementary to DOMAINEVAL's domain diversity."
    498     }
    499   ],
    500   "engagement_factors": {
    501     "practical_relevance": {
    502       "score": 2,
    503       "justification": "The benchmark helps practitioners understand LLM strengths/weaknesses across programming domains, useful for tool selection decisions."
    504     },
    505     "surprise_contrarian": {
    506       "score": 1,
    507       "justification": "Finding that LLMs struggle with cryptography and system code is somewhat expected given training data distributions, though the magnitude of the gap (68.94%) is notable."
    508     },
    509     "fear_safety": {
    510       "score": 0,
    511       "justification": "No safety or security concerns raised by the findings."
    512     },
    513     "drama_conflict": {
    514       "score": 0,
    515       "justification": "No controversy or conflict with prior claims; the paper positions itself as complementary to existing work."
    516     },
    517     "demo_ability": {
    518       "score": 1,
    519       "justification": "A leaderboard website exists at https://domaineval.github.io/ but no interactive demo or pip-installable tool is provided."
    520     },
    521     "brand_recognition": {
    522       "score": 1,
    523       "justification": "Evaluates well-known models (GPT-4o-mini, DeepSeek-Coder) but authors are from academic institutions, not major AI labs."
    524     }
    525   }
    526 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs