scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31793B)
      1 {
      2   "paper": {
      3     "title": "Top General Performance = Top Domain Performance? DomainCodeBench: A Multi-domain Code Generation Benchmark",
      4     "authors": [
      5       "Dewu Zheng",
      6       "Yanlin Wang",
      7       "Ensheng Shi",
      8       "Xilin Liu",
      9       "Yuchi Ma",
     10       "Hongyu Zhang",
     11       "Zibin Zheng"
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv",
     15     "arxiv_id": "2412.18573"
     16   },
     17   "scan_version": 3,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval", "qualitative"],
     20   "key_findings": "DomainCodeBench evaluates 10 LLMs across 12 application domains using 2,400 manually verified tasks and finds that top general-domain performance (e.g., GPT-4 on HumanEval) does not predict top domain-specific performance (DeepSeekCoder-33B dominates most domains despite ranking 4th on HumanEval). Manual error analysis identifies domain knowledge gaps and misuse of third-party libraries as primary failure causes. Augmenting prompts with domain-specific context (similar code, dependency info, API docs) improves CodeBLEU scores by about 38% on average, with similar-context retrieval yielding the largest gains among the individual context types.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper states: 'Our replication package, including the benchmark, source code, and experimental results, is available at https://github.com/DeepSoftwareAnalytics/DomainCodeBench.'"
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The replication package at the GitHub URL includes the benchmark dataset of 2,400 tasks."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper specifies the operating system and hardware ('Ubuntu 18.04.6 LTS with 128 Intel(R) Xeon(R) Platinum 8336C @ 2.30GHz CPUs and 8 NVIDIA A800 80GB PCIe GPUs') but provides no software dependency versions, requirements.txt, or environment setup details."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions are provided in the paper. The replication package is referenced but the paper itself contains no instructions for replicating experiments."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Tables 4, 5, and 7 report only point estimates (CodeBLEU scores) with no confidence intervals or error bars, despite averaging over 3 runs."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper makes numerous comparative claims (e.g., 'DeepSeekCoder-33B ranks 4th on HumanEval yet achieves top performance in most application domains') based solely on comparing raw CodeBLEU numbers with no statistical significance tests."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "The paper reports absolute CodeBLEU improvements in Table 7 (e.g., +17.14 for CodeLLaMa-7B with all contexts), percentage improvements ('around 38.17%'), and variance across domains (σ² column in Table 4), providing sufficient context for effect magnitudes."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No justification is given for using 200 tasks per domain, for selecting 10 LLMs, or for sampling 10% of outputs for error analysis. The numbers appear to have been chosen for convenience."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper states 'the experimental results presented in this paper are obtained by conducting three repeated experiments and averaging the results' but reports no standard deviation, IQR, or any spread measure across runs. The σ² column in Table 4 is variance across domains, not across runs."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The paper compares 10 LLMs against each other and against their HumanEval rankings, establishing a baseline comparison framework across general and domain-specific performance."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Models used include GPT-4-0125-preview, StarCoder2 (2024), DeepSeekCoder (2024), and CodeLLaMa (2023), all within 1-2 years of the paper's submission. However, notable models released earlier in 2024 (GPT-4o, Claude 3.5, Llama 3) are missing."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "RQ3 (Table 7) systematically tests individual context types (API, Dependency, Similar) and their combinations (API+Dependency, API+Similar, API+Similar+Dependency), effectively ablating each component's contribution to performance improvement."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "The paper uses only CodeBLEU as its evaluation metric. The authors explicitly acknowledge not using execution-based metrics and state 'We use CodeBLEU as the primary metric for evaluating code generation ability.'"
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Section 5.2 describes manual error analysis by five annotators with 5+ years of programming experience who analyzed LLM-generated outputs to classify failure causes, with case studies in Figure 4."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "The authors acknowledge in the threats section that they 'choose the prompt template that yields the best results after testing several options,' indicating the test data was used for prompt selection decisions without a separate validation set."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Table 4 provides per-domain breakdowns across all 12 domains for every model. Table 5 further breaks down into sub-domains for Enterprise Application and Game development."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 5.2 provides an extensive failure taxonomy (Table 6) with three detailed case studies in Figure 4 covering insufficient project understanding, limited domain library knowledge, and unfamiliarity with domain algorithms."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper reports that GPT-4 (best on HumanEval) underperforms on domain-specific tasks, that larger models don't always outperform smaller ones, and, unexpectedly, that similar context outperforms oracle dependency+API context — 'LLMs perform better in the experiment with the only similar context than in the one where all dependencies and domain-specific third-party library knowledge are provided.'"
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract's three main claims — performance decoupling (supported by Table 4), domain-specific weaknesses (supported by Section 5.2), and ~38.17% contextual enhancement (supported by Table 7) — are all backed by experimental results."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The primary causal claim — that augmenting prompts with domain-specific knowledge improves performance — is tested through controlled experiments in RQ3 where the only variable changed is the additional context provided, constituting adequate single-variable manipulation."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper's findings are worded more broadly than the tested scope supports. Finding 1 states 'Current LLMs exhibit substantial performance gaps in application-domain code generation' — generalizing from 10 tested models to 'current LLMs.' The paper tests 10 models on 12 domains but draws broad conclusions about LLMs generally."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The threats-to-validity section (Section 6) discusses methodological limitations but does not explore alternative explanations for the observed findings. For example, it does not consider whether performance differences across domains could be explained by CodeBLEU's varying reliability across programming languages, or whether training data distribution effects explain domain performance differences."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper acknowledges that CodeBLEU 'does not fully capture whether the generated code can execute correctly or pass the corresponding test cases' (Section 6), explicitly recognizing the gap between the measured proxy (CodeBLEU) and functional correctness."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "GPT-4 is specified as 'GPT-4-0125-preview.' Open-source models are identified with specific parameter sizes (CodeLLaMa-7B/13B/34B, DeepSeekCoder-6.7B/33B, StarCoder-15.5B, StarCoder2-3B/7B/15B), which uniquely identify the model checkpoints."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "The paper describes the prompt structure ('we combine the docstring and function signature in accordance with the characteristics of the task's corresponding programming language') but does not provide the actual prompt text or template used."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 4.2 states: 'we set the temperature to 0.4 and top-p to 0.95' following prior work."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "No agentic scaffolding is used. The paper performs direct code generation from prompts."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 3.2 documents the full pipeline: topic mining from tech websites (2020-2024), LDA-based domain identification, project selection by GitHub star count, manual function verification for domain relevance, cross-validated docstring annotation by 5 annotators, and automated dependency extraction."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 6 'Threats to Validity' provides a substantive discussion of internal and external threats spanning multiple paragraphs."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The threats section identifies study-specific concerns: limited domain coverage (12 of many domains), computational constraints preventing testing all LLMs, CodeBLEU not capturing execution correctness, GitHub data potentially in training sets (mitigated by docstring rewriting), model generation randomness (mitigated by 3 runs), and prompt template selection effects."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The paper describes limitations but does not explicitly state what the results do NOT show. There is no explicit bounding of claims — e.g., no statement that results apply only to the tested 10 models and 12 domains and should not be extrapolated to other LLMs or domains."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "The replication package at GitHub includes 'the benchmark, source code, and experimental results,' making the raw benchmark data and model outputs available for verification."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 3.2 describes the full data collection process: crawling posts from 7 tech websites (2020-2024), LDA topic mining, project selection by star count on GitHub, manual function sampling with domain-relevance verification, and cross-validated docstring annotation."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": true,
    202         "answer": false,
    203         "justification": "Five annotators are described as having 'more than five years of programming experience' but no details are provided about how they were recruited, their specific backgrounds, or whether their selection could introduce bias in the annotation process."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The pipeline from domain identification to final benchmark is well-documented in Section 3.2 and Figure 2, with clear stages: domain mining → project selection → function sampling → docstring annotation → dependency extraction, resulting in 2,400 verified tasks (200 per domain)."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding sources, grants, or acknowledgments are mentioned anywhere in the paper, despite three co-authors being from Huawei Cloud Computing Technologies."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly listed: Sun Yat-sen University, Huawei Cloud Computing Technologies Co., Ltd, and Chongqing University."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "Funding is not disclosed, so independence cannot be assessed. Three co-authors are from Huawei Cloud, which has commercial interests in LLM development, though the paper does not evaluate Huawei products specifically."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interest statement is included in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The paper does not state the training data cutoff dates for any of the 10 evaluated models, despite acknowledging that 'DomainCodeBench contains data from GitHub before 2023, which may have already been included in the training datasets.'"
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": true,
    242         "justification": "Section 6 (External Threats) explicitly discusses that benchmark data from GitHub before 2023 'may have already been included in the training datasets of the evaluated models' and describes mitigation via manual docstring rewriting."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": true,
    247         "justification": "The paper acknowledges that benchmark code originates from pre-2023 GitHub projects that may be in training data, and mitigates by providing 'each task instance in DomainCodeBench with a manually annotated docstring, avoiding the use of docstrings from online code repositories to prevent LLMs from directly associating training data with the programming tasks.'"
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in the experimental evaluation. Annotators are part of benchmark construction, not study subjects."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in the experimental evaluation."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in the experimental evaluation."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in the experimental evaluation."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in the experimental evaluation."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in the experimental evaluation."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in the experimental evaluation."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "No inference costs, API costs, or latency figures are reported despite running GPT-4 API calls on 2,400 tasks across multiple context configurations and repeating each experiment 3 times."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Hardware is described (128 CPUs, 8 NVIDIA A800 GPUs) but no total compute budget (GPU hours, API spend, wall-clock time) is reported."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "The paper conducts 3 repeated experiments and averages results but reports no variance, standard deviation, or any seed sensitivity analysis across these runs."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": true,
    308         "justification": "Section 4.2 states: 'the experimental results presented in this paper are obtained by conducting three repeated experiments and averaging the results.'"
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "The paper acknowledges testing 'several' prompt templates and selecting the best-performing one, but does not report how many were tested, what alternatives were tried, or the selection methodology."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "The threats section admits 'We ultimately choose the prompt template that yields the best results after testing several options' without describing the selection process, whether a validation set was used, or what alternatives were tested."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "The paper makes dozens of comparisons across 10 models and 12 domains but uses no statistical tests at all, let alone multiple comparison corrections."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "No discussion of author-evaluation bias. The authors constructed the benchmark, selected the tasks, wrote the docstrings, chose which models to test, and selected the best prompt template — all without acknowledging potential self-comparison biases."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "The paper notes that 'the larger parameter scale of LLMs does not necessarily correlate with better code generation performance' but does not analyze performance as a function of compute budget or normalize results by model cost/size."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": true,
    338         "justification": "The paper explicitly acknowledges that CodeBLEU 'does not fully capture whether the generated code can execute correctly or pass the corresponding test cases' and discusses why execution-based metrics were impractical, demonstrating awareness of construct validity limitations."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "No scaffolding is used; all models are evaluated via direct code generation from prompts."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": true,
    350         "justification": "Section 6 discusses that 'DomainCodeBench contains data from GitHub before 2023, which may have already been included in the training datasets of the evaluated models' and mitigates by manually rewriting docstrings."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "The RQ3 experiments intentionally provide oracle dependency and API information extracted from the ground truth, but the paper does not discuss whether the base evaluation setup (RQ1) might inadvertently leak information through function signatures or import statements."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of whether tasks sampled from the same repository share dependencies or correlated solutions, or whether the 200-per-domain sampling introduces structural dependencies."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": true,
    365         "justification": "Manual docstring rewriting is used as a concrete leakage prevention method: 'we manually rewrite the docstrings for every programming problem in DomainCodeBench... to prevent LLMs from directly associating training data with the programming tasks.'"
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "Top general-domain models do not consistently excel in specific application domains (performance decoupling).",
    372       "evidence": "Table 4 shows DeepSeekCoder-33B (ranked 4th on HumanEval at 50.6) achieves top performance in most domains, while GPT-4 (ranked 1st on HumanEval at 83.5) underperforms on domain-specific tasks. CodeLLaMa-34B surpasses DeepSeekCoder-33B on HumanEval but trails in all application domains except Mobile.",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "LLMs fail at domain-specific code generation primarily due to insufficient project understanding (FOC 0.97), limited domain library knowledge (FOC 0.77), and unfamiliarity with domain algorithms (FOC 0.66).",
    377       "evidence": "Section 5.2 presents a failure taxonomy (Table 6) based on manual error analysis by 5 annotators, with detailed case studies in Figure 4 for each failure category.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "Augmenting prompts with domain-specific knowledge improves performance by around 38.17% on average.",
    382       "evidence": "Table 7 shows that combining API + Similar + Dependency context yields mean CodeBLEU improvements of 10.70-17.64 across all models (Section 5.3).",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Similar context (retrieval-based) yields more substantial performance improvement than oracle dependency context or oracle API context.",
    387       "evidence": "Table 7 shows mean improvements of 7.24-12.65 for similar context alone vs. 3.87-6.08 for dependency context and 1.85-4.18 for API context across all models.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "LLMs generally perform well in blockchain but show weaker performance in web development and enterprise applications.",
    392       "evidence": "Table 4 shows blockchain scores consistently highest across models (e.g., DeepSeekCoder-6.7B: 49.04 blockchain vs. 34.93 web), with web and enterprise application scores among the lowest.",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "LLMs with similar general-domain performance can exhibit significant divergences in domain-specific performance.",
    397       "evidence": "Figure 3 shows DeepSeekCoder-6.7B and StarCoder2-15B (both 45.1 on HumanEval) diverge substantially across domains, with different strengths in different application areas.",
    398       "supported": "moderate"
    399     }
    400   ],
    401   "red_flags": [
    402     {
    403       "flag": "Single evaluation metric (CodeBLEU only)",
    404       "detail": "The entire evaluation relies solely on CodeBLEU, a textual/semantic similarity metric, without any execution-based evaluation. CodeBLEU may not capture functional correctness, especially for domain-specific code where API calling conventions matter more than textual similarity. The authors acknowledge this but proceed with no alternative metric."
    405     },
    406     {
    407       "flag": "No variance reported across runs",
    408       "detail": "Despite conducting 3 repeated experiments, the paper reports only averaged results with no standard deviation, confidence intervals, or any measure of result stability. This makes it impossible to assess whether reported differences between models are meaningful or within noise."
    409     },
    410     {
    411       "flag": "Prompt selection on test data",
    412       "detail": "The authors admit selecting 'the prompt template that yields the best results after testing several options,' which means the reported results reflect an optimized prompt without a separate validation set. This inflates reported performance for all models."
    413     },
    414     {
    415       "flag": "Missing contemporary models",
    416       "detail": "For a December 2024 paper, the evaluation misses widely available models including GPT-4o, Claude 3.5, Llama 3.1, and DeepSeek-Coder-V2. Only one closed-source model (GPT-4) is tested, weakening claims about 'mainstream LLMs' generally."
    417     },
    418     {
    419       "flag": "Oracle contexts inflate RQ3 improvement claims",
    420       "detail": "The 'Dependency Context' and 'API Context' experiments in RQ3 use oracle information extracted from the ground truth, making the reported improvements upper bounds rather than achievable real-world gains. The ~38.17% improvement claim in the abstract does not clearly distinguish oracle vs. realistic settings."
    421     },
    422     {
    423       "flag": "No funding disclosure with industry co-authors",
    424       "detail": "Three of seven co-authors are from Huawei Cloud Computing Technologies, yet no funding sources, grants, or competing interests are disclosed."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "Evaluating large language models trained on code",
    430       "authors": ["Mark Chen"],
    431       "year": 2021,
    432       "arxiv_id": "2107.03374",
    433       "relevance": "Introduces HumanEval, the foundational code generation benchmark that DomainCodeBench's analysis directly contrasts against."
    434     },
    435     {
    436       "title": "Program synthesis with large language models",
    437       "authors": ["Jacob Austin"],
    438       "year": 2021,
    439       "arxiv_id": "2108.07732",
    440       "relevance": "Introduces MBPP, a widely-used general-domain code generation benchmark for LLMs."
    441     },
    442     {
    443       "title": "EvoCodeBench: An Evolving Code Generation Benchmark with Domain-Specific Evaluations",
    444       "authors": ["Jia Li"],
    445       "year": 2024,
    446       "relevance": "Contemporary evolving code generation benchmark with domain-specific evaluation, directly related to benchmark design in this space."
    447     },
    448     {
    449       "title": "CoderEval: A benchmark of pragmatic code generation with generative pre-trained models",
    450       "authors": ["Hao Yu"],
    451       "year": 2024,
    452       "relevance": "Repository-level code generation benchmark evaluating LLMs on practical coding tasks."
    453     },
    454     {
    455       "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
    456       "authors": ["Terry Yue Zhuo"],
    457       "year": 2024,
    458       "arxiv_id": "2406.15877",
    459       "relevance": "Code generation benchmark with diverse function calls across 139 libraries; closely related to domain-aware evaluation."
    460     },
    461     {
    462       "title": "DOMAINEVAL: An Auto-Constructed Benchmark for Multi-Domain Code Generation",
    463       "authors": ["Qiming Zhu"],
    464       "year": 2024,
    465       "arxiv_id": "2408.13204",
    466       "relevance": "Most closely related work — multi-domain code generation benchmark focused on programming domains rather than application domains."
    467     },
    468     {
    469       "title": "Top Leaderboard Ranking = Top Coding Proficiency, Always? EvoEval: Evolving Coding Benchmarks via LLM",
    470       "authors": ["Chunqiu Steven Xia"],
    471       "year": 2024,
    472       "relevance": "Questions whether leaderboard rankings reflect true coding proficiency, directly parallel to this paper's research question."
    473     },
    474     {
    475       "title": "RepoCoder: Repository-level code completion through iterative retrieval and generation",
    476       "authors": ["Fengji Zhang"],
    477       "year": 2023,
    478       "arxiv_id": "2303.12570",
    479       "relevance": "Repository-level code generation method using retrieval augmentation, the basis for the similar-context approach tested in RQ3."
    480     },
    481     {
    482       "title": "Code llama: Open foundation models for code",
    483       "authors": ["Baptiste Roziere"],
    484       "year": 2023,
    485       "arxiv_id": "2308.12950",
    486       "relevance": "One of the evaluated open-source LLM families for code generation."
    487     },
    488     {
    489       "title": "DeepSeek-Coder-V2: Breaking the Barrier of Closed-Source Models in Code Intelligence",
    490       "authors": ["Qihao Zhu"],
    491       "year": 2024,
    492       "arxiv_id": "2406.11931",
    493       "relevance": "Successor to the DeepSeekCoder models that this paper evaluates; itself a notably absent contemporary baseline."
    494     },
    495     {
    496       "title": "On the effectiveness of large language models in domain-specific code generation",
    497       "authors": ["Xiaodong Gu"],
    498       "year": 2024,
    499       "relevance": "Directly studies LLM effectiveness in domain-specific code generation, a core theme of DomainCodeBench."
    500     },
    501     {
    502       "title": "CodeBLEU: a Method for Automatic Evaluation of Code Synthesis",
    503       "authors": ["Shuo Ren"],
    504       "year": 2020,
    505       "arxiv_id": "2009.10297",
    506       "relevance": "The sole evaluation metric used in this paper for measuring code generation quality."
    507     }
    508   ],
    509   "engagement_factors": {
    510     "practical_relevance": {
    511       "score": 2,
    512       "justification": "Practitioners can use the benchmark to evaluate which LLM is best for their specific application domain, with the benchmark publicly released."
    513     },
    514     "surprise_contrarian": {
    515       "score": 2,
    516       "justification": "Challenges the assumption that HumanEval rankings predict real-world domain performance — GPT-4 (#1 general) loses to DeepSeekCoder-33B (#4) in most domains."
    517     },
    518     "fear_safety": {
    519       "score": 0,
    520       "justification": "No safety, security, or AI risk concerns raised."
    521     },
    522     "drama_conflict": {
    523       "score": 1,
    524       "justification": "Mild implicit criticism of general-domain benchmarks as insufficient, but no direct controversy or confrontational framing."
    525     },
    526     "demo_ability": {
    527       "score": 2,
    528       "justification": "Benchmark and code are released on GitHub; researchers can evaluate their own models, though it requires significant compute to run."
    529     },
    530     "brand_recognition": {
    531       "score": 1,
    532       "justification": "University + Huawei Cloud authors; evaluates GPT-4 and well-known open-source models but is not from a high-profile AI lab."
    533     }
    534   }
    535 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs