ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24248B)


      1 {
      2   "paper": {
      3     "title": "Holistic Evaluation of State-of-the-Art LLMs for Code Generation",
      4     "authors": ["Le Zhang", "Suresh Kothari"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2512.18131"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "DeepSeek-R1 and GPT-4.1 consistently outperform other models across 944 LeetCode problems in 5 languages on Pass@1, compile-time errors, runtime errors, functional failures, and algorithmic suboptimality. Llama-3.3 performs worst across nearly all metrics. Adding an optimization prompt hint reduces algorithmic suboptimality for most models, with DeepSeek-R1 showing strongest responsiveness. Python and JavaScript produce zero compile-time errors across all six models, while Go has the most.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "Section 3.7 provides a Figshare link: https://figshare.com/s/26448e92798aab34e407 containing datasets, code, and evaluation results."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The replication package on Figshare includes datasets and evaluation results (Section 3.7). The underlying LeetCode problems are publicly available."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment specifications, requirements.txt, or dependency details are mentioned in the paper."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided. The paper mentions a replication package but does not describe how to run the experiments."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "All results are reported as point estimates (e.g., Pass@1 percentages in Table 4) with no confidence intervals or error bars."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper claims models 'outperform' others based solely on comparing raw percentages. No statistical significance tests are applied."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Tables 4-6 report absolute performance metrics with baselines for context. For example, Pass@1 ranges from 51.38% (Llama-3.3 Go) to 89.30% (DeepSeek-R1 Python), which provides magnitude context for comparisons."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper uses 944 problems (large dataset) and 202 (small dataset) with no justification for why these sizes are adequate or how they were determined."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Single-run results only. Each problem is submitted once per model per language (Section 3.5). No variance across runs is reported."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Six models are compared against each other across multiple metrics, serving as mutual baselines."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "All six models are from 2024-2025: DeepSeek-R1, DeepSeek-V3, GPT-4.1, Claude-3.7, Qwen2.5-Coder, Llama-3.3. These are state-of-the-art at time of writing."
     73       },
     74       "ablation_study": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "This is a benchmark comparison study, not a system with components to ablate."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Five metrics are used: Pass@1, Compile-time Error, Runtime Error, Functional Failure, and Algorithmic Suboptimality (Sections 3.6.1-3.6.5)."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No human evaluation of generated code quality. All evaluation is automated through LeetCode's test suite."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "LeetCode's built-in test suites serve as the evaluation, and these are not used for any model tuning in this study. The models are evaluated as-is via API."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down by model, programming language, and difficulty level (Tables 2, 4, 5). The large vs. small dataset also provides category-level analysis."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper discusses failure scenarios including syntax errors, logical flaws, and suboptimal algorithms. Section 4 analyzes specific error types (CE, RE, FF, AS) with language-specific patterns."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Llama-3.3's poor performance is consistently reported. Section 4.6 notes Claude-3.7 shows minimal/no improvement from optimization hints, and some models show higher AS with hints."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims DeepSeek-R1 and GPT-4.1 consistently outperform others, which is supported by Tables 4-5 across all metrics."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper makes causal claims about prompt engineering improving results (Section 4.6, 'the inclusion of the optimization hint leads to a reduction in AS values') based on a single before/after comparison with no controls for stochastic variation. Single-run design means differences could be noise."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "Title says 'Holistic Evaluation' and abstract claims 'comprehensive empirical evaluation' but tests only LeetCode algorithmic problems (greedy, sorting, binary search, trees). Does not bound claims to this narrow domain — real-world software development involves far more than algorithmic puzzles."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "No discussion of alternative explanations for performance differences. Model size, training data composition, reasoning capabilities, and other confounds are not systematically analyzed."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper frames LeetCode Pass@1 as evidence for 'reliable code generation in real-world software development tasks' (abstract) without acknowledging that LeetCode algorithmic puzzles are a narrow proxy for real-world coding capability."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Table 1 specifies exact model names: DeepSeek-R1, DeepSeek-V3-0324, Qwen2.5-Coder-32B-Instruct, Llama-3.3-70B-Instruct, GPT-4.1, Claude-3.7-Sonnet with sizes and release years."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Figure 1 provides the complete prompt structure with an actual example including role specification, problem description, constraints, code snippet, test cases, and additional instructions."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section 3.3 reports temperature=0.1 and top-p=0.95 for all models, with justification for these choices."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. Models are called directly via API with single-shot prompts."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 3.2 describes the dataset construction: 944 manually picked LeetCode problems focusing on four topics, with 202 selected for complexity sensitivity. Problem distribution by difficulty and test case count is documented in Tables 2-3."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "No dedicated limitations section. The paper has Conclusions, Recommendations, and Future Work sections but no substantive discussion of limitations."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats-to-validity discussion. The contamination concern in Section 3.2 is the only acknowledgment of a methodological limitation."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what the results do NOT show. Future Work (Section 6) mentions areas not covered but does not frame them as scope boundaries of the current findings."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section 3.7 states all artifacts including datasets, code, and evaluation results are available at a Figshare link."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 3.2 describes data collection: 944 manually picked LeetCode problems covering greedy algorithms, sorting, binary search, and tree-based problems, with difficulty distribution."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. Data source is LeetCode problems (a standard public benchmark)."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 3.5 describes the pipeline: LLMs generate code → submitted via custom LeetCode API → one submission per model per language → results collected in JSON format."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding information or acknowledgments section is present in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Authors are from the Department of Computer Science, Iowa State University. No affiliation with any of the evaluated model providers."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "Funding is not disclosed, so independence cannot be assessed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial disclosure statement is present."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No training data cutoff dates are stated for any of the six models. This is critical since LeetCode problems and solutions are widely available online."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": true,
    234         "justification": "Section 3.2 acknowledges that 'some problems or their corresponding solutions may have appeared in the pretraining corpora' and describes mitigation: selecting recent problems, avoiding discussion content/solutions."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "While Section 3.2 acknowledges contamination risk and uses recent problems as mitigation, no concrete detection method is applied. The paper admits 'complete elimination of contamination cannot be guaranteed' but does not analyze which models' training data may include LeetCode solutions."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Table 6 reports API pricing per million tokens and total costs for each model. Section 4.7.2 discusses cost analysis including DeepSeek's off-peak pricing."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Table 6 reports total token usage (input/output in millions) and total USD costs for all models. Total costs range from $2.39 (Llama-3.3) to $108.65 (Claude-3.7)."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Single-run results only. Temperature is set to 0.1 but no seed sensitivity analysis is performed."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Section 3.5 explicitly states 'one submission per model and per programming language' — i.e., exactly 1 run per configuration."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "Temperature=0.1 and top-p=0.95 are chosen based on prior literature but no search was performed. Only one configuration is tested per model."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "Section 3.3 justifies parameter choices by citing prior work showing top-p=0.95 and temperature=0.1 are effective for one-shot code generation."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No statistical tests are performed at all, so no multiple comparison correction. Comparisons across 6 models × 5 languages × 5 metrics are made without any statistical framework."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "The authors are not evaluating their own system. They are comparing third-party LLMs."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "Section 4.7 analyzes token usage and cost across models, noting DeepSeek-R1 uses substantially more tokens. Table 6 provides full cost breakdown alongside performance results."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "No discussion of whether LeetCode algorithmic problems are a valid proxy for 'real-world software development tasks' as claimed. The paper uses LeetCode without questioning its construct validity for the broad claims being made."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding is used. All models receive the same direct API prompt."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": true,
    342         "justification": "Section 3.2 states they 'intentionally selected relatively recent problems that are less likely to be included in existing training corpora' as a mitigation strategy."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the prompt structure or LeetCode problem format provides hints not available in real-world usage."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether LeetCode problems share structural similarities that could inflate performance (e.g., similar problem templates, shared solution patterns)."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No concrete leakage detection method is applied. Mitigation is limited to selecting recent problems — no canary strings, membership inference, or n-gram overlap analysis."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "DeepSeek-R1 and GPT-4.1 consistently outperform other models in correctness, efficiency, and robustness across all programming languages.",
    364       "evidence": "Table 4 shows DeepSeek-R1 achieves highest Pass@1 (86.97-89.30%) and lowest FF/AS metrics across all 5 languages. GPT-4.1 is second-best on most metrics.",
    365       "supported": "moderate"
    366     },
    367     {
    368       "claim": "Llama-3.3 shows the lowest performance across most metrics.",
    369       "evidence": "Table 4: Llama-3.3 has Pass@1 of 51-55%, highest FF (30-34%), and highest AS (7.6-10.5%) across languages.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Python3 and JavaScript produce zero compile-time errors across all models.",
    374       "evidence": "Table 4 shows CE=0.00% for both Python3 and JavaScript across all six models.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Optimization prompt hints reduce algorithmic suboptimality for most models.",
    379       "evidence": "Table 5 and Figure 7 show AS reduction with hints for DeepSeek-R1, GPT-4.1, DeepSeek-V3, and Qwen2.5-Coder. Claude-3.7 shows minimal/no improvement.",
    380       "supported": "weak"
    381     },
    382     {
    383       "claim": "DeepSeek-R1 maintains moderate costs despite high token consumption due to pricing structure.",
    384       "evidence": "Table 6 shows DeepSeek-R1 uses 36.52M output tokens (vs 4-6M for others) but costs $20.52 at off-peak vs Claude-3.7's $108.65.",
    385       "supported": "strong"
    386     }
    387   ],
    388   "red_flags": [
    389     {
    390       "flag": "Single-run evaluation",
    391       "detail": "Each problem is submitted exactly once per model per language. With temperature=0.1, outputs are near-deterministic but not fully deterministic. No variance or reproducibility assessment is possible from single runs."
    392     },
    393     {
    394       "flag": "No statistical tests for claimed differences",
    395       "detail": "All comparative claims ('outperforms', 'consistently better') are based on raw percentage comparisons with no significance testing. Differences between some models may not be meaningful."
    396     },
    397     {
    398       "flag": "LeetCode contamination risk understated",
    399       "detail": "LeetCode problems and solutions are extensively available online. Models trained on web data almost certainly saw many of these problems. The mitigation of using 'recent problems' is not validated — no cutoff dates are provided for any model's training data."
    400     },
    401     {
    402       "flag": "Overly broad claims from narrow benchmark",
    403       "detail": "The title claims 'Holistic Evaluation' and recommendations target 'real-world software development tasks' but the benchmark covers only algorithmic problems (greedy, sorting, binary search, trees) — no API usage, debugging, refactoring, or other real-world software engineering tasks."
    404     },
    405     {
    406       "flag": "No limitations section",
    407       "detail": "The paper lacks any dedicated discussion of limitations or threats to validity, which is a significant methodological omission for an empirical evaluation."
    408     },
    409     {
    410       "flag": "Prompt engineering claim from uncontrolled comparison",
    411       "detail": "The optimization hint experiment (Section 4.6) compares two single runs without controlling for stochastic variation. Observed differences may be noise rather than signal."
    412     }
    413   ],
    414   "cited_papers": [
    415     {
    416       "title": "Evaluating large language models trained on code",
    417       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    418       "year": 2021,
    419       "arxiv_id": "2107.03374",
    420       "relevance": "Foundational Codex evaluation (HumanEval benchmark) that established LLM code generation evaluation methodology."
    421     },
    422     {
    423       "title": "Asleep at the keyboard? Assessing the security of GitHub Copilot's code contributions",
    424       "authors": ["H. Pearce", "B. Ahmad", "B. Tan", "B. Dolan-Gavitt", "R. Karri"],
    425       "year": 2025,
    426       "relevance": "Evaluates security vulnerabilities in LLM-generated code, directly relevant to code quality assessment."
    427     },
    428     {
    429       "title": "A performance study of LLM-generated code on LeetCode",
    430       "authors": ["T. Coignion", "C. Quinton", "R. Rouvoy"],
    431       "year": 2024,
    432       "relevance": "Prior LeetCode-based LLM evaluation study examining hyperparameter impact and prompt design."
    433     },
    434     {
    435       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    436       "authors": ["D. Guo", "D. Yang", "H. Zhang"],
    437       "year": 2025,
    438       "arxiv_id": "2501.12948",
    439       "relevance": "Technical report for one of the top-performing models evaluated in this study."
    440     },
    441     {
    442       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    443       "authors": ["J. Liu", "C.S. Xia", "Y. Wang", "L. Zhang"],
    444       "year": 2023,
    445       "arxiv_id": "2305.01210",
    446       "relevance": "Rigorously evaluates LLM code generation correctness, questioning optimistic early assessments."
    447     },
    448     {
    449       "title": "CodeGen: An open large language model for code with multi-turn program synthesis",
    450       "authors": ["E. Nijkamp", "B. Pang", "H. Hayashi"],
    451       "year": 2022,
    452       "arxiv_id": "2203.13474",
    453       "relevance": "Demonstrates structured prompting improving code generation robustness across languages."
    454     },
    455     {
    456       "title": "A survey on large language models for code generation",
    457       "authors": ["J. Jiang", "F. Wang", "J. Shen", "S. Kim", "S. Kim"],
    458       "year": 2024,
    459       "arxiv_id": "2406.00515",
    460       "relevance": "Comprehensive survey of LLM code generation covering architectures and evaluation approaches."
    461     },
    462     {
    463       "title": "LeetCodeDataset: A temporal dataset for robust evaluation and efficient training of code LLMs",
    464       "authors": ["Y. Xia", "W. Shen", "Y. Wang"],
    465       "year": 2025,
    466       "arxiv_id": "2504.14655",
    467       "relevance": "Addresses temporal aspects of LeetCode evaluation and contamination concerns for code LLMs."
    468     },
    469     {
    470       "title": "Proving the coding interview: A benchmark for formally verified code generation",
    471       "authors": ["Q. Dougherty", "R. Mehta"],
    472       "year": 2025,
    473       "relevance": "Evaluates LLMs on deductively verifiable code generation, complementary evaluation approach."
    474     }
    475   ]
    476 }

Impressum · Datenschutz