scan.json (26566B)
1 { 2 "paper": { 3 "title": "ProjDevBench: Benchmarking AI Coding Agents on End-to-End Project Development", 4 "authors": ["Pengrui Lu", "Shiqi Zhang", "Yunzhong Hou", "Lyumanshan Ye", "Chaoyi Huang", "Zixi Chen", "Ji Zeng", "Hantao Jiang", "Pengfei Liu", "Yiwei Wang", "Ming-Hsuan Yang"], 5 "year": 2026, 6 "venue": "arXiv", 7 "arxiv_id": "2602.01655" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "ProjDevBench evaluates 6 coding agents across multiple LLM backends on 20 end-to-end C++ project construction tasks, finding an overall acceptance rate of only 27.38%. Codex+GPT-5 achieves the best overall score (77.85%), with performance gaps widening on from-scratch tasks. Key failure modes include specification misalignment (42% wrong answers), time complexity optimization failures (14% TLE), and resource management limitations. Extended interaction correlates negatively with performance (Spearman ρ = −0.734 for tokens vs. score).", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": true, 18 "justification": "The paper provides a GitHub URL: https://github.com/zsworld6/projdevbench in the abstract." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The benchmark problems are available via the GitHub repository mentioned in the abstract." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": true, 28 "justification": "Appendix E provides a detailed Docker base image configuration (Dockerfile) specifying exact packages: gcc-13, g++-13, cmake, Python 3.12, Node.js 20, and all agent CLI tools." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": true, 33 "justification": "Appendix E.2 documents the full evaluation pipeline including configuration loading, Docker container initialization, workspace setup, agent execution, OJ submission, and code review steps." 
34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "All results in Tables 2–6 report only point estimates with no confidence intervals or error bars." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": true, 45 "justification": "Appendix H mentions a paired t-test: 'Pairwise comparison of top agents (Cursor vs. Claude Code) shows statistically significant difference (p < 0.05, paired t-test).' However, this is only for one comparison." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "The paper reports percentage scores with baselines for comparison (e.g., Codex+GPT-5 at 77.85 vs. Augment+GPT-5 at 72.35), and Spearman correlations with specific ρ values and p-values in Table 4." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "The benchmark has only 20 problems with no justification for why 20 is sufficient. No power analysis or discussion of statistical adequacy of this sample size for the claims made." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "Single evaluation pass per agent-model configuration. No variance across runs reported. Appendix H reports std of 32.4 across problems, but this is cross-problem variation, not cross-run variance." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "Six coding agents across multiple LLM backends are compared against each other, and Table 1 compares ProjDevBench against prior benchmarks (HumanEval, SWE-bench, DevEval, etc.)." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "The evaluated agents and models are highly contemporary: GPT-5, Claude Sonnet 4.5, Gemini 3 Pro Preview, plus recent open-source models. All are 2025-2026 era." 
73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": false, 77 "justification": "No ablation study of the benchmark design (e.g., effect of code review weight, different scoring schemes). The 80/20 execution/review weighting is asserted without ablation." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "The paper uses three metrics: Execution Score, Code Review Score, and Final Weighted Score, reported separately in Table 2." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": true, 87 "justification": "Section 5.4 validates the LLM-based code review against human expert annotations, with multiple annotators independently reviewing submissions. Figure 5 shows Pearson r=0.709 and Cohen's κ=0.710." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": false, 92 "justification": "The paper does not discuss held-out test sets. All 20 problems are used for evaluation, and the OJ test suites serve as the evaluation mechanism. No train/dev/test split is mentioned." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Table 2 breaks results down by Easy vs. Hard subsets. Table 3 provides per-failure-type distribution. Table 6 reports per-problem scores. Figure 4 shows category distribution." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "Section 5.1 provides extensive failure analysis across five categories: specification misalignment, edge case handling, time complexity, resource management, and code engineering gaps, with specific examples from individual problems." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper extensively reports where agents fail: only 27.38% acceptance rate, agents struggle with complex tasks, extended interaction correlates negatively with performance." 
108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The abstract claims 27.38% acceptance rate (supported by Table 3), agents struggle with complex system design (supported by Section 5.1), and the benchmark evaluates system architecture, correctness, and refinement (supported by the evaluation protocol)." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": false, 119 "justification": "The paper makes implicit causal claims such as 'extended interaction alone does not guarantee successful task completion' and attributes performance differences to framework design vs. model capability without controlling for confounds. The correlation analysis (Table 4) is acknowledged as correlational, but language like 'harder problems compel agents' implies causation." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": true, 124 "justification": "The Limitations section explicitly states: 'tasks focus primarily on C++, and it remains unclear whether observed agent behaviors generalize to other languages.' The scope is bounded to the 20 tasks and tested agents." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": false, 129 "justification": "The paper does not discuss alternative explanations for performance differences. For example, agent framework differences (prompting strategies, tool usage) are not controlled for, and no discussion of whether the OJ platform characteristics bias results." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper is relatively precise: it measures execution scores and code review compliance, and frames these as measures of 'end-to-end project construction capability' — which is what the benchmark directly tests. The metrics match the claims without overframing." 
135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": false, 141 "justification": "Models are specified by marketing names only: 'GPT-5', 'Sonnet-4.5', 'Gemini 3 Pro Preview', 'GLM-4.6', 'Kimi-k2-0905-Preview', 'DeepSeek-V3.2-Exp'. No API versions or snapshot dates are provided except Kimi which includes a date-like suffix." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": true, 146 "justification": "Appendix F provides the full prompt text used for both single-problem and multiple-problem evaluation scenarios." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": false, 151 "justification": "No mention of temperature, top-p, max tokens, or any sampling hyperparameters for the LLM API calls used by the agents." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "The paper evaluates third-party coding agents (Cursor, Copilot, Claude Code, etc.) as black boxes via their CLIs. The authors cannot describe the internal scaffolding of these proprietary tools." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section 3.2 and Figure 3 document the three-stage problem filtering pipeline: ~2800 initial → ~100 after scope filtering → 20 after quality filtering, with criteria described at each stage." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "A dedicated 'Limitations and Future Work' section appears after Section 6, discussing benchmark scale, language coverage, and evaluation mode limitations." 
169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "The limitations section raises specific concerns: only 20 tasks (scaling difficulty), C++ only (language generalization unclear), fully autonomous only (excludes human-in-the-loop). These are specific to this study." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": true, 178 "justification": "The paper explicitly states what is not covered: other programming languages, human-in-the-loop workflows, and larger task sets. The Limitations section bounds the claims." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "While the benchmark problems are released, the raw agent execution logs, submission histories, and OJ test outputs are not made available for independent verification." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section 3.2 describes the three-stage collection pipeline from a university OJ platform, with clear filtering criteria at each stage." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants were recruited. The study evaluates AI agents on curated problems. Student performance statistics in Table 5 are historical data from the OJ platform." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "The pipeline from ~2800 problems → ~100 → 20 is documented in Section 3.2 with filtering criteria. The evaluation pipeline is documented in Appendix E.2." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding or acknowledgments section is present in the paper." 
208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are listed: UC Merced, Shanghai Jiao Tong University, Shanghai Innovation Institute, Beijing Institute of Technology. None of these are the evaluated tool vendors." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding information is disclosed, so independence cannot be assessed." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial interests statement is present in the paper." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "The paper does not state the training data cutoff dates for any of the evaluated models (GPT-5, Sonnet 4.5, Gemini 3 Pro, etc.)." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": false, 234 "justification": "No discussion of whether the OJ problems or their solutions may have appeared in the models' training data. The problems come from a university OJ platform that may have solutions online." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "The benchmark problems are drawn from an existing OJ platform with ~2800 problems. No discussion of whether these problems and their solutions were available online before the models' training cutoffs." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants. The study evaluates AI coding agents on benchmark problems." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in the study." 
252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in the study." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in the study." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in the study." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in the study." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in the study." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": true, 283 "justification": "The paper reports token consumption: agents average 4.81M tokens per problem (Section 5.3), with per-problem token counts in Table 6. Time information is also provided (most complex tasks take up to two hours)." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": true, 288 "justification": "Appendix E.3 states resource limits: 8GB memory, 4 CPU cores, 6GB Node.js heap per container. Per-problem token budgets are in Table 6." 289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "Single evaluation pass per agent-model configuration. No multiple seeds or runs reported." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": true, 300 "justification": "Section 4.1 explicitly states: 'For each agent-model configuration, we run a single evaluation pass on every problem in the benchmark.'" 301 }, 302 "hyperparameter_search_budget": { 303 "applies": false, 304 "answer": false, 305 "justification": "The paper evaluates coding agents via their CLIs without hyperparameter tuning. No search budget applies." 
306 }, 307 "best_config_selection_justified": { 308 "applies": true, 309 "answer": true, 310 "justification": "Appendix C.3 states the final reported score is the maximum across all valid submissions within the submission limit, and this policy is transparently described." 311 }, 312 "multiple_comparison_correction": { 313 "applies": true, 314 "answer": false, 315 "justification": "Multiple agent-model comparisons are made across Table 2 but only one p-value is reported (Appendix H) with no multiple comparison correction." 316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "The authors developed the benchmark and evaluate all agents on it. No discussion of author-evaluation bias or whether the benchmark design may favor certain agent types." 321 }, 322 "compute_budget_vs_performance": { 323 "applies": true, 324 "answer": true, 325 "justification": "Table 4 and Section 5.3 analyze the relationship between compute (tokens) and performance, finding a strong negative correlation (ρ = −0.734)." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": false, 330 "justification": "The paper does not discuss whether the 20 OJ-derived C++ problems actually measure 'end-to-end project development capability' as claimed. No analysis of construct validity or comparison with alternative definitions of project-level development." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": false, 334 "answer": false, 335 "justification": "The paper evaluates bundled tools (Cursor, Copilot, Claude Code) as products. The scaffold IS what is being tested, so this confound does not apply." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": true, 341 "answer": false, 342 "justification": "Problems come from an existing university OJ platform. No discussion of whether solutions to these problems existed online before the models' training cutoffs." 
343 }, 344 "feature_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of whether the evaluation setup leaks information. Agents have internet access during evaluation (Appendix E.3: 'Containers have full internet access'), which could allow looking up solutions." 348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether the 20 problems share structural similarities that could inflate or deflate performance estimates." 353 }, 354 "leakage_detection_method": { 355 "applies": true, 356 "answer": false, 357 "justification": "No leakage detection or prevention methods are described despite using problems from a public OJ platform and giving agents internet access." 358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "Overall acceptance rate across all agents is 27.38%", 364 "evidence": "Table 3 shows 484 out of 1,768 total submissions were accepted (27.38%).", 365 "supported": "strong" 366 }, 367 { 368 "claim": "Codex+GPT-5 achieves the best overall performance at 77.85% final weighted score", 369 "evidence": "Table 2 shows Codex+GPT-5 at 77.85 overall final score, ahead of all other configurations.", 370 "supported": "moderate" 371 }, 372 { 373 "claim": "Extended interaction correlates negatively with performance", 374 "evidence": "Table 4 reports Spearman ρ = −0.734 (p = 0.0002) for tokens vs. score and ρ = −0.668 (p = 0.0013) for turns vs. 
score, across 20 problems in Claude Code.", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "42% of submissions receive a Wrong Answer verdict and 14% exceed the time limit", 379 "evidence": "Table 3 shows Wrong Answer at 41.86% and Time Limit Exceeded at 13.91% of submissions.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "LLM-based code review aligns closely with human judgment (accuracy 0.852, Cohen's κ 0.710)", 384 "evidence": "Section 5.4 and Figure 5 report human validation with multiple annotators showing Pearson r=0.709 for continuous assessment and accuracy 0.852, κ=0.710 for binary rule verification.", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "Performance gaps widen on from-scratch construction tasks compared to project-completion tasks", 389 "evidence": "Table 2 shows several agents dropping substantially from Easy to Hard (e.g., GitHub Copilot+Sonnet-4.5 from 71.10 to 36.63 execution, Gemini CLI from 74.57 to 35.53).", 390 "supported": "strong" 391 } 392 ], 393 "red_flags": [ 394 { 395 "flag": "Single run per configuration", 396 "detail": "Each agent-model pair is evaluated once per problem with no repeated runs. Given the stochastic nature of LLM outputs, results could differ substantially across runs, but no variance is reported." 397 }, 398 { 399 "flag": "Very small benchmark (N=20)", 400 "detail": "Only 20 problems, with results aggregated across them. Statistical conclusions drawn from 20 data points have limited power, and individual problem characteristics could drive aggregate results." 401 }, 402 { 403 "flag": "Contamination risk from public OJ platform", 404 "detail": "Problems are sourced from a university OJ platform with ~2800 problems. Solutions may be publicly available online, and agents have full internet access during evaluation. No contamination analysis is performed." 
405 }, 406 { 407 "flag": "Internet access during evaluation", 408 "detail": "Appendix E.3 states 'Containers have full internet access for network operations.' Agents could potentially look up solutions to OJ problems during evaluation, confounding the measurement of coding capability." 409 }, 410 { 411 "flag": "C++ only", 412 "detail": "All 20 problems are C++ tasks, yet the paper draws conclusions about 'end-to-end project development' capability broadly. Agent performance in C++ may not reflect capability in more commonly used languages like Python or JavaScript." 413 }, 414 { 415 "flag": "Uncontrolled agent framework confound", 416 "detail": "Different agents use different scaffolding, tool access, and interaction strategies. Comparing Codex+GPT-5 vs. Claude Code+GPT-5 conflates framework effects with other differences. The interaction analysis (Table 4, Table 6) is done only on Claude Code, limiting generalizability." 417 } 418 ], 419 "cited_papers": [ 420 { 421 "title": "Evaluating large language models trained on code", 422 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 423 "year": 2021, 424 "relevance": "Introduced HumanEval, the foundational function-level code generation benchmark that ProjDevBench extends beyond." 425 }, 426 { 427 "title": "SWE-bench: Can language models resolve real-world GitHub issues?", 428 "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"], 429 "year": 2024, 430 "relevance": "Key issue-resolution benchmark that ProjDevBench positions against as insufficient for end-to-end development evaluation." 431 }, 432 { 433 "title": "OpenHands: An open platform for AI software developers as generalist agents", 434 "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"], 435 "year": 2025, 436 "arxiv_id": "2407.16741", 437 "relevance": "Open-source autonomous coding agent framework representative of the agents ProjDevBench evaluates." 
438 }, 439 { 440 "title": "SWE-agent: Agent-computer interfaces enable automated software engineering", 441 "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig"], 442 "year": 2024, 443 "arxiv_id": "2405.15793", 444 "relevance": "Explores agent-computer interfaces for SE, directly relevant to understanding how agents interact with development environments." 445 }, 446 { 447 "title": "Agentless: Demystifying LLM-based software engineering agents", 448 "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"], 449 "year": 2024, 450 "relevance": "Contrasts agentic vs. non-agentic approaches to SE, relevant baseline approach." 451 }, 452 { 453 "title": "Self-refine: iterative refinement with self-feedback", 454 "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"], 455 "year": 2023, 456 "relevance": "Iterative self-refinement paradigm that ProjDevBench's multi-submission evaluation tests." 457 }, 458 { 459 "title": "Competition-level code generation with AlphaCode", 460 "authors": ["Yujia Li", "David Choi", "Junyoung Chung"], 461 "year": 2022, 462 "relevance": "Competition-level code generation benchmark, predecessor to project-level evaluation." 463 }, 464 { 465 "title": "DevEval: A manually-annotated code generation benchmark aligned with real-world code repositories", 466 "authors": ["Juyong Li", "Guanjun Li", "Yongmin Zhao"], 467 "year": 2024, 468 "arxiv_id": "2405.19856", 469 "relevance": "Repository-aligned benchmark with staged development, directly compared against in Table 1." 470 }, 471 { 472 "title": "Benchmarking and studying the LLM-based agent system in end-to-end software development", 473 "authors": ["Zijie Zeng", "Yifan Li", "Ruiqi Xie"], 474 "year": 2025, 475 "arxiv_id": "2511.04064", 476 "relevance": "E2EDevBench, a concurrent end-to-end SE benchmark compared against in Table 1." 
477 }, 478 { 479 "title": "NL2Repo-Bench: Towards long-horizon repository generation evaluation of coding agents", 480 "authors": ["Jiawei Ding", "Shuai Long", "Chenhao Pu"], 481 "year": 2025, 482 "arxiv_id": "2512.12730", 483 "relevance": "Repository generation benchmark from NL requirements, concurrent work compared in Table 1." 484 }, 485 { 486 "title": "ReAct: Synergizing reasoning and acting in language models", 487 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"], 488 "year": 2023, 489 "relevance": "Foundational reasoning-acting paradigm used in modern coding agents." 490 }, 491 { 492 "title": "Program synthesis with large language models", 493 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"], 494 "year": 2021, 495 "arxiv_id": "2108.07732", 496 "relevance": "Introduced MBPP benchmark for function-level code generation." 497 } 498 ] 499 }