scan.json (26551B)
1 { 2 "paper": { 3 "title": "OmniCode: A Benchmark for Evaluating Software Development Agents", 4 "authors": [ 5 "Atharv Sonwane", 6 "Eng-Shen Tu", 7 "Wei-Chung Lu", 8 "Claas Beger", 9 "Carter Larsen", 10 "Debjit Dhar", 11 "Simon Alford", 12 "Rachel Chen", 13 "Ronit Pattanayak", 14 "Tuan Anh Dang", 15 "Guohao Chen", 16 "Gloria Geng", 17 "Kevin Ellis", 18 "Saikat Dutta" 19 ], 20 "year": 2026, 21 "venue": "arXiv", 22 "arxiv_id": "2602.02262" 23 }, 24 "scan_version": 2, 25 "active_modules": ["experimental_rigor", "data_leakage"], 26 "checklist": { 27 "artifacts": { 28 "code_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "The abstract states 'Code and data are available at https://github.com/seal-research/OmniCode.'" 32 }, 33 "data_released": { 34 "applies": true, 35 "answer": true, 36 "justification": "The benchmark data is released via the same GitHub repository as stated in the abstract." 37 }, 38 "environment_specified": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper mentions containerized environments for each instance but does not provide a requirements.txt, Dockerfile, or detailed environment specification for reproducing the evaluation experiments." 42 }, 43 "reproduction_instructions": { 44 "applies": true, 45 "answer": false, 46 "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README-level guidance for replicating the evaluation is included in the paper itself." 47 } 48 }, 49 "statistical_methodology": { 50 "confidence_intervals_or_error_bars": { 51 "applies": true, 52 "answer": false, 53 "justification": "All results in Tables 2, 3, 5, and 6 are point estimates with no confidence intervals or error bars." 54 }, 55 "significance_tests": { 56 "applies": true, 57 "answer": false, 58 "justification": "The paper compares model performance across tasks and languages but uses no statistical significance tests. 
Differences are reported as raw percentages." 59 }, 60 "effect_sizes_reported": { 61 "applies": true, 62 "answer": false, 63 "justification": "No effect sizes are reported. Pearson correlations are given in Table 4 but no effect size measures for the main performance comparisons." 64 }, 65 "sample_size_justified": { 66 "applies": true, 67 "answer": false, 68 "justification": "The benchmark has 1794 tasks total but no justification for why this size is sufficient, nor power analysis for the evaluation claims." 69 }, 70 "variance_reported": { 71 "applies": true, 72 "answer": false, 73 "justification": "No variance, standard deviation, or spread measures are reported. All results are single-run numbers with no indication of result stability." 74 } 75 }, 76 "evaluation_design": { 77 "baselines_included": { 78 "applies": true, 79 "answer": true, 80 "justification": "Two agent frameworks (SWE-Agent and Aider) are compared, and four LLMs are evaluated. Table 3 provides direct comparison between SWE-Agent and Aider." 81 }, 82 "baselines_contemporary": { 83 "applies": true, 84 "answer": true, 85 "justification": "The evaluated models (Gemini 2.5 Flash, GPT-5-mini, DeepSeek-V3.1, Qwen3-32B) and frameworks (SWE-Agent, Aider) are contemporary state-of-the-art systems." 86 }, 87 "ablation_study": { 88 "applies": true, 89 "answer": true, 90 "justification": "Section 4.5 ablates the impact of including bad patches in test generation evaluation (Figure 6), showing that gold-patch-only evaluation dramatically overestimates capability." 91 }, 92 "multiple_metrics": { 93 "applies": true, 94 "answer": true, 95 "justification": "Multiple metrics are used: pass rate for bug-fixing/test-gen/review-response, Style-Fixing Score (Equation 2), Fix Rate, Error Ratio (Table 5), complexity analysis, and Pearson correlations." 96 }, 97 "human_evaluation": { 98 "applies": true, 99 "answer": false, 100 "justification": "No human evaluation of agent outputs is performed. 
All evaluation is automated via test suites and style-checking tools. Manual validation was done for benchmark construction but not for evaluating agent performance." 101 }, 102 "held_out_test_set": { 103 "applies": true, 104 "answer": true, 105 "justification": "The benchmark instances are newly curated or synthetically crafted to avoid data leakage. The paper states tasks are 'synthetically crafted or recently curated to avoid data leakage issues.'" 106 }, 107 "per_category_breakdown": { 108 "applies": true, 109 "answer": true, 110 "justification": "Results are broken down by task type (4 categories), language (3 languages), and model (4 models) in Tables 2, 3, 5, and 6." 111 }, 112 "failure_cases_discussed": { 113 "applies": true, 114 "answer": true, 115 "justification": "Appendix B provides detailed failure mode analysis (Figure 10, 11) categorizing agent failures into incorrect fixes, localization failure, failed tool calls, no patch, invalid patches, and empty patches." 116 }, 117 "negative_results_reported": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper reports substantial negative results: agents struggle with test generation (max 25%), C++ tasks, and style fixing introduces many new errors (Error Ratio >2 for C++ and Java in Table 5)." 121 } 122 }, 123 "claims_and_evidence": { 124 "abstract_claims_supported": { 125 "applies": true, 126 "answer": true, 127 "justification": "Abstract claims about SWE-Agent achieving max 20.9% on Java Test Generation and poor C++/Java performance are supported by Table 2." 128 }, 129 "causal_claims_justified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper makes causal claims like 'Providing Reviews helps solve issues' (Section 4.2) and that patch complexity is a 'latent factor' in performance (Section 4.4). 
The review-response comparison is confounded by instance selection: review instances were sampled from higher-complexity gold patches, making the comparison non-controlled." 133 }, 134 "generalization_bounded": { 135 "applies": true, 136 "answer": false, 137 "justification": "The title claims to evaluate 'Software Development Agents' generally, but only two agent frameworks are tested with a limited set of models. The paper does not explicitly bound claims to the tested systems. Section 6 acknowledges scope limitations but the title and abstract are broader than the evidence." 138 }, 139 "alternative_explanations_discussed": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper does not substantively discuss alternative explanations for observed results. For example, the C++ performance gap could be due to training data distribution rather than task complexity, but this is not explored." 143 }, 144 "proxy_outcome_distinction": { 145 "applies": true, 146 "answer": true, 147 "justification": "The paper's claims match the granularity of its measurements. It reports pass rates on specific task types and does not frame test-suite pass rates as broader claims about 'software engineering capability.' The style-fixing score explicitly defines what is measured (Equation 2)." 148 } 149 }, 150 "setup_transparency": { 151 "model_versions_specified": { 152 "applies": true, 153 "answer": false, 154 "justification": "Models are listed as 'Gemini 2.5 Flash', 'GPT-5-mini', 'DeepSeek-V3.1', and 'Qwen3-32B' without snapshot dates or API versions. These are marketing names without precise version identifiers." 155 }, 156 "prompts_provided": { 157 "applies": true, 158 "answer": true, 159 "justification": "Full prompt text for all task types is provided in Appendix C: bug-fixing, test generation, style-fixing, review-fixing, review generation, and bad patch generation prompts." 
160 }, 161 "hyperparameters_reported": { 162 "applies": true, 163 "answer": false, 164 "justification": "Only the per-instance cost limit ($2.0 for SWE-Agent) and Aider timeout (20 minutes, 3 retries) are mentioned. No temperature, top-p, or other sampling parameters are reported." 165 }, 166 "scaffolding_described": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper uses SWE-Agent and Aider as agent frameworks but says only 'we use the default settings for SWE-Agent.' No description of the scaffolding internals (tool use, retry logic, feedback mechanisms) is provided." 170 }, 171 "data_preprocessing_documented": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 3.1 describes data collection (selecting PRs that resolve issues and introduce tests), manual filtering criteria, and Appendix F provides additional filtering details. Section 3.2 documents task-specific augmentation pipelines." 175 } 176 }, 177 "limitations_and_scope": { 178 "limitations_section_present": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 6 'Limitations and Future Work' provides substantive discussion of what the benchmark does not cover." 182 }, 183 "threats_to_validity_specific": { 184 "applies": true, 185 "answer": true, 186 "justification": "Section 6 discusses specific limitations: the benchmark doesn't cover config files, multi-language projects, profiling, natural language design conversations, sprint planning. It also identifies specific future extensions (security violations, code migration)." 
187 }, 188 "scope_boundaries_stated": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 6 explicitly states what is not tested: 'Real programmers deal with config files, multiple languages, profiling and optimizing, and engage in natural language conversations to make design decisions, plan sprints, and other forms of team strategy.'" 192 } 193 }, 194 "data_integrity": { 195 "raw_data_available": { 196 "applies": true, 197 "answer": true, 198 "justification": "The benchmark data and code are released at the GitHub repository, enabling independent verification of task instances and evaluation results." 199 }, 200 "data_collection_described": { 201 "applies": true, 202 "answer": true, 203 "justification": "Section 3.1 describes data collection from GitHub repositories and existing benchmarks (SWE-Bench-Verified, Multi-SWE-Bench), including PR selection criteria and manual verification." 204 }, 205 "recruitment_methods_described": { 206 "applies": false, 207 "answer": false, 208 "justification": "No human participants. Data is from public GitHub repositories and existing benchmarks." 209 }, 210 "data_pipeline_documented": { 211 "applies": true, 212 "answer": true, 213 "justification": "The pipeline from raw PRs to benchmark tasks is documented: Section 3.1 covers base instance collection, Sections 3.2.1-3.2.4 cover task-specific augmentation, and Appendix G details bad patch generation with counts (112 C++, 237 Java, 760 Python)." 214 } 215 }, 216 "conflicts_of_interest": { 217 "funding_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "Section 7 acknowledges Google (GCP credits) and Meta (LLM Evaluation Research Grant)." 221 }, 222 "affiliations_disclosed": { 223 "applies": true, 224 "answer": true, 225 "justification": "Author affiliations are clearly listed: Cornell University, UC Santa Barbara, Jadavpur University, NYU, and one independent contributor." 
226 }, 227 "funder_independent_of_outcome": { 228 "applies": true, 229 "answer": false, 230 "justification": "Google and Meta both produce models that could be evaluated on this benchmark. Google's Gemini is directly evaluated. The funders have a financial interest in how their models perform on coding benchmarks." 231 }, 232 "financial_interests_declared": { 233 "applies": true, 234 "answer": false, 235 "justification": "No competing interests statement is provided in the paper." 236 } 237 }, 238 "contamination": { 239 "training_cutoff_stated": { 240 "applies": true, 241 "answer": false, 242 "justification": "No training data cutoff dates are stated for any of the four evaluated models." 243 }, 244 "train_test_overlap_discussed": { 245 "applies": true, 246 "answer": true, 247 "justification": "The paper explicitly addresses this: tasks are 'synthetically crafted or recently curated to avoid data leakage issues' (abstract). The benchmark includes synthetic bad patches and newly collected instances to mitigate overlap." 248 }, 249 "benchmark_contamination_addressed": { 250 "applies": true, 251 "answer": true, 252 "justification": "The paper frames contamination prevention as a core design goal, using synthetic task generation and recent curation to create tasks that models could not have seen during training." 253 } 254 }, 255 "human_studies": { 256 "pre_registered": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants in this study." 260 }, 261 "irb_or_ethics_approval": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in this study." 265 }, 266 "demographics_reported": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in this study." 270 }, 271 "inclusion_exclusion_criteria": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in this study." 
275 }, 276 "randomization_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants in this study." 280 }, 281 "blinding_described": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants in this study." 285 }, 286 "attrition_reported": { 287 "applies": false, 288 "answer": false, 289 "justification": "No human participants in this study." 290 } 291 }, 292 "cost_and_practicality": { 293 "inference_cost_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "Only a per-instance cost limit of $2.0 is mentioned for SWE-Agent. Actual costs incurred are not reported. Aider evaluations were limited 'to limit costs' but no figures are given." 297 }, 298 "compute_budget_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "No total computational budget, GPU hours, or total API spend is reported despite running thousands of agent evaluations across multiple models." 302 } 303 }, 304 "experimental_rigor": { 305 "seed_sensitivity_reported": { 306 "applies": true, 307 "answer": false, 308 "justification": "No seed sensitivity analysis is reported. All results appear to be from single runs." 309 }, 310 "number_of_runs_stated": { 311 "applies": true, 312 "answer": false, 313 "justification": "The number of experimental runs per configuration is not stated. Results appear to be single-run." 314 }, 315 "hyperparameter_search_budget": { 316 "applies": true, 317 "answer": false, 318 "justification": "No hyperparameter search was discussed. Default settings are used for SWE-Agent but no search budget is reported." 319 }, 320 "best_config_selection_justified": { 321 "applies": true, 322 "answer": true, 323 "justification": "Default settings are used for SWE-Agent (Section 4.1), which avoids cherry-picking. The only adjustment is the per-instance cost limit to $2.0, which is stated." 
324 }, 325 "multiple_comparison_correction": { 326 "applies": true, 327 "answer": false, 328 "justification": "Multiple comparisons are made across 4 models × 4 tasks × 3 languages but no correction for multiple comparisons is applied (no significance tests at all)." 329 }, 330 "self_comparison_bias_addressed": { 331 "applies": true, 332 "answer": false, 333 "justification": "The authors created the benchmark and evaluate agents on it without acknowledging potential author-evaluation bias in benchmark construction or evaluation setup." 334 }, 335 "compute_budget_vs_performance": { 336 "applies": true, 337 "answer": false, 338 "justification": "Models of vastly different sizes are compared (Qwen3-32B vs. proprietary frontier models) without discussing compute budget differences. The $2.0 cost limit applies to all but the underlying compute is not equalized." 339 }, 340 "benchmark_construct_validity": { 341 "applies": true, 342 "answer": true, 343 "justification": "Section 4.5 explicitly discusses construct validity of test generation evaluation, showing that gold-patch-only evaluation overestimates capability and introducing bad patches for more robust assessment. Section 4.4 analyzes patch complexity as a latent factor." 344 }, 345 "scaffold_confound_addressed": { 346 "applies": true, 347 "answer": true, 348 "justification": "The paper compares SWE-Agent and Aider as separate frameworks (Table 3) and discusses how their different architectures (agentic vs pipeline) affect results (Section 4.3). For the main model comparison in Table 2, all models use the same SWE-Agent scaffold." 349 } 350 }, 351 "data_leakage": { 352 "temporal_leakage_addressed": { 353 "applies": true, 354 "answer": true, 355 "justification": "The paper addresses temporal leakage by design: tasks are 'synthetically crafted or recently curated to avoid data leakage issues' and bad patches are newly generated, making them unavailable in training data." 
356 }, 357 "feature_leakage_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "No discussion of whether the evaluation setup (e.g., issue descriptions, test structures) could leak information beyond what would be available in real usage." 361 }, 362 "non_independence_addressed": { 363 "applies": true, 364 "answer": false, 365 "justification": "Some instances come from SWE-Bench-Verified and Multi-SWE-Bench. No discussion of whether evaluated models may have been trained on or tuned against these benchmarks." 366 }, 367 "leakage_detection_method": { 368 "applies": true, 369 "answer": false, 370 "justification": "No concrete leakage detection method (canary strings, membership inference, n-gram overlap) is applied. The mitigation is entirely through synthetic/recent data construction." 371 } 372 } 373 }, 374 "claims": [ 375 { 376 "claim": "SWE-Agent achieves a maximum of 20.9% on Java Test Generation with DeepSeek-V3.1", 377 "evidence": "Table 2 shows DeepSeek-V3.1 achieves 20.9% on Java test generation, the highest among all models.", 378 "supported": "strong" 379 }, 380 { 381 "claim": "Performance on bug-fixing is strongly correlated with review-response (Pearson = 0.921) but weakly with style-fixing (0.512)", 382 "evidence": "Table 4 reports Pearson correlations across task types and languages.", 383 "supported": "moderate" 384 }, 385 { 386 "claim": "Including bad patches in test evaluation reveals that gold-patch-only evaluation dramatically overestimates model testing capability", 387 "evidence": "Figure 6 shows Qwen C++ drops from 22.7% to 4.55% and DeepSeek C++ from 43.8% to 25% when bad-patch failure is required.", 388 "supported": "strong" 389 }, 390 { 391 "claim": "SWE-Agent consistently outperforms Aider across most tasks and languages", 392 "evidence": "Table 3 shows SWE-Agent outperforms Aider on most cells, especially for C++ where Aider achieves only 1.8% on bug-fixing vs 8.0%.", 393 "supported": "moderate" 394 }, 395 { 396 
"claim": "Style-fixing attempts introduce many new style errors, particularly for Java and C++", 397 "evidence": "Table 5 shows Error Ratios of 2.46-5.46 for C++ and Java, meaning fixes create 2-5x as many issues as originally present.", 398 "supported": "strong" 399 }, 400 { 401 "claim": "Providing reviews helps solve issues that bug-fixing alone cannot", 402 "evidence": "Section 4.2 reports review-response resolved 15 unique Java instances vs 4 for bug-fixing, but the comparison is confounded by instance selection (reviews generated for higher-complexity patches).", 403 "supported": "weak" 404 } 405 ], 406 "methodology_tags": ["benchmark-eval"], 407 "key_findings": "OmniCode is a 1794-task benchmark spanning bug fixing, test generation, code review response, and style fixing across Python, Java, and C++. Current agents struggle most with test generation (max 25%) and non-Python languages. Style-fixing attempts frequently introduce new errors, with error ratios exceeding 5x for Java. Including bad patches in test evaluation reveals that gold-patch-only metrics overestimate testing capability by 2-3x.", 408 "red_flags": [ 409 { 410 "flag": "No variance or uncertainty quantification", 411 "detail": "All results are single-run point estimates across thousands of agent invocations with stochastic LLMs. No error bars, no repeated runs, no seed sensitivity. Results could vary substantially across runs." 412 }, 413 { 414 "flag": "Funder conflict of interest", 415 "detail": "Google funded via GCP credits while Gemini 2.5 Flash is directly evaluated. Meta funded via research grant. Neither conflict is acknowledged in the paper." 416 }, 417 { 418 "flag": "Confounded review-response comparison", 419 "detail": "The claim that reviews help is confounded: review instances were sampled from higher-complexity patches, making the bug-fixing vs review-response comparison non-controlled. The paper acknowledges this partially but still presents it as a finding." 
420 }, 421 { 422 "flag": "Single-run results for stochastic systems", 423 "detail": "Agent-based systems are highly stochastic. SWE-Agent + LLM results without repeated runs make it impossible to distinguish signal from noise, especially for small per-category sample sizes." 424 }, 425 { 426 "flag": "Unequal compute across model comparisons", 427 "detail": "Qwen3-32B (open, 32B parameters) is compared against proprietary frontier models without discussing compute budget differences. The $2.0 cost cap does not equalize effective compute." 428 } 429 ], 430 "cited_papers": [ 431 { 432 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 433 "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R Narasimhan"], 434 "year": 2024, 435 "relevance": "Foundational benchmark for evaluating LLM agents on real-world GitHub issue resolution." 436 }, 437 { 438 "title": "SWE-Agent: Agent-Computer Interfaces Enable Automated Software Engineering", 439 "authors": ["John Yang", "Carlos Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"], 440 "year": 2024, 441 "relevance": "Introduces SWE-Agent, one of the primary agent frameworks evaluated in this paper." 442 }, 443 { 444 "title": "Agentless: Demystifying LLM-Based Software Engineering Agents", 445 "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"], 446 "year": 2024, 447 "arxiv_id": "2407.01489", 448 "relevance": "Non-agentic approach to SWE tasks; used to generate bad patches for the OmniCode benchmark." 449 }, 450 { 451 "title": "Multi-SWE-Bench: A Multilingual Benchmark for Issue Resolving", 452 "authors": ["Daoguang Zan", "Zhirong Huang", "Wei Liu"], 453 "year": 2025, 454 "arxiv_id": "2504.02605", 455 "relevance": "Extended SWE-Bench to multiple languages; OmniCode builds on this for Java and C++ instances." 
 456 }, 457 { 458 "title": "SWE-Smith: Scaling Data for Software Engineering Agents", 459 "authors": ["John Yang", "Kilian Lieret", "Carlos E. Jimenez"], 460 "year": 2025, 461 "arxiv_id": "2504.21798", 462 "relevance": "Synthetic bug generation for training coding agents; related to OmniCode's synthetic task creation approach." 463 }, 464 { 465 "title": "SWT-Bench: Testing and Validating Real-World Bug-Fixes with Code Agents", 466 "authors": ["Niels Mündler", "Mark Müller", "Jingxuan He", "Martin Vechev"], 467 "year": 2024, 468 "relevance": "Prior test generation benchmark that OmniCode improves upon with bad-patch evaluation." 469 }, 470 { 471 "title": "Evaluating Large Language Models Trained on Code", 472 "authors": ["Mark Chen", "Jerry Tworek"], 473 "year": 2021, 474 "arxiv_id": "2107.03374", 475 "relevance": "Introduced HumanEval, a foundational code generation benchmark." 476 }, 477 { 478 "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents", 479 "authors": ["Xingyao Wang", "Boxuan Li"], 480 "year": 2024, 481 "relevance": "Open-source agent platform for software development tasks." 482 }, 483 { 484 "title": "SWE-Lancer: Can Frontier LLMs Earn $1 Million from Real-World Freelance Software Engineering?", 485 "authors": ["Samuel Miserendino", "Michele Wang", "Tejal Patwardhan", "Johannes Heidecke"], 486 "year": 2025, 487 "arxiv_id": "2502.12115", 488 "relevance": "Evaluates LLM agents on real-world freelance software engineering tasks with monetary stakes." 489 }, 490 { 491 "title": "AutoCodeRover: Autonomous Program Improvement", 492 "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"], 493 "year": 2024, 494 "relevance": "Autonomous program repair agent evaluated on SWE-Bench." 
495 }, 496 { 497 "title": "SWE-PolyBench: A Multi-Language Benchmark for Repository Level Evaluation of Coding Agents", 498 "authors": ["Muhammad Shihab Rashid"], 499 "year": 2025, 500 "arxiv_id": "2504.08703", 501 "relevance": "Multi-language coding benchmark with improved data quality checks." 502 }, 503 { 504 "title": "BugPilot: Complex Bug Generation for Efficient Learning of SWE Skills", 505 "authors": ["Atharv Sonwane"], 506 "year": 2025, 507 "arxiv_id": "2510.19898", 508 "relevance": "Synthetic bug generation methodology related to OmniCode's bad patch generation." 509 } 510 ] 511 }