scan.json (35484B)
1 { 2 "paper": { 3 "title": "SWE-rebench: An Automated Pipeline for Task Collection and Decontaminated Evaluation of Software Engineering Agents", 4 "authors": [ 5 "Ibragim Badertdinov", 6 "Alexander Golubev", 7 "Maksim Nekrashevich", 8 "Anton Shevtsov", 9 "Simon Karasik", 10 "Andrei Andriushchenko", 11 "Maria Trofimova", 12 "Daria Litvintseva", 13 "Boris Yangel" 14 ], 15 "year": 2025, 16 "venue": "NeurIPS 2025 Track on Datasets and Benchmarks", 17 "arxiv_id": "2505.20411", 18 "doi": "10.48550/arXiv.2505.20411" 19 }, 20 "scan_version": 3, 21 "active_modules": ["experimental_rigor", "data_leakage"], 22 "methodology_tags": ["benchmark-eval"], 23 "key_findings": "SWE-rebench introduces a fully automated pipeline that extracts 21,336 verifiable SWE tasks from 3,468 GitHub repositories, eliminating the manual curation bottleneck of prior benchmarks. Comparing model performance on SWE-rebench (fresh 2025 tasks) versus SWE-bench Verified reveals substantial performance drops for several models (e.g., DeepSeek-V3-0324 drops from 39.7% to 21.3%), suggesting potential contamination effects in older benchmarks. GPT-4.1 is the only model showing notable performance decline between January and March-April 2025 task subsets. Using a standardized ReAct scaffold and 5 runs per model, the paper demonstrates that DeepSeek-V3 models are the strongest open-source performers while Qwen3's thinking mode provides no measurable advantage over no-think mode.", 24 "checklist": { 25 "artifacts": { 26 "code_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "Section 2.4 states 'Accompanying code for utilizing the dataset, including scripts for tasks evaluation, is available on GitHub.' The paper also links to the HuggingFace dataset and leaderboard website." 
30 }, 31 "data_released": { 32 "applies": true, 33 "answer": true, 34 "justification": "The SWE-rebench dataset of 21,336 tasks is publicly available on HuggingFace Datasets (https://huggingface.co/datasets/nebius/SWE-rebench), as stated in Section 2.4 and footnote 2." 35 }, 36 "environment_specified": { 37 "applies": true, 38 "answer": false, 39 "justification": "While the paper describes the evaluation infrastructure (vLLM on 2 nodes with 8×H200 GPUs each, Appendix J) and each task instance includes install_config and pinned dependencies, no environment specification (requirements.txt, Dockerfile) is provided for reproducing the pipeline or evaluation infrastructure itself." 40 }, 41 "reproduction_instructions": { 42 "applies": true, 43 "answer": false, 44 "justification": "The paper references code on GitHub and data on HuggingFace, but does not include step-by-step reproduction instructions in the paper itself. The evaluation setup is described at a high level in Appendix J but lacks specific commands to replicate the full pipeline or benchmark runs." 45 } 46 }, 47 "statistical_methodology": { 48 "confidence_intervals_or_error_bars": { 49 "applies": true, 50 "answer": true, 51 "justification": "Tables 1 and 2 report SEM (Standard Error of the Mean) for all models across both benchmark subsets, providing uncertainty quantification for the resolved rate metric." 52 }, 53 "significance_tests": { 54 "applies": true, 55 "answer": false, 56 "justification": "No formal statistical significance tests (p-values, t-tests, etc.) are reported. Claims about model differences and contamination effects are based on comparing point estimates and SEM without formal hypothesis testing." 57 }, 58 "effect_sizes_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "Tables 1 and 2 report absolute resolved rates on both SWE-bench Verified and SWE-rebench, allowing direct computation of effect magnitudes. 
For example, DeepSeek-V3-0324 drops from 39.7% to 21.3% across benchmarks, providing clear magnitude context." 62 }, 63 "sample_size_justified": { 64 "applies": true, 65 "answer": false, 66 "justification": "The benchmark contains 294 tasks from 169 repositories (Appendix H describes filtering criteria), but no justification is given for why 294 tasks is sufficient for the claims made, and no power analysis is provided." 67 }, 68 "variance_reported": { 69 "applies": true, 70 "answer": true, 71 "justification": "Section 3.2 states 'we run each model five times on the full benchmark' and reports SEM across runs in Tables 1 and 2, along with pass@5 as an additional variance-aware metric." 72 } 73 }, 74 "evaluation_design": { 75 "baselines_included": { 76 "applies": true, 77 "answer": true, 78 "justification": "Tables 1 and 2 compare 13 models against each other, including GPT-4.1, DeepSeek-V3, Qwen3, LLaMa-4, and others. Table 2 additionally compares performance on SWE-bench Verified as a reference baseline." 79 }, 80 "baselines_contemporary": { 81 "applies": true, 82 "answer": true, 83 "justification": "The evaluated models include the latest releases: GPT-4.1 (April 2025), DeepSeek-V3-0324 (March 2025), Qwen3 (April 2025), LLaMa-4 (2025), and gemma-3 (2025). These are contemporary state-of-the-art models." 84 }, 85 "ablation_study": { 86 "applies": true, 87 "answer": false, 88 "justification": "Table 3 (Appendix C) compares installation approaches (agentless with 1/3/10 candidates vs agent-based), but this is method selection for a single pipeline component, not a systematic ablation of the pipeline stages. No ablation of the benchmark evaluation design (e.g., effect of filtering criteria, quality assessment) is provided." 89 }, 90 "multiple_metrics": { 91 "applies": true, 92 "answer": true, 93 "justification": "Three metrics are reported: resolved rate (mean across 5 runs), SEM (standard error of the mean), and pass@5. 
These capture different aspects of model performance." 94 }, 95 "human_evaluation": { 96 "applies": true, 97 "answer": false, 98 "justification": "All evaluation is automated via test-suite pass/fail. No human evaluation of model outputs (patches, trajectories) is included in the benchmark evaluation. The SWE-bench Verified human annotations were used only for training the quality classifier, not for evaluating model outputs." 99 }, 100 "held_out_test_set": { 101 "applies": true, 102 "answer": true, 103 "justification": "The 294 benchmark tasks are used exclusively for evaluation. The paper explicitly criticizes prior practice in which 'scaffoldings are often developed and tuned on subsets from SWE-bench' (Section 3.1) and uses a fixed, standardized scaffold with no task-specific tuning." 104 }, 105 "per_category_breakdown": { 106 "applies": true, 107 "answer": true, 108 "justification": "Results are broken down by temporal subset (January 2025 vs March-April 2025 in Table 1), by benchmark (SWE-bench Verified vs SWE-rebench in Table 2), and by number of files changed (Appendix F: 1 file 28.6%, 2 files 20.6%, ≥3 files 17.5%)." 109 }, 110 "failure_cases_discussed": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 3.3 discusses Qwen2.5-Coder-32B-Instruct's instruction-following failures ('frequently hallucinated environment responses or enters loops of formatting errors'). Appendix J.2 provides a detailed trajectory example of this failure mode. Appendix B.5 discusses pipeline failure modes." 114 }, 115 "negative_results_reported": { 116 "applies": true, 117 "answer": true, 118 "justification": "Section 3.3 reports that Qwen3's think mode provides 'no measurable advantage' over no-think mode, GPT-4.1 shows performance decline on the March-April subset, and Qwen2.5-Coder-32B-Instruct 'underperforms expectations.' Pipeline limitations (31% success rate for installation recipes) are also reported."
119 } 120 }, 121 "claims_and_evidence": { 122 "abstract_claims_supported": { 123 "applies": true, 124 "answer": true, 125 "justification": "Abstract claims are supported: the automated pipeline is described in Section 2, the 21,000+ task dataset is documented with statistics (Table 7), and the contamination analysis is presented in Tables 1-2 with temporal comparisons showing performance differences between SWE-bench Verified and SWE-rebench." 126 }, 127 "causal_claims_justified": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper claims 'performance of some language models might be inflated due to contamination issues' (abstract). While hedged with 'might,' the implication is that contamination causes the performance gap between SWE-bench Verified and SWE-rebench. However, the gap could be explained by task difficulty differences, repository diversity, or distribution shift between benchmarks. The temporal comparison (Jan vs Mar-Apr) provides suggestive evidence but doesn't control for confounds." 131 }, 132 "generalization_bounded": { 133 "applies": true, 134 "answer": false, 135 "justification": "The title claims 'Evaluation of Software Engineering Agents' broadly while all results are Python-only. Although the abstract mentions 'Python-based SWE tasks' and Section 4 acknowledges the Python limitation, the title and overall framing suggest generality beyond what was tested. The benchmark uses only 169 repositories." 136 }, 137 "alternative_explanations_discussed": { 138 "applies": true, 139 "answer": false, 140 "justification": "The paper attributes the performance gap between SWE-bench Verified and SWE-rebench primarily to contamination, without substantively discussing alternative explanations such as differences in task difficulty distribution, repository familiarity, or temporal recency making tasks inherently harder. 
The limitations section discusses automated quality and Python scope but not alternative explanations for the contamination finding." 141 }, 142 "proxy_outcome_distinction": { 143 "applies": true, 144 "answer": true, 145 "justification": "Section 3.2 specifically identifies two things being measured: '(1) The ability to comprehend a real-world software issue, devise a plan, implement a correct code patch' and '(2) The ability to follow instructions and operate within a structured agentic framework.' The proxy (test-suite pass rate) and the claimed capabilities are closely aligned, and the paper discusses test patch quality assessment (Section 2.4) as a factor affecting measurement validity." 146 } 147 }, 148 "setup_transparency": { 149 "model_versions_specified": { 150 "applies": true, 151 "answer": true, 152 "justification": "Table 1 lists specific model identifiers with version dates where applicable: 'gpt-4.1-2025-04-14', 'DeepSeek-V3-0324', 'DeepSeek-V3-1226', 'Qwen3-235B-A22B', 'Qwen2.5-72B-Instruct', etc. Open-source models include size specifications, and proprietary models include snapshot dates." 153 }, 154 "prompts_provided": { 155 "applies": true, 156 "answer": true, 157 "justification": "The full system prompt used for all model evaluations is provided in Appendix I, spanning several pages with complete tool descriptions, formatting instructions, and a demonstration trajectory. The paper states 'we share the exact system prompt used for all model evaluations' (Section 3.2)." 158 }, 159 "hyperparameters_reported": { 160 "applies": true, 161 "answer": false, 162 "justification": "Appendix J states they used 'generation hyperparameters recommended by the respective model developers (e.g., temperature, p in top-p, max context length etc.)' and 'standardize the context length to 128K tokens,' but does not list the actual parameter values used for each model. Readers cannot determine the specific temperature or top-p settings applied." 
163 }, 164 "scaffolding_described": { 165 "applies": true, 166 "answer": true, 167 "justification": "The scaffolding is described as a 'minimal ReAct-style agentic framework' (Section 3.2). The complete system prompt with all tools (create, edit, goto, open, replace, scroll, search_file, submit) is provided in Appendix I. The paper explicitly states function-calling is not used and all interaction is text-based command generation." 168 }, 169 "data_preprocessing_documented": { 170 "applies": true, 171 "answer": true, 172 "justification": "The four-stage pipeline is documented in detail (Sections 2.1-2.4): preliminary collection with specific filtering criteria (Section 2.1), automated installation (Section 2.2), execution-based verification (Section 2.3), and quality assessment (Section 2.4). Table 6 (Appendix L) provides the data funnel with counts at each stage. Benchmark filtering criteria are listed in Appendix H." 173 } 174 }, 175 "limitations_and_scope": { 176 "limitations_section_present": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 4 is titled 'Discussion and limitations' and provides substantive discussion of multiple limitations including automated quality assessment imperfections, Python-only scope, and scalability trade-offs." 180 }, 181 "threats_to_validity_specific": { 182 "applies": true, 183 "answer": true, 184 "justification": "Section 4 discusses specific threats: 'our LLM-based approach to generating installation instructions... was validated on a limited set of 18 repositories,' 'the automated task quality assessment... 
cannot fully replicate nuanced human judgment,' and 'fully automated pipeline may result in some tasks being imperfectly described or unsolvable solely from the issue.'" 185 }, 186 "scope_boundaries_stated": { 187 "applies": true, 188 "answer": true, 189 "justification": "Section 4 explicitly states: 'The initial release of SWE-rebench is limited exclusively to Python, restricting its immediate applicability to other language ecosystems.' The paper also notes that automated assessment 'can lead to lower absolute success rates compared to manually curated benchmarks' and describes what the benchmark does NOT test." 190 } 191 }, 192 "data_integrity": { 193 "raw_data_available": { 194 "applies": true, 195 "answer": true, 196 "justification": "The full SWE-rebench dataset with all 21,336 task instances and their annotations is publicly available on HuggingFace Datasets. Appendix E shows the complete schema with all fields. Each instance includes problem_statement, patches, test results, and metadata." 197 }, 198 "data_collection_described": { 199 "applies": true, 200 "answer": true, 201 "justification": "Section 2.1 describes data collection in detail: sources (GitHub Archive and GitHub), approximately 450,000 pull requests from 30,000+ repositories, with specific inclusion criteria (permissive licenses, Python >75%, merged PRs, issue descriptions >10 characters, 1-15 files changed, etc.)." 202 }, 203 "recruitment_methods_described": { 204 "applies": true, 205 "answer": true, 206 "justification": "The sample recruitment is well-described: repositories were selected based on permissive licenses (listed in Appendix D), Python code proportion >75%, and PR/issue linking criteria. The filtering funnel from 10M PRs to 21K tasks is documented in Table 6." 
207 }, 208 "data_pipeline_documented": { 209 "applies": true, 210 "answer": true, 211 "justification": "The four-stage pipeline is fully documented with counts: ~10M PRs → ~450K candidates → ~150K filtered → ~21K valid tasks (Table 6). Each filtering criterion is specified (Section 2.1), and acceptance rates are reported at each stage (5%, 33%, 14%, 100%)." 212 } 213 }, 214 "conflicts_of_interest": { 215 "funding_disclosed": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding or acknowledgments section is present in the paper. All authors are affiliated with Nebius, and the work uses Nebius infrastructure (TractoAI), but there is no formal funding disclosure." 219 }, 220 "affiliations_disclosed": { 221 "applies": true, 222 "answer": true, 223 "justification": "All nine authors are listed with 'Nebius' affiliation in the paper header. The paper also discloses use of Nebius's TractoAI platform for distributed computing." 224 }, 225 "funder_independent_of_outcome": { 226 "applies": true, 227 "answer": false, 228 "justification": "Nebius is a cloud computing/AI infrastructure company (owner of TractoAI) with commercial interests in the SWE agent ecosystem. The benchmark promotes use of infrastructure for training and evaluating agents, which aligns with Nebius's business interests. This potential conflict is not discussed." 229 }, 230 "financial_interests_declared": { 231 "applies": true, 232 "answer": false, 233 "justification": "No competing interests or financial interests statement is present in the paper. Nebius's commercial interests in the AI agent infrastructure space are not disclosed as a potential conflict." 
234 } 235 }, 236 "contamination": { 237 "training_cutoff_stated": { 238 "applies": true, 239 "answer": false, 240 "justification": "The paper uses model release dates as a proxy (models released after March 1, 2025 are marked with asterisks in Table 1) but does not state the actual training data cutoff dates for any of the evaluated models." 241 }, 242 "train_test_overlap_discussed": { 243 "applies": true, 244 "answer": true, 245 "justification": "This is a central contribution of the paper. Section 3.1 discusses contamination risk for SWE-bench ('models released afterward may have been exposed to its data during training'). Section 3.2 describes their temporal decontamination approach: tasks created in 2025 after model release dates. Tables 1-2 explicitly analyze potential contamination effects." 246 }, 247 "benchmark_contamination_addressed": { 248 "applies": true, 249 "answer": true, 250 "justification": "The paper's core innovation is addressing contamination. Section 3.2 describes continuous dataset updates using fresh tasks from 2025, temporal tracking of issue creation dates against model releases, and explicit marking of potentially contaminated evaluations on the leaderboard." 251 } 252 }, 253 "human_studies": { 254 "pre_registered": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants. The study uses automated pipelines and model evaluations on GitHub-sourced tasks." 258 }, 259 "irb_or_ethics_approval": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants. All data comes from public GitHub repositories and automated model evaluations." 263 }, 264 "demographics_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in the study." 268 }, 269 "inclusion_exclusion_criteria": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants. 
Repository and task inclusion criteria are documented separately under setup_transparency.data_preprocessing_documented." 273 }, 274 "randomization_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants or experimental conditions requiring randomization." 278 }, 279 "blinding_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants or evaluators requiring blinding." 283 }, 284 "attrition_reported": { 285 "applies": false, 286 "answer": false, 287 "justification": "No human participants." 288 } 289 }, 290 "cost_and_practicality": { 291 "inference_cost_reported": { 292 "applies": true, 293 "answer": false, 294 "justification": "Appendix J reports that 'a single run over the full task set required approximately 7 hours' for DeepSeek-V3 on 2 nodes with 8×H200 GPUs. However, wall-clock time is reported for only one model, with no token counts, API costs, or per-task costs for any model." 295 }, 296 "compute_budget_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "Hardware is described (2 nodes × 8×H200 GPUs per model, Appendix J) and one model's runtime is given (~7 hours for DeepSeek-V3). But total GPU-hours across all 13 models × 5 runs, pipeline compute costs (TractoAI), and quality model fine-tuning costs are not stated." 300 } 301 }, 302 "experimental_rigor": { 303 "seed_sensitivity_reported": { 304 "applies": true, 305 "answer": true, 306 "justification": "Appendix J states 'each model was run 5 times on the full benchmark using different random seeds.' SEM is reported in Tables 1-2, capturing cross-seed variability." 307 }, 308 "number_of_runs_stated": { 309 "applies": true, 310 "answer": true, 311 "justification": "Section 3.2 explicitly states 'we run each model five times on the full benchmark' and Appendix J confirms '5 times...
using different random seeds.'" 312 }, 313 "hyperparameter_search_budget": { 314 "applies": true, 315 "answer": true, 316 "justification": "Section 3.2 and Appendix J explicitly state that 'default generation hyperparameters as recommended by model developers' were used with no tuning, making the search budget transparently zero." 317 }, 318 "best_config_selection_justified": { 319 "applies": true, 320 "answer": true, 321 "justification": "All models use their default configurations with no selection or tuning. All 13 models' results are reported in Tables 1-2 without cherry-picking configurations." 322 }, 323 "multiple_comparison_correction": { 324 "applies": true, 325 "answer": false, 326 "justification": "The paper compares 13 models and makes multiple claims about relative performance differences without applying any correction for multiple comparisons (e.g., Bonferroni, Holm)." 327 }, 328 "self_comparison_bias_addressed": { 329 "applies": true, 330 "answer": false, 331 "justification": "While Nebius doesn't evaluate its own model, the authors designed the benchmark and evaluation scaffold. Potential biases in benchmark construction (task selection, filtering criteria, scaffold design) that could inadvertently favor certain models are not discussed." 332 }, 333 "compute_budget_vs_performance": { 334 "applies": true, 335 "answer": false, 336 "justification": "Models of vastly different sizes are compared (27B to 671B total parameters among the open-weight models) without analyzing or controlling for compute budget differences. Performance is not reported as a function of model size or inference compute." 337 }, 338 "benchmark_construct_validity": { 339 "applies": true, 340 "answer": true, 341 "justification": "Section 3.2 explicitly describes what the benchmark measures ('The ability to comprehend a real-world software issue... and implement a correct code patch' and 'The ability to follow instructions and operate within a structured agentic framework').
Section 2.4 discusses quality assessment of tasks. Limitations section acknowledges that automated curation may produce imperfect tasks." 342 }, 343 "scaffold_confound_addressed": { 344 "applies": true, 345 "answer": true, 346 "justification": "This is a major design principle. Section 3.2 states 'every model is assessed by using the same minimal ReAct-style agentic framework, identical prompts and default generation hyperparameters.' Section 3.1 explicitly identifies scaffolding variability as a problem in prior work. Function calling is disabled to equalize evaluations." 347 } 348 }, 349 "data_leakage": { 350 "temporal_leakage_addressed": { 351 "applies": true, 352 "answer": true, 353 "justification": "This is the paper's central contribution. Section 3.2 describes using 'automated pipeline for a continuous supply of fresh tasks' with temporal tracking. Tasks are from January and March-April 2025, after most models' training periods. Models released after March 2025 are marked in Table 1." 354 }, 355 "feature_leakage_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "The paper does not discuss whether the evaluation setup could leak answer information through the provided context. The focus is entirely on temporal/training contamination. Whether the issue descriptions, repository structure, or test names provide unintended hints toward solutions is not analyzed." 359 }, 360 "non_independence_addressed": { 361 "applies": true, 362 "answer": false, 363 "justification": "The benchmark uses 169 repositories, but there is no discussion of whether training data and test data share repositories, code patterns, or problem structures. Whether the 294 benchmark tasks are structurally independent from data seen during model training (beyond temporal filtering) is not addressed." 
364 }, 365 "leakage_detection_method": { 366 "applies": true, 367 "answer": true, 368 "justification": "The paper applies a concrete temporal decontamination method: tasks are sourced from issues created in 2025, and 'we precisely track the creation dates of the issues and their corresponding pull requests against model release dates' (Section 3.2). Potentially contaminated evaluations are explicitly marked on the leaderboard." 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "The automated pipeline produces 21,336 verifiable SWE tasks from 3,468 GitHub repositories without manual curation.", 375 "evidence": "Section 2 describes the four-stage pipeline. Table 6 (Appendix L) shows the data funnel: ~10M PRs → ~450K candidates → ~150K filtered → ~21K valid tasks. Table 7 provides dataset statistics.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "The automated installation recipe generation successfully configures environments for 31% of all repositories.", 380 "evidence": "Section 2.2 states '31% of all repositories.' Validation was performed on 18 SWE-bench repositories (Table 3, Appendix C), with 6-9 out of 18 succeeding depending on configuration.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Performance of some language models on SWE-bench Verified may be inflated due to data contamination.", 385 "evidence": "Table 2 shows substantial drops from SWE-bench Verified to SWE-rebench Mar-Apr 2025: DeepSeek-V3-0324 drops from 39.7% to 21.3%, Qwen2.5-72B-Instruct from 11.3% to 9.3%. 
Both DeepSeek-V3 versions perform similarly on SWE-rebench but diverge on SWE-bench Verified (39.7% vs 35.2%).", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "GPT-4.1 is the only model whose performance noticeably declined between the January and March-April 2025 task subsets.", 390 "evidence": "Table 1 shows GPT-4.1 dropping from 31.1% (Jan) to 26.7% (Mar-Apr), while other models generally maintain or improve their resolved rates across the same period.", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "DeepSeek-V3 models demonstrate the strongest performance among open-source models across both SWE-rebench subsets.", 395 "evidence": "Tables 1 and 2 consistently show DeepSeek-V3-0324 and V3-1226 at the top of open-source models: 21.3-21.7% on SWE-rebench Jan and 21.3-21.9% on Mar-Apr, substantially above the next open-source models.", 396 "supported": "strong" 397 }, 398 { 399 "claim": "Qwen3 models perform similarly with or without think mode enabled.", 400 "evidence": "Table 1 shows Qwen3-235B no-think: 15.2%/16.6% vs think: 13.7%/12.2% (Jan/Mar-Apr). Qwen3-32B no-think: 13.2%/13.7% vs think: 11.8%/11.2%. Pass@5 scores are nearly identical. Section 3.3 notes this suggests 'the base model's capabilities are sufficiently strong for deliberate planning to provide no measurable advantage.'", 401 "supported": "moderate" 402 }, 403 { 404 "claim": "The fine-tuned quality assessment model achieves 81% accuracy on task complexity prediction, improving over the 68% baseline.", 405 "evidence": "Section 2.4 reports fine-tuned Qwen-72B achieves 81% accuracy (weighted F1: 0.82) for Task Complexity vs 68% for vanilla Qwen-72B. Table 4 (Appendix F) provides detailed precision/recall breakdowns.", 406 "supported": "strong" 407 } 408 ], 409 "red_flags": [ 410 { 411 "flag": "Undisclosed commercial interest", 412 "detail": "All authors are from Nebius, which sells AI compute infrastructure (TractoAI). 
The benchmark promotes the SWE agent training/evaluation ecosystem that drives demand for Nebius infrastructure. This commercial interest is not disclosed as a potential conflict." 413 }, 414 { 415 "flag": "Task difficulty confound in contamination claims", 416 "detail": "The performance gap between SWE-bench Verified and SWE-rebench (e.g., DeepSeek-V3-0324: 39.7% → 21.3%) is attributed primarily to contamination, but could also be explained by differences in task difficulty, repository diversity, or the automated vs manual curation quality. No formal test controls for these confounds." 417 }, 418 { 419 "flag": "Small validation set for pipeline validation", 420 "detail": "The automated installation recipe approach was validated on only 18 repositories from SWE-bench (Appendix C), then deployed at scale across 30,000+ repositories. The prompt engineering was also based on this limited validation." 421 }, 422 { 423 "flag": "No formal significance testing for model comparisons", 424 "detail": "Despite reporting SEM, no formal significance tests are applied to any model comparison claims. With 294 tasks and 5 runs, several reported differences (e.g., Qwen3 think vs no-think) may not be statistically significant." 425 } 426 ], 427 "cited_papers": [ 428 { 429 "title": "SWE-bench: Can language models resolve real-world github issues?", 430 "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R Narasimhan"], 431 "year": 2024, 432 "relevance": "The foundational SWE benchmark that SWE-rebench is designed to complement and address contamination issues in." 433 }, 434 { 435 "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents", 436 "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"], 437 "year": 2024, 438 "arxiv_id": "2407.16741", 439 "relevance": "Major open-source SWE agent platform evaluated on SWE-bench, directly relevant to agent scaffolding and evaluation practices." 
440 }, 441 { 442 "title": "Agentless: Demystifying llm-based software engineering agents", 443 "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"], 444 "year": 2024, 445 "arxiv_id": "2407.01489", 446 "relevance": "Agentless approach to SWE tasks that inspired SWE-rebench's installation recipe generation methodology." 447 }, 448 { 449 "title": "Training software engineering agents and verifiers with swe-gym", 450 "authors": ["Jiayi Pan", "Xingyao Wang", "Graham Neubig"], 451 "year": 2024, 452 "arxiv_id": "2412.21139", 453 "relevance": "Prior dataset for training SWE agents with manually curated interactive tasks, which SWE-rebench aims to surpass in scale." 454 }, 455 { 456 "title": "SWE-RL: Advancing llm reasoning via reinforcement learning on open software evolution", 457 "authors": ["Yuxiang Wei", "Olivier Duchenne", "Jade Copet"], 458 "year": 2025, 459 "arxiv_id": "2502.18449", 460 "relevance": "Demonstrates RL for training SWE agents, directly relevant to SWE-rebench's stated goal of providing RL training data." 461 }, 462 { 463 "title": "SWE-smith: Scaling data for software engineering agents", 464 "authors": ["John Yang", "Kilian Lieret", "Carlos E. Jimenez"], 465 "year": 2025, 466 "arxiv_id": "2504.21798", 467 "relevance": "Concurrent work on scaling SWE agent training data using artificial bug injection, contrasting with SWE-rebench's real-world task collection." 468 }, 469 { 470 "title": "SWE-polybench: A multi-language benchmark for repository level evaluation of coding agents", 471 "authors": ["Muhammad Shihab Rashid", "Christian Bock"], 472 "year": 2025, 473 "arxiv_id": "2504.08703", 474 "relevance": "Multi-language SWE benchmark, complementary to SWE-rebench's Python-only scope."
475 }, 476 { 477 "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code", 478 "authors": ["Naman Jain", "King Han", "Alex Gu"], 479 "year": 2024, 480 "arxiv_id": "2403.07974", 481 "relevance": "Pioneering continuously-updated decontaminated code benchmark, providing a model for SWE-rebench's temporal decontamination approach." 482 }, 483 { 484 "title": "ReAct: Synergizing reasoning and acting in language models", 485 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"], 486 "year": 2023, 487 "arxiv_id": "2210.03629", 488 "relevance": "The ReAct framework used as the standardized scaffolding for all SWE-rebench benchmark evaluations." 489 }, 490 { 491 "title": "Kimi-Dev: Agentless training as skill prior for SWE-agents", 492 "authors": ["Zonghan Yang", "Shengjie Wang", "Kelin Fu"], 493 "year": 2025, 494 "arxiv_id": "2509.23045", 495 "relevance": "Work that directly utilizes SWE-rebench dataset for training SWE agents, demonstrating the dataset's practical value." 496 }, 497 { 498 "title": "DeepSWE: Training a state-of-the-art coding agent from scratch by scaling RL", 499 "authors": ["Michael Luo", "Naman Jain", "Jaskirat Singh"], 500 "year": 2025, 501 "relevance": "Demonstrates training SWE agents via RL at scale, the primary use case SWE-rebench is designed to support." 502 }, 503 { 504 "title": "SWE-bench+: Enhanced coding benchmark for LLMs", 505 "authors": ["Reem Aleithan", "Haoran Xue"], 506 "year": 2024, 507 "arxiv_id": "2410.06992", 508 "relevance": "Follow-up work addressing SWE-bench limitations through post-cutoff filtering, contrasting with SWE-rebench's automated approach." 509 } 510 ], 511 "engagement_factors": { 512 "practical_relevance": { 513 "score": 3, 514 "justification": "Immediately usable 21K-task dataset on HuggingFace and public leaderboard for practitioners training and evaluating SWE agents." 
515 }, 516 "surprise_contrarian": { 517 "score": 2, 518 "justification": "Challenges the validity of SWE-bench Verified scores by showing potential contamination-driven inflation, undermining a widely-used benchmark." 519 }, 520 "fear_safety": { 521 "score": 0, 522 "justification": "No safety or AI risk concerns raised; brief mention of potential misuse is generic." 523 }, 524 "drama_conflict": { 525 "score": 2, 526 "justification": "Implicitly argues that SWE-bench scores are unreliable due to contamination, a 'benchmarks are inflated' angle that generates discussion." 527 }, 528 "demo_ability": { 529 "score": 2, 530 "justification": "Dataset on HuggingFace, code on GitHub, and public leaderboard at swe-rebench.com — accessible but not a pip-installable tool." 531 }, 532 "brand_recognition": { 533 "score": 1, 534 "justification": "Nebius is a known AI infrastructure company but not a household name. SWE-bench connection provides some recognition boost." 535 } 536 } 537 }