scan.json (32654B)
1 { 2 "paper": { 3 "title": "SWE-Protégé: Learning to Selectively Collaborate With an Expert Unlocks Small Language Models as Software Engineering Agents", 4 "authors": [ 5 "Patrick Tser Jern Kon", 6 "Archana Pradeep", 7 "Ang Chen", 8 "Alexander P. Ellis", 9 "Warren Hunt", 10 "Zijian Wang", 11 "John Yang", 12 "Samuel Thompson" 13 ], 14 "year": 2026, 15 "venue": "arXiv", 16 "arxiv_id": "2602.22124" 17 }, 18 "scan_version": 2, 19 "active_modules": ["experimental_rigor", "data_leakage"], 20 "methodology_tags": ["benchmark-eval"], 21 "key_findings": "SWE-Protégé post-trains Qwen2.5-Coder-7B-Instruct to achieve 42.4% Pass@1 on SWE-bench Verified by learning to selectively collaborate with an expert model (Opus 4.1), a +25.4 percentage-point absolute improvement over the prior SLM state of the art (17.0%). The approach uses expert assistance sparsely (~4 calls per task, 11% of total tokens), yielding up to 8.2x lower cost than direct expert execution. A two-phase pipeline (SFT on expert-augmented trajectories followed by GRPO with shaped rewards) eliminates degenerative looping and teaches multi-turn pair-programming behavior.", 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": false, 27 "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. No mention of releasing source code, model weights, or training infrastructure." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": true, 32 "justification": "The paper uses publicly available datasets: SWE-smith task dataset (referenced with HuggingFace URL), SWE-bench Verified, and SWE-Gym subset (also with HuggingFace URL). However, the expert-augmented trajectories generated by the authors are not released." 
33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "Hardware is mentioned ('single node equipped with 8 NVIDIA A100/H100 80G GPUs, e.g., AWS p4de.24xlarge') and frameworks are named (Torchtune, SkyRL, vLLM, SWE-agent), but no version numbers, requirements.txt, Dockerfile, or conda environment specifications are provided." 38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The method is described in sufficient detail for an expert to attempt reimplementation, but no executable reproduction guide exists." 43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": false, 49 "justification": "All main results (Table 1, Table 2) report single point estimates (e.g., 42.4%, 30.8%) without confidence intervals or error bars. Fig. 6 shows bootstrap uncertainty bands but only for the looping analysis, not for the main performance metric." 50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "No statistical significance tests are used. Claims of improvement (e.g., '+25.4% over the prior SLM state of the art') are based on comparing raw numbers without any hypothesis test." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "Absolute improvements are reported with baseline context throughout: '+25.4% over the prior state of the art' (42.4% vs 17.0%), '+2.2%' over SWE-agent-LM-32B (42.4% vs 40.2%), '4.2× and 8.2× lower' cost than direct execution, '≈40%' total token reduction in Phase II." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "SWE-bench Verified contains 500 instances and the held-out evaluation uses 400 tasks, but no justification for these sample sizes or power analysis is provided." 
65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": false, 69 "justification": "All results are single-run (Pass@1 with one rollout per instance). The paper explicitly states 'We do not use multi-attempt sampling, majority voting, or other test-time scaling.' No variance across seeds or runs is reported." 70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "Table 1 includes extensive baselines: closed-weight models (GPT-4o, Claude 3.5/3.7/4.5 Sonnet), open-weight large models (SWE-agent-LM-32B, R2E-Gym-32B, SWE-fixer-72B), and SLMs (SWE-gym-7B, SWE-agent-LM-7B, Lingma-SWE-GPT-7B)." 77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "Baselines include SWE-smith/SWE-agent-LM (2025), R2E-Gym (2025), SWE-Fixer (2025), and CWM (2025), representing the state of the art at time of evaluation. The authors independently reproduced the SWE-agent-LM-7B baseline." 82 }, 83 "ablation_study": { 84 "applies": true, 85 "answer": true, 86 "justification": "Extensive ablations in §4.1 and Fig. 7: module modifications (loop heuristic, context window), expert modifications (in-house 7B/32B experts), dataset modifications (in-place trajectory editing), Phase I vs Phase II (Table 2), and data scaling (Fig. 2)." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": true, 91 "justification": "Multiple metrics reported: %Resolved (Pass@1), per-task cost (Fig. 4), expert token fraction (Fig. 3), total tokens per task, step counts (Fig. 10), looping rates (Fig. 6), failure mode distribution (Fig. 5), reward component trajectories (Fig. 8)." 92 }, 93 "human_evaluation": { 94 "applies": true, 95 "answer": false, 96 "justification": "Evaluation is entirely automated via SWE-bench unit tests. No human evaluation of patch quality, correctness beyond test passing, or code quality is included." 
97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": true, 101 "justification": "Results are reported on SWE-bench Verified (separate from the SWE-smith training tasks). Additionally, a held-out subset of 400 tasks 'explicitly excluded from the trajectory-generation mixture' is used for the contamination study (§4.1)." 102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": false, 106 "justification": "No per-repository, per-difficulty, or per-task-type breakdown of the main resolution rate is provided. Breakdowns exist for process metrics (looping, cost, steps, failure modes) but not for the primary evaluation metric across task categories." 107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": true, 111 "justification": "Failure modes are extensively analyzed: degenerative looping (Fig. 6), cost/step limit aborts (Fig. 5), failure mode shifts between Phase I and Phase II, and qualitative trajectory examples showing stalled behavior (App. E, Fig. 17-19)." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "Several negative results: SWE-smith training induces regression at larger budgets (11.8% at 5k trajectories, §1); LoRA and QLoRA underperformed full SFT (§3.1); in-place trajectory modification performs worst at 14.2% (Fig. 7); in-house 7B expert achieves only 17.0% (Fig. 7); loop heuristic provides no benefit with limited context (§4.1)." 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "Abstract claims are all supported: 42.4% Pass@1 (Table 1), +25.4% improvement over prior SLM SOTA (Table 1, 42.4% vs 17.0%), ~4 expert calls per task (Fig. 8), 11% of total tokens (Fig. 3), 8.2× lower cost (Fig. 4). All numbers match." 
124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": true, 128 "justification": "Causal claims are supported by controlled ablations: Phase II RL improves over Phase I SFT (Table 2, controlled single-variable), component ablations isolate loop heuristic, context window, expert quality, and data generation method (Fig. 7). Each ablation modifies one variable." 129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": false, 133 "justification": "Title claims 'Software Engineering Agents' broadly, but results are only on SWE-bench Verified (Python-based repositories, bug-fixing tasks). The impact statement acknowledges 'Our evaluation focuses primarily on Python-based repositories' but the title and abstract do not bound this. Discussion says techniques 'could in principle be applied to other domains' without evidence." 134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": true, 138 "justification": "Multiple alternatives addressed through ablations: gains could be from distillation rather than collaboration (refuted by dataset modification ablation, 14.2%), from more expert tokens (refuted by token analysis showing 11%), from contamination (refuted by held-out evaluation), or from the specific expert model (addressed by testing three different expert backends)." 139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": true, 143 "justification": "The paper measures Pass@1 on SWE-bench Verified (unit test pass rate) and frames claims at this level of granularity. Claims are about 'SWE-bench Verified' performance and cost efficiency, not broader unsubstantiated claims about software engineering capability in general." 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": false, 150 "justification": "Base model 'Qwen-2.5-Coder-Instruct-7B' is a specific HuggingFace checkpoint. 
However, expert models are specified only by marketing names: 'Claude Sonnet 3.7', 'Sonnet 4.5', 'Opus 4.1' — no snapshot dates or API versions provided. These expert models are critical to the approach and their behavior changes across versions." 151 }, 152 "prompts_provided": { 153 "applies": true, 154 "answer": true, 155 "justification": "Full prompt text provided in the appendix: trajectory generation prompt (Fig. 11), SWE-Protégé agent prompt (Fig. 12), expert prompts for training and evaluation (Fig. 13-14), warrant judge prompt (Fig. 15), follow judge prompt (Fig. 16). These are actual prompts, not just descriptions." 156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "Detailed hyperparameters reported: SFT batch size 32, max seq length 32,762 (§3.1); GRPO ε_high=0.28, ε_low=0.20, 6 rollouts per prompt, batch size 16, 160 steps (§3.1); reward shaping schedule with specific weights (k1=15, k2=8, λ_loop=0.5, c_loop=-10, w_expert=0.3, w_follow=2.0); 75-step and $2 budget per task; 6 expert calls maximum." 161 }, 162 "scaffolding_described": { 163 "applies": true, 164 "answer": true, 165 "justification": "SWE-agent framework described in detail: ReAct-style thought/action pairs, 75-step limit, tool set (bash, file edit, ask_expert), expert context window (5 most recent messages), history processing (last_n_observations with expert guidance preservation), Docker runtime via SWE-ReX (§2.1, §3, App. C)." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "Trajectory generation procedure documented: Claude Sonnet 3.7 generates trajectories with expert augmentation, rejection sampling keeps only resolved tasks yielding ~4.8K trajectories, controlled check by regenerating without expert augmentation (App. B). RL uses 100-task subset from SWE-Gym. Expert-augmented data includes gold patch for trajectory generation but not evaluation." 
171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": true, 177 "justification": "Section 6 'Discussion' contains 'Limitations and Future Work' subsection discussing scope constraints, unexplored design choices, and expert optimization. The Impact Statement also discusses risks and limitations at length." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": true, 182 "justification": "Specific limitations discussed: focus on SWE-bench within SWE-agent framework (§6), no exhaustive exploration of design choices (§6), expert treated as fixed black-box (§6), Python-based repository evaluation only (Impact Statement), risks of incorrect deferral decisions and propagating incorrect expert guidance (Impact Statement)." 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": true, 187 "justification": "Section 6 states: 'While we focus on SWE-bench within the SWE-agent framework' and 'our goal is to establish that SWE-Protégé can materially improve SLM performance rather than fully optimize the frontier.' Impact Statement: 'Our evaluation focuses primarily on Python-based repositories, reflecting SWE-bench's task composition.'" 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": false, 194 "justification": "The underlying public datasets (SWE-smith, SWE-bench) are available, but the authors' generated expert-augmented trajectories, trained model weights, and RL training logs are not released for independent verification." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "Trajectory generation procedure described in §2.2 and App. B: Claude Sonnet 3.7 generates trajectories in SWE-agent with augmented action space, rejection sampling retains resolved tasks, expert is the same model with gold patch access. Yields ~4.8K trajectories. RL uses 100-task SWE-Gym subset." 
200 }, 201 "recruitment_methods_described": { 202 "applies": false, 203 "answer": false, 204 "justification": "No human participants. Data sources are standard public benchmarks (SWE-bench Verified, SWE-smith task dataset, SWE-Gym)." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": true, 209 "justification": "Full pipeline documented: task sampling from SWE-smith → trajectory generation with expert augmentation → rejection sampling (keep resolved) → ~4.8K trajectories for SFT → Phase II uses 100-task subset from SWE-Gym for RL with 6 rollouts per prompt, 160 steps. Controlled comparison regenerating without expert augmentation to verify equivalence (App. B)." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding section, acknowledgments, or grant information is provided. The work involves Meta, University of Michigan, and Stanford University affiliations, but no funding sources are disclosed." 217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Author affiliations are clearly listed: Meta (1), University of Michigan (2), Stanford University (3). The footnote '†Work done at Meta' clarifies the primary author's arrangement." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": false, 226 "justification": "No funding disclosed, so independence cannot be assessed. Meta (where most authors are affiliated) has a commercial interest in demonstrating that small models can be competitive with larger proprietary models, which aligns with the paper's conclusions." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests or financial disclosure statement is present in the paper." 
232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": true, 237 "answer": false, 238 "justification": "No training data cutoff dates stated for any model. Release dates are mentioned for context ('Sonnet 3.7 was released on February 24, 2025, while SWE-smith was released on April 29, 2025') but these are release dates, not training cutoff dates. Qwen2.5-Coder's training cutoff is not stated." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": true, 242 "answer": true, 243 "justification": "Section 4.1 Contamination Study directly addresses this: evaluation on 'a held-out SWE-smith-style subset of 400 tasks that was explicitly excluded from the trajectory-generation mixture.' They also note that the held-out subset was released after expert models became available." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": true, 247 "answer": true, 248 "justification": "The contamination study (§4.1) evaluates on tasks released after expert model availability, excluded from training data. Results show 40.3% with Sonnet 3.7 expert on held-out tasks (vs. 30.8% on SWE-bench Verified with same expert), indicating gains persist beyond the training distribution." 249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study. All evaluation is automated on software benchmarks." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants. The study involves training and evaluating language models on code benchmarks." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants." 
271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": true, 292 "justification": "Per-task inference costs reported in detail (Fig. 4): median expert cost $0.13 (Sonnet 3.7), $0.15 (Sonnet 4.5), $0.65 (Opus 4.1) vs. direct expert execution at $0.54/$1.24. Token decomposition in Fig. 3 shows expert tokens ~11% of total. $2 per-task budget specified." 293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "Hardware specified ('single node with 8 NVIDIA A100/H100 80G GPUs') and RL training is described as '160 total steps' on '100 tasks.' However, total GPU hours, wall-clock training time, and total API spend for trajectory generation are not quantified." 298 } 299 }, 300 "experimental_rigor": { 301 "seed_sensitivity_reported": { 302 "applies": true, 303 "answer": false, 304 "justification": "All results are single-run: 'We report %Resolved (Pass@1), i.e., the fraction of tasks solved by a single rollout per instance.' No multiple-seed results or seed sensitivity analysis reported." 305 }, 306 "number_of_runs_stated": { 307 "applies": true, 308 "answer": true, 309 "justification": "Explicitly stated: 'We do not use multi-attempt sampling, majority voting, or other test-time scaling' — single rollout per instance. RL training uses '6 rollouts per prompt' (§3.1)." 310 }, 311 "hyperparameter_search_budget": { 312 "applies": true, 313 "answer": false, 314 "justification": "No hyperparameter search budget reported. 
Hyperparameters are stated (ε_high=0.28, ε_low=0.20, reward weights) but without documenting how many configurations were tried or the search method used to select them." 315 }, 316 "best_config_selection_justified": { 317 "applies": true, 318 "answer": true, 319 "justification": "All configurations tested are reported: three expert backends (Table 1), scaling curves (Fig. 2), multiple ablation variants (Fig. 7), LoRA vs QLoRA vs full SFT comparison. Selection justified by empirical comparison (e.g., 'both underperformed full SFT on SWE-bench Verified Pass@1')." 320 }, 321 "multiple_comparison_correction": { 322 "applies": false, 323 "answer": false, 324 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 325 }, 326 "self_comparison_bias_addressed": { 327 "applies": true, 328 "answer": false, 329 "justification": "Authors compare their system against baselines without acknowledging self-comparison bias. They independently reproduced SWE-agent-LM-7B results but do not discuss the bias of authors evaluating their own system per Lucic et al. (2018)." 330 }, 331 "compute_budget_vs_performance": { 332 "applies": true, 333 "answer": true, 334 "justification": "Fig. 4 compares cost across methods; §4.1 tests expert-only baselines at comparable budgets ('we separately test Sonnet 3.7 and Sonnet 4.5 on SWE-agent with step limits of 8 and 16, respectively, which is a more generous budget that exceeds both our 6-expert-call cap and average expert cost'). Performance drops sharply (18.2% and 26%) at these matched-cost baselines." 335 }, 336 "benchmark_construct_validity": { 337 "applies": true, 338 "answer": false, 339 "justification": "No discussion of whether SWE-bench actually measures the claimed software engineering capability. The authors use SWE-bench Verified (a human-vetted subset), which improves reliability, but do not question whether unit-test pass rate captures real software repair quality." 
340 }, 341 "scaffold_confound_addressed": { 342 "applies": true, 343 "answer": true, 344 "justification": "Key comparisons use the same SWE-agent scaffold: SWE-Protégé-7B vs. SWE-agent-LM-7B and SWE-agent-LM-32B all use SWE-agent. Table 1 clearly labels the 'System' column for each result. The authors note their method uses 'the off-the-shelf SWE-agent framework exactly as used in SWE-smith.'" 345 } 346 }, 347 "data_leakage": { 348 "temporal_leakage_addressed": { 349 "applies": true, 350 "answer": true, 351 "justification": "Section 4.1 addresses temporal aspects: 'this subset was released after the expert models became available (e.g., Sonnet 3.7 was released on February 24, 2025, while SWE-smith was released on April 29, 2025), reducing the likelihood of overlap.' Held-out evaluation uses newer tasks disjoint from training." 352 }, 353 "feature_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "During training, the expert has access to the gold patch for trajectory generation (§2.2). During evaluation, the expert 'no longer has access to the gold patch and unit tests.' While this is documented, the paper does not discuss whether the SLM could have learned to extract gold-patch-derived signals from the expert's trained behavior." 357 }, 358 "non_independence_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "The paper does not discuss whether SWE-smith training tasks and SWE-bench Verified share underlying repositories or structurally similar problems. Both draw from GitHub repositories but potential overlap in code structure or issue patterns is not analyzed." 362 }, 363 "leakage_detection_method": { 364 "applies": true, 365 "answer": true, 366 "justification": "Concrete method applied: evaluation on 'a held-out SWE-smith-style subset of 400 tasks that was explicitly excluded from the trajectory-generation mixture' with temporal separation (subset released after expert model availability). 
Results (40.3% with expert, 32.0% without) confirm gains persist." 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "SWE-Protégé-7B achieves 42.4% Pass@1 on SWE-bench Verified, a +25.4% improvement over the prior SLM state of the art (SWE-agent-LM-7B at 17.0%).", 373 "evidence": "Table 1, §4. Main evaluation on SWE-bench Verified 500-instance subset with standard SWE-agent setup (75-step limit, no test-time sampling). Baseline independently reproduced.", 374 "supported": "strong" 375 }, 376 { 377 "claim": "Expert assistance is sparse: ~4 calls per task with expert tokens comprising only ~11% of total tokens.", 378 "evidence": "Fig. 3 decomposes per-task token usage; Fig. 8 shows mean expert calls stable at ~3.5-4.5. Expert reply statistics: median 500, p95 937, max 1,657 tokens (§4.1).", 379 "supported": "strong" 380 }, 381 { 382 "claim": "SWE-Protégé yields up to 8.2× lower expert cost than direct expert execution.", 383 "evidence": "Fig. 4 shows median per-task expert costs: $0.15 (Sonnet 4.5 via SWE-Protégé) vs $1.24 (direct Sonnet 4.5 execution) = 8.2× reduction. Sonnet 3.7: 4.2× reduction (§4.1).", 384 "supported": "strong" 385 }, 386 { 387 "claim": "Phase II RL consistently improves performance over SFT checkpoints, delivering an average gain of 3.4%.", 388 "evidence": "Table 2: +1.2% (Sonnet 3.7), +6.2% (Sonnet 4.5), +2.8% (Opus 4.1) absolute improvement over corresponding SFT checkpoints.", 389 "supported": "strong" 390 }, 391 { 392 "claim": "Phase II RL eliminates degenerative looping: only 0.8% of trajectories have repeated-action runs longer than 10 steps after RL, compared to 31.0% after SFT.", 393 "evidence": "Fig. 6 quantifies looping across models. Fig. 8 and 9 show loop penalty approaching 0 and loop-negative rate dropping to near-zero during RL training.", 394 "supported": "strong" 395 }, 396 { 397 "claim": "Performance scales monotonically with training data for SWE-Protégé but plateaus/regresses for SWE-agent-LM-7B.", 398 "evidence": "Fig. 
2 shows SWE-Protégé scaling from 19.0% (1.3k trajectories) to 33.4% (4.8k) with Sonnet 3.7. SWE-agent-LM-7B plateaus at 17.0% (2.4k) then regresses to 11.8% (5.0k).", 399 "supported": "moderate" 400 }, 401 { 402 "claim": "Gains from SWE-Protégé persist beyond the training distribution on held-out tasks.", 403 "evidence": "Contamination study (§4.1): 7B model achieves 40.3% with Sonnet 3.7 on 400 held-out tasks (vs 32.0% without expert), 32B variant achieves 43.0% with Sonnet 3.7 (vs 41.5% without).", 404 "supported": "moderate" 405 } 406 ], 407 "red_flags": [ 408 { 409 "flag": "No error bars or variance reporting on main results", 410 "detail": "All main results (Table 1, Table 2) are single-run Pass@1 with no confidence intervals, standard deviations, or multiple-seed evaluations. On a 500-instance benchmark, sampling variance could be substantial. The paper explicitly states single rollouts are used." 411 }, 412 { 413 "flag": "Expert has gold patch access during training data generation", 414 "detail": "During trajectory generation (§2.2), the expert model has access to the ground-truth solution, producing '38% more usable trajectories.' While the paper documents this and the expert loses gold patch access during evaluation, this creates an asymmetry that could inflate training quality beyond what's achievable in real deployment." 415 }, 416 { 417 "flag": "Company employees evaluating a method favorable to their interests", 418 "detail": "Most authors are from Meta, which has a strategic interest in demonstrating that small open-weight models can compete with larger proprietary models. The paper proposes a paradigm where a cheap SLM uses expensive frontier models sparingly — aligning with Meta's open-weight model strategy. No conflict of interest statement is included." 
419 } 420 ], 421 "cited_papers": [ 422 { 423 "title": "SWE-bench: Can language models resolve real-world github issues?", 424 "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"], 425 "year": 2023, 426 "arxiv_id": "2310.06770", 427 "relevance": "Foundational benchmark for evaluating LM agents on real-world software engineering tasks; the primary evaluation benchmark used in this paper." 428 }, 429 { 430 "title": "SWE-smith: Scaling data for software engineering agents", 431 "authors": ["John Yang", "Kilian Lieret", "Carlos E Jimenez", "Alexander Wettig"], 432 "year": 2025, 433 "arxiv_id": "2504.21798", 434 "relevance": "Prior state-of-the-art SLM training approach for SWE-bench; provides the training task dataset and primary baseline for SWE-Protégé." 435 }, 436 { 437 "title": "SWE-agent: Agent-computer interfaces enable automated software engineering", 438 "authors": ["John Yang", "Carlos E Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"], 439 "year": 2024, 440 "relevance": "The agent scaffold used in SWE-Protégé; establishes the ReAct-style tool-use interface for software engineering tasks." 441 }, 442 { 443 "title": "SWE-RL: Advancing llm reasoning via reinforcement learning on open software evolution", 444 "authors": ["Yuxiang Wei", "Olivier Duchenne", "Jade Copet"], 445 "year": 2025, 446 "arxiv_id": "2502.18449", 447 "relevance": "RL approach for software agents using Llama 3 at large compute scale; contrasts with SWE-Protégé's lightweight approach." 448 }, 449 { 450 "title": "Training Software Engineering Agents and Verifiers with SWE-Gym", 451 "authors": ["Jiayi Pan", "Xingyao Wang", "Graham Neubig"], 452 "year": 2024, 453 "arxiv_id": "2412.21139", 454 "relevance": "Open training environment for SWE agents at 7B/32B scales; provides RL training tasks used in SWE-Protégé Phase II." 
455 }, 456 { 457 "title": "CWM: An open-weights llm for research on code generation with world models", 458 "authors": ["Jade Copet", "Quentin Carbonneaux", "Gal Cohen"], 459 "year": 2025, 460 "arxiv_id": "2510.02387", 461 "relevance": "End-to-end trained 32B code agent with test-time scaling; represents the high-compute end of the spectrum SWE-Protégé aims to make more efficient." 462 }, 463 { 464 "title": "R2E-Gym: Procedural environments and hybrid verifiers for scaling open-weights swe agents", 465 "authors": ["Naman Jain", "Jaskirat Singh", "Manish Shetty"], 466 "year": 2025, 467 "arxiv_id": "2504.07164", 468 "relevance": "Open-weight SWE agent training with procedural environments; baseline comparison in Table 1." 469 }, 470 { 471 "title": "RouteLLM: Learning to route llms with preference data", 472 "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu"], 473 "year": 2024, 474 "arxiv_id": "2406.18665", 475 "relevance": "Learned LLM routing approach using preference data; represents the model routing paradigm that SWE-Protégé extends to multi-turn agentic settings." 476 }, 477 { 478 "title": "FrugalGPT: How to use large language models while reducing cost and improving performance", 479 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 480 "year": 2023, 481 "arxiv_id": "2305.05176", 482 "relevance": "Cost-efficient LLM usage through cascading and routing; foundational work in the model routing space that SWE-Protégé relates to." 483 }, 484 { 485 "title": "Small language models are the future of agentic ai", 486 "authors": ["Peter Belcak", "Greg Heinrich", "Shizhe Diao"], 487 "year": 2025, 488 "arxiv_id": "2506.02153", 489 "relevance": "Position paper on SLMs as practical agents; defines the ≤10B parameter threshold used in SWE-Protégé." 
490 }, 491 { 492 "title": "Lingma SWE-GPT: An open development-process-centric language model for automated software improvement", 493 "authors": ["Yingwei Ma", "Rongyu Cao", "Yongchang Cao"], 494 "year": 2024, 495 "arxiv_id": "2411.00622", 496 "relevance": "Development-process-centric training for SWE agents at 7B/72B scale; prior 7B-class SWE system baseline." 497 }, 498 { 499 "title": "SWE-Fixer: Training open-source llms for effective and efficient github issue resolution", 500 "authors": ["Chengxing Xie", "Bowen Li", "Chang Gao"], 501 "year": 2025, 502 "arxiv_id": "2501.05040", 503 "relevance": "Specialized retriever+editor architecture for SWE tasks; alternative approach to training SWE agents." 504 } 505 ] 506 }