scan-v5.json (24569B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Agentic Bug Reproduction for Effective Automated Program Repair at Google", 6 "authors": [ 7 "Runxiang Cheng", 8 "Michele Tufano", 9 "Jürgen Cito", 10 "José Cambronero", 11 "Pat Rondon", 12 "Renyao Wei", 13 "Aaron Sun", 14 "Satish Chandra" 15 ], 16 "year": 2025, 17 "venue": "arXiv.org", 18 "arxiv_id": "2502.01821", 19 "doi": "10.48550/arXiv.2502.01821" 20 }, 21 "checklist": { 22 "claims_and_evidence": { 23 "abstract_claims_supported": { 24 "applies": true, 25 "answer": true, 26 "justification": "All key abstract claims (28% vs 10% plausible BRT rate, 30% more bugs fixed with BRTs, 70% top-1 EPR precision) are directly supported by Table 2, Figure 3, and Figure 5 respectively.", 27 "source": "haiku" 28 }, 29 "causal_claims_justified": { 30 "applies": true, 31 "answer": false, 32 "justification": "The claim that BRTs cause improved APR performance is tested on only 23 bugs with no statistical significance testing; the small sample makes causal inference inadequate despite the controlled within-subject comparison.", 33 "source": "haiku" 34 }, 35 "generalization_bounded": { 36 "applies": true, 37 "answer": true, 38 "justification": "Section 7 explicitly acknowledges the study focuses exclusively on Google's internal environment and that generalizability to other industrial settings 'requires further investigation.'", 39 "source": "haiku" 40 }, 41 "alternative_explanations_discussed": { 42 "applies": true, 43 "answer": false, 44 "justification": "The paper controls for LLM differences but does not discuss whether BRT Agent's advantage over LIBRO stems from the agent scaffolding, code search, or the fine-tuned LLM—these factors are fully confounded with no ablation.", 45 "source": "haiku" 46 }, 47 "proxy_outcome_distinction": { 48 "applies": true, 49 "answer": true, 50 "justification": "The paper defines plausible BRTs (F→P behavior) as a proxy and acknowledges in threats to validity that this 
metric 'may not fully capture all aspects of a BRT, such as its readability or maintainability.'", 51 "source": "haiku" 52 } 53 }, 54 "limitations_and_scope": { 55 "limitations_section_present": { 56 "applies": true, 57 "answer": true, 58 "justification": "Section 7 'Threats to Validity' is a dedicated section covering Internal, External, and Construct validity with specific subsections.", 59 "source": "haiku" 60 }, 61 "threats_to_validity_specific": { 62 "applies": true, 63 "answer": true, 64 "justification": "Specific threats include the small 80-bug dataset limiting subgroup analysis, potential implementation bias in the LIBRO adaptation, LLM non-determinism, and Google-specific generalizability limits—these go beyond boilerplate.", 65 "source": "haiku" 66 }, 67 "scope_boundaries_stated": { 68 "applies": true, 69 "answer": true, 70 "justification": "The paper explicitly states findings are limited to Google's internal environment and that EPR is an indirect measure that may not always correlate with human-judged fix correctness.", 71 "source": "haiku" 72 } 73 }, 74 "conflicts_of_interest": { 75 "funding_disclosed": { 76 "applies": true, 77 "answer": false, 78 "justification": "No explicit funding disclosure statement appears anywhere in the paper.", 79 "source": "haiku" 80 }, 81 "affiliations_disclosed": { 82 "applies": true, 83 "answer": true, 84 "justification": "Author affiliations (UIUC, Google, TU Wien) are disclosed; a footnote clarifies that Cheng and Cito conducted the research at Google.", 85 "source": "haiku" 86 }, 87 "funder_independent_of_outcome": { 88 "applies": true, 89 "answer": false, 90 "justification": "The majority of authors are Google employees evaluating Google's own internal tools (Passerine APR system, proprietary fine-tuned Gemini), creating a direct conflict of interest with the outcome.", 91 "source": "haiku" 92 }, 93 "financial_interests_declared": { 94 "applies": true, 95 "answer": false, 96 "justification": "No competing interests 
or financial interests declaration appears in the paper.", 97 "source": "haiku" 98 } 99 }, 100 "scope_and_framing": { 101 "key_terms_defined": { 102 "applies": true, 103 "answer": true, 104 "justification": "BRT is formally defined in Section 2.1 with precise F→P behavior criteria; 'candidate BRT,' 'plausible BRT,' and EPR are all precisely defined in Section 5.2.", 105 "source": "haiku" 106 }, 107 "intended_contribution_clear": { 108 "applies": true, 109 "answer": true, 110 "justification": "Three explicit contributions are stated: BRT Agent system and comparison with LIBRO, assessment of BRT impact on APR (Passerine), and the EPR metric for fix selection.", 111 "source": "haiku" 112 }, 113 "engagement_with_prior_work": { 114 "applies": true, 115 "answer": true, 116 "justification": "Section 3 provides detailed comparison with LIBRO, SWE-Agent+, and LLM test generation literature, explicitly situating differences in industrial context and usefulness of generated BRTs.", 117 "source": "haiku" 118 } 119 } 120 }, 121 "type_checklist": { 122 "empirical": { 123 "artifacts": { 124 "code_released": { 125 "applies": true, 126 "answer": false, 127 "justification": "The system is built on proprietary Google infrastructure; no code is released and no promise of future release is mentioned.", 128 "source": "haiku" 129 }, 130 "data_released": { 131 "applies": true, 132 "answer": false, 133 "justification": "The evaluation dataset is from Google's internal issue tracking system (GITS) and is not publicly available.", 134 "source": "haiku" 135 }, 136 "environment_specified": { 137 "applies": true, 138 "answer": false, 139 "justification": "No environment specifications (requirements, Docker, etc.) 
are provided; the system depends on proprietary Google infrastructure.", 140 "source": "haiku" 141 }, 142 "reproduction_instructions": { 143 "applies": true, 144 "answer": false, 145 "justification": "No reproduction instructions are provided; complete dependency on Google's internal infrastructure makes external reproduction impossible.", 146 "source": "haiku" 147 } 148 }, 149 "statistical_methodology": { 150 "confidence_intervals_or_error_bars": { 151 "applies": true, 152 "answer": false, 153 "justification": "Main results (28% vs 10% plausible BRT rate, 70% top-1 EPR precision) are reported as point estimates without any confidence intervals or error bars.", 154 "source": "haiku" 155 }, 156 "significance_tests": { 157 "applies": true, 158 "answer": false, 159 "justification": "No statistical significance tests are applied to any comparisons (BRT Agent vs LIBRO, with/without BRT for APR) despite making comparative claims on small samples.", 160 "source": "haiku" 161 }, 162 "effect_sizes_reported": { 163 "applies": true, 164 "answer": true, 165 "justification": "Absolute differences with baseline context are provided: 28% vs 10% plausible BRTs, 17/23 vs 13/23 bugs fixed, precision@K values with K varying—sufficient to assess magnitude.", 166 "source": "haiku" 167 }, 168 "sample_size_justified": { 169 "applies": true, 170 "answer": false, 171 "justification": "The 80-bug sample is acknowledged as a potential limitation but no power analysis or formal sample size justification is provided.", 172 "source": "haiku" 173 }, 174 "variance_reported": { 175 "applies": true, 176 "answer": false, 177 "justification": "Despite running 20 runs per bug to account for LLM stochasticity, no variance, standard deviation, or confidence intervals are reported for aggregate metrics.", 178 "source": "haiku" 179 } 180 }, 181 "evaluation_design": { 182 "baselines_included": { 183 "applies": true, 184 "answer": true, 185 "justification": "LIBRO, the state-of-the-art BRT generation 
approach, is adapted to Google's environment and used as the primary baseline for all BRT generation comparisons.", 186 "source": "haiku" 187 }, 188 "baselines_contemporary": { 189 "applies": true, 190 "answer": true, 191 "justification": "LIBRO (ICSE 2023) is the most directly comparable recent approach; SWT-Bench (NeurIPS 2024) results for SWE-Agent+ are referenced for broader context.", 192 "source": "haiku" 193 }, 194 "ablation_study": { 195 "applies": true, 196 "answer": false, 197 "justification": "No ablation study isolates the contribution of individual BRT Agent components (reasoning LLM, fine-tuned code-editing LLM, code search, ReAct scaffolding).", 198 "source": "haiku" 199 }, 200 "multiple_metrics": { 201 "applies": true, 202 "answer": true, 203 "justification": "Multiple metrics are used: candidate BRT rate, plausible BRT rate, candidate-to-plausible rate, bugs fixed, steps to fix, and precision/recall/F1/MRR for EPR.", 204 "source": "haiku" 205 }, 206 "human_evaluation": { 207 "applies": true, 208 "answer": true, 209 "justification": "Two authors manually inspect all plausible BRT patches against oracle BRTs for semantic equivalence, with a third author resolving disagreements.", 210 "source": "haiku" 211 }, 212 "held_out_test_set": { 213 "applies": true, 214 "answer": true, 215 "justification": "The 80 production bugs with ground truth oracle BRTs serve as a held-out evaluation set; the code-editing LLM's training cutoff explicitly predates all evaluated bugs.", 216 "source": "haiku" 217 }, 218 "per_category_breakdown": { 219 "applies": true, 220 "answer": true, 221 "justification": "Table 3 provides plausible BRT rates broken down by 7 programming languages (Java, C++, Go, Python, Kotlin, Dart, TypeScript) for both techniques.", 222 "source": "haiku" 223 }, 224 "failure_cases_discussed": { 225 "applies": true, 226 "answer": true, 227 "justification": "Failure modes are discussed: LIBRO fails mainly via build errors it cannot recover from; BRT Agent 
modifies existing tests in 11% of cases; 21% of BRT Agent runs exhaust the step limit.", 228 "source": "haiku" 229 }, 230 "negative_results_reported": { 231 "applies": true, 232 "answer": true, 233 "justification": "Dart achieves 0% plausible BRT rate for both LIBRO and BRT Agent; EPR recall limitations are quantified and discussed as a trade-off.", 234 "source": "haiku" 235 } 236 }, 237 "setup_transparency": { 238 "model_versions_specified": { 239 "applies": true, 240 "answer": false, 241 "justification": "Models are described only as 'a Gemini model fine-tuned on Google's internal code' and 'a publicly available Gemini'—no version numbers or snapshot dates are given.", 242 "source": "haiku" 243 }, 244 "prompts_provided": { 245 "applies": true, 246 "answer": false, 247 "justification": "Prompt structure is described at a high level (bug report + buggy file + test file) and the meta task description string is quoted, but full prompt text is not provided verbatim.", 248 "source": "haiku" 249 }, 250 "hyperparameters_reported": { 251 "applies": true, 252 "answer": true, 253 "justification": "Temperature (0.7 for LIBRO, 0.2 for BRT Agent), top-P (0.95), number of runs (50 for LIBRO, 20 for BRT Agent), and step limit (25) are all explicitly reported.", 254 "source": "haiku" 255 }, 256 "scaffolding_described": { 257 "applies": true, 258 "answer": true, 259 "justification": "Section 4.2 details BRT Agent's ReAct-based loop, its full action set (Table 1), change description generation process, and termination conditions with sufficient specificity.", 260 "source": "haiku" 261 }, 262 "data_preprocessing_documented": { 263 "applies": true, 264 "answer": false, 265 "justification": "Dataset construction is described only as 'automated extraction and filtering phases as well as manual curation' with full details deferred to the concurrent Passerine paper [30].", 266 "source": "haiku" 267 } 268 }, 269 "data_integrity": { 270 "raw_data_available": { 271 "applies": true, 272 
"answer": false, 273 "justification": "The bug dataset is from Google's internal GITS and is not publicly accessible.", 274 "source": "haiku" 275 }, 276 "data_collection_described": { 277 "applies": true, 278 "answer": true, 279 "justification": "Section 5.1.1 describes that bugs were human-reported, human-fixed, sourced from GITS since June 2024, across seven languages, with manual curation to ensure fixes address root causes.", 280 "source": "haiku" 281 }, 282 "recruitment_methods_described": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants were recruited; bugs are drawn from an internal issue tracker.", 286 "source": "haiku" 287 }, 288 "data_pipeline_documented": { 289 "applies": true, 290 "answer": false, 291 "justification": "The full pipeline from collection to analysis is not documented; automated extraction and filtering details are deferred to the Passerine paper [30].", 292 "source": "haiku" 293 } 294 }, 295 "contamination": { 296 "training_cutoff_stated": { 297 "applies": true, 298 "answer": true, 299 "justification": "Section 4.2.3 explicitly states the code-editing LLM's training data cutoff predates the reporting of all bugs analyzed in the study.", 300 "source": "haiku" 301 }, 302 "train_test_overlap_discussed": { 303 "applies": true, 304 "answer": true, 305 "justification": "The paper explicitly states training data excludes all bugs, code changes, and BRTs in the evaluation set, 'preventing any potential data leakage.'", 306 "source": "haiku" 307 }, 308 "benchmark_contamination_addressed": { 309 "applies": true, 310 "answer": true, 311 "justification": "Bugs are from Google's internal tracker since June 2024 and training cutoff is stated to predate all evaluation bugs, directly addressing contamination.", 312 "source": "haiku" 313 } 314 }, 315 "human_studies": { 316 "pre_registered": { 317 "applies": false, 318 "answer": false, 319 "justification": "No human participants were recruited for this study.", 320 
"source": "haiku" 321 }, 322 "irb_or_ethics_approval": { 323 "applies": false, 324 "answer": false, 325 "justification": "No human participants were recruited for this study.", 326 "source": "haiku" 327 }, 328 "demographics_reported": { 329 "applies": false, 330 "answer": false, 331 "justification": "No human participants were recruited for this study.", 332 "source": "haiku" 333 }, 334 "inclusion_exclusion_criteria": { 335 "applies": false, 336 "answer": false, 337 "justification": "No human participants were recruited for this study.", 338 "source": "haiku" 339 }, 340 "randomization_described": { 341 "applies": false, 342 "answer": false, 343 "justification": "No human participants were recruited for this study.", 344 "source": "haiku" 345 }, 346 "blinding_described": { 347 "applies": false, 348 "answer": false, 349 "justification": "No human participants were recruited for this study.", 350 "source": "haiku" 351 }, 352 "attrition_reported": { 353 "applies": false, 354 "answer": false, 355 "justification": "No human participants were recruited for this study.", 356 "source": "haiku" 357 } 358 }, 359 "cost_and_practicality": { 360 "inference_cost_reported": { 361 "applies": true, 362 "answer": false, 363 "justification": "No inference cost or latency figures are reported despite running 1,600 BRT Agent runs (80 bugs × 20 runs) and 4,000 LIBRO calls.", 364 "source": "haiku" 365 }, 366 "compute_budget_stated": { 367 "applies": true, 368 "answer": false, 369 "justification": "No compute budget or resource requirements are stated anywhere in the paper.", 370 "source": "haiku" 371 } 372 } 373 } 374 }, 375 "claims": [ 376 { 377 "claim": "BRT Agent achieves 28% plausible BRT generation rate vs 10% by adapted LIBRO on 80 Google production bugs", 378 "evidence": "Table 2: BRT Agent 85% candidate / 28% plausible; LIBRO 41% candidate / 10% plausible", 379 "supported": "strong" 380 }, 381 { 382 "claim": "Providing generated BRTs to Passerine results in ~30% more bugs with 
plausible fixes (74% vs 57%)", 383 "evidence": "Figure 3: 17/23 bugs fixed with BRT vs 13/23 without on the 23-bug subset where BRT Agent succeeded", 384 "supported": "moderate" 385 }, 386 { 387 "claim": "EPR correctly selects a plausible fix from 20 APR-generated candidates in 70% of cases at top-1 ranking", 388 "evidence": "Figure 5: precision@1 = 0.7, MRR@1 = 0.7", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "67% of plausible BRTs generated by BRT Agent are semantically equivalent or identical to oracle BRTs", 393 "evidence": "Manual inspection: 19% identical + 48% semantically equivalent = 67% of plausible BRT patches", 394 "supported": "moderate" 395 }, 396 { 397 "claim": "BRT Agent generalizes across 6 of 7 programming languages; only Dart produces 0% results", 398 "evidence": "Table 3 language breakdown showing non-zero rates for Java (28%), C++ (16%), Go (17%), Python (45%), Kotlin (50%), TypeScript (100%)", 399 "supported": "strong" 400 }, 401 { 402 "claim": "Passerine takes fewer agent steps to generate plausible fixes when provided with BRTs", 403 "evidence": "Figure 4 shows a leftward shift in step count distribution when BRT is provided as input", 404 "supported": "moderate" 405 } 406 ], 407 "methodology_tags": [ 408 "benchmark-eval", 409 "case-study" 410 ], 411 "key_findings": "BRT Agent, combining a ReAct-based reasoning LLM with a proprietary fine-tuned code-editing LLM, achieves 28% plausible bug reproduction test generation on 80 Google production bugs—substantially outperforming adapted LIBRO (10%). Generated BRTs improve Google's APR system (Passerine) from fixing 57% to 74% of bugs on a 23-bug subset, with fewer agent steps required. The proposed Ensemble Pass Rate (EPR) metric achieves 70% top-1 precision for selecting correct fixes from pools of 20 APR-generated candidates. 
Both BRT Agent and LIBRO fail completely on Dart bugs, and 11% of BRT Agent's plausible patches are invalid due to unintended modification of existing tests.", 412 "red_flags": [ 413 { 414 "flag": "No statistical significance tests", 415 "detail": "All comparisons (BRT Agent vs LIBRO, with/without BRT for APR) are reported as raw percentages without significance tests or confidence intervals despite the small sample sizes making chance effects plausible." 416 }, 417 { 418 "flag": "Tiny APR evaluation sample", 419 "detail": "RQ2 and RQ3 are evaluated on only 23 bugs (those where BRT Agent happened to succeed), making the 30% improvement claim fragile and potentially inflated." 420 }, 421 { 422 "flag": "No ablation study", 423 "detail": "The paper never isolates whether BRT Agent's advantage over LIBRO comes from the agent scaffolding, fine-tuned LLM, code search, or their combination—all factors are fully confounded." 424 }, 425 { 426 "flag": "Google-only, entirely non-reproducible evaluation", 427 "detail": "All evaluation uses proprietary Google infrastructure, internal bugs, and internal LLMs; no external party can reproduce any result." 428 }, 429 { 430 "flag": "Unspecified model versions", 431 "detail": "Models are described only as 'a Gemini model fine-tuned on Google's internal code' and 'a publicly available Gemini' without version numbers or snapshot dates." 432 }, 433 { 434 "flag": "Google employees evaluating Google systems", 435 "detail": "Majority of authors are Google employees evaluating Google's own APR system (Passerine) and Google's proprietary LLMs with no independent validation." 
436 } 437 ], 438 "cited_papers": [ 439 { 440 "title": "Large Language Models are Few-shot Testers: Exploring LLM-based General Bug Reproduction (LIBRO)", 441 "relevance": "Primary baseline adapted and compared against in all BRT generation experiments" 442 }, 443 { 444 "title": "SWT-Bench: Testing and Validating Real-World Bug-Fixes with Code Agents", 445 "relevance": "Most recent BRT generation benchmark; SWE-Agent+ results used for broader context comparison" 446 }, 447 { 448 "title": "Evaluating Agent-based Program Repair at Google (Passerine)", 449 "relevance": "Concurrent work describing the APR system evaluated and the same 80-bug dataset" 450 }, 451 { 452 "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering", 453 "relevance": "Agent framework conceptually similar to BRT Agent; SWE-Agent+ is a direct point of comparison" 454 }, 455 { 456 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 457 "relevance": "Theoretical framework underlying BRT Agent's reasoning loop design" 458 }, 459 { 460 "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs", 461 "relevance": "Standard benchmark used to evaluate LIBRO; reference point for comparing BRT generation performance" 462 }, 463 { 464 "title": "Swe-bench: Can Language Models Resolve Real-World GitHub Issues?", 465 "relevance": "Major benchmark for evaluating code agents; provides context for the field's evaluation practices" 466 }, 467 { 468 "title": "Evaluating Diverse Large Language Models for Automatic and General Bug Reproduction", 469 "relevance": "Extended LIBRO evaluation providing additional baseline context" 470 } 471 ], 472 "engagement_factors": { 473 "practical_relevance": { 474 "score": 3, 475 "justification": "Demonstrates industrial-scale BRT generation at Google with concrete improvement in APR effectiveness—directly actionable for engineering teams." 
476 }, 477 "surprise_contrarian": { 478 "score": 1, 479 "justification": "Expected result that agent-based approach outperforms few-shot baseline; the 0% Dart result and the EPR precision-recall trade-offs are modestly interesting." 480 }, 481 "fear_safety": { 482 "score": 0, 483 "justification": "No AI safety or risk concerns; purely a software engineering productivity paper." 484 }, 485 "drama_conflict": { 486 "score": 0, 487 "justification": "No controversy or conflict angle; straightforward industrial evaluation." 488 }, 489 "demo_ability": { 490 "score": 1, 491 "justification": "The BRT generation concept is demonstrable in open-source analogues (SWE-agent, LIBRO) but the actual Google system requires proprietary infrastructure." 492 }, 493 "brand_recognition": { 494 "score": 3, 495 "justification": "Google authorship, Google production bugs, and evaluation on Gemini models provide strong brand recognition for the HN/tech audience." 496 } 497 }, 498 "hn_data": { 499 "threads": [ 500 { 501 "hn_id": "43876276", 502 "title": "Agentic Bug Reproduction for Effective Automated Program Repair at Google", 503 "points": 2, 504 "comments": 0, 505 "url": "https://news.ycombinator.com/item?id=43876276", 506 "created_at": "2025-05-03T01:54:39Z" 507 }, 508 { 509 "hn_id": "45599001", 510 "title": "Agentic Bug Reproduction for Effective Automated Program Repair at Google", 511 "points": 1, 512 "comments": 0, 513 "url": "https://news.ycombinator.com/item?id=45599001", 514 "created_at": "2025-10-15T22:20:39Z" 515 } 516 ], 517 "top_points": 2, 518 "total_points": 3, 519 "total_comments": 0 520 } 521 }