scan.json (28582B)
1 { 2 "scan_version": 3, 3 "active_modules": [ 4 "experimental_rigor", 5 "data_leakage" 6 ], 7 "paper": { 8 "title": "Agentic Bug Reproduction for Effective Automated Program Repair at Google", 9 "authors": [ 10 "Runxiang Cheng", 11 "Michele Tufano", 12 "Jürgen Cito", 13 "José Cambronero", 14 "Pat Rondon", 15 "Renyao Wei", 16 "Aaron Sun", 17 "Satish Chandra" 18 ], 19 "year": 2025, 20 "arxiv_id": "2502.01821" 21 }, 22 "methodology_tags": [ 23 "benchmark-eval", 24 "case-study" 25 ], 26 "key_findings": "BRT Agent achieves 28% plausible BRT generation rate on 80 Google production bugs, compared to 10% by LIBRO. Providing generated BRTs to Passerine APR system results in 30% more bugs with plausible fixes (17/23 vs 13/23). The proposed Ensemble Pass Rate (EPR) metric correctly selects a plausible fix from 20 candidates in 70% of cases at top-1 ranking. Manual inspection shows 86% of plausible BRT patches are valid (identical or semantically equivalent to oracle BRTs).", 27 "claims": [ 28 { 29 "claim": "BRT Agent achieves 28% plausible BRT generation rate compared to 10% by LIBRO on 80 Google production bugs.", 30 "evidence": "Table 2 in §6.1 shows candidate BRTs (85% vs 41%), plausible BRTs (28% vs 10%), and candidate-to-plausible rate (34% vs 24%).", 31 "supported": "strong" 32 }, 33 { 34 "claim": "Providing generated BRTs to Passerine results in 30% more bugs with plausible fixes.", 35 "evidence": "§6.2 and Figure 3: 17/23 bugs fixed with BRT vs 13/23 without BRT. 
6 new unique bugs solved.", 36 "supported": "moderate" 37 }, 38 { 39 "claim": "EPR correctly selects a plausible fix from a pool of 20 candidates in 70% of cases based on top-1 ranking.", 40 "evidence": "§6.3.1 and Figure 5: precision@1 = 0.7, MRR = 0.7.", 41 "supported": "moderate" 42 }, 43 { 44 "claim": "86% of plausible generated BRT patches are valid (identical or semantically equivalent to oracle BRTs).", 45 "evidence": "§6.1.1 manual inspection: 19% identical, 48% semantically equivalent, 16% valid with irrelevant additions, 11% invalid.", 46 "supported": "strong" 47 }, 48 { 49 "claim": "Passerine takes fewer steps on average to generate plausible fixes when provided with a BRT.", 50 "evidence": "§6.2 and Figure 4 show leftward shift in step distribution. Probability of plausible fix given BRT used is 33% vs 2% when not used.", 51 "supported": "moderate" 52 } 53 ], 54 "red_flags": [ 55 { 56 "flag": "Google employees evaluating Google system", 57 "detail": "6 of 8 authors are Google employees. They evaluate BRT Agent and Passerine, both Google-internal systems. The conflict is disclosed via affiliations but not explicitly acknowledged as a potential bias." 58 }, 59 { 60 "flag": "Small evaluation dataset", 61 "detail": "Only 80 bugs for RQ1 and 23 bugs for RQ2/RQ3. The paper acknowledges this in threats to validity but the small N limits statistical confidence in the reported percentages." 62 }, 63 { 64 "flag": "No statistical significance tests", 65 "detail": "Comparisons between BRT Agent and LIBRO (28% vs 10%) are presented as raw percentages without any statistical tests despite small sample sizes where differences could be due to chance." 66 }, 67 { 68 "flag": "Non-reproducible due to proprietary infrastructure", 69 "detail": "The approach relies on Google's internal codebase, fine-tuned Gemini models, and Passerine APR system. No external researcher can reproduce or verify any of these results." 
70 }, 71 { 72 "flag": "Upper-bound evaluation for RQ2", 73 "detail": "The paper acknowledges that RQ2 results represent an upper bound because they select plausible BRTs (determined using the ground truth fix) to provide to Passerine, which wouldn't be possible in practice." 74 } 75 ], 76 "checklist": { 77 "artifacts": { 78 "code_released": { 79 "applies": true, 80 "answer": false, 81 "justification": "No source code repository or archive is mentioned. The system is built on Google's internal infrastructure and no code is released." 82 }, 83 "data_released": { 84 "applies": true, 85 "answer": false, 86 "justification": "The dataset of 80 production bugs is from Google's internal issue tracking system (GITS) and is not publicly available." 87 }, 88 "environment_specified": { 89 "applies": true, 90 "answer": false, 91 "justification": "The paper mentions Google's internal development environment, Bazel build system, and fine-tuned Gemini models, but no specific environment specifications (versions, dependencies) are provided." 92 }, 93 "reproduction_instructions": { 94 "applies": true, 95 "answer": false, 96 "justification": "No reproduction instructions are provided. The entire system relies on Google's proprietary infrastructure." 97 } 98 }, 99 "statistical_methodology": { 100 "confidence_intervals_or_error_bars": { 101 "applies": true, 102 "answer": false, 103 "justification": "Results are reported as raw percentages (e.g., 28%, 10%) without confidence intervals or error bars despite small sample sizes." 104 }, 105 "significance_tests": { 106 "applies": true, 107 "answer": false, 108 "justification": "The paper claims BRT Agent 'significantly outperforms LIBRO' but provides no statistical significance tests. Comparisons are based on raw percentage differences." 
109 }, 110 "effect_sizes_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "Effect sizes are conveyed through percentage improvements with baseline context: 28% vs 10% plausible BRT rate, 17/23 vs 13/23 bugs fixed, precision@1 = 0.7. The reader can assess magnitude." 114 }, 115 "sample_size_justified": { 116 "applies": true, 117 "answer": false, 118 "justification": "No justification for why 80 bugs were used. The paper acknowledges the small size as a threat to validity but does not provide power analysis or other justification." 119 }, 120 "variance_reported": { 121 "applies": true, 122 "answer": false, 123 "justification": "Despite running LIBRO 50 times and BRT Agent 20 times per bug, no variance or standard deviation across runs is reported. Only aggregate percentages are shown." 124 } 125 }, 126 "evaluation_design": { 127 "baselines_included": { 128 "applies": true, 129 "answer": true, 130 "justification": "LIBRO is adapted and used as a baseline comparison (§4.1, Table 2). SWE-Agent+ is discussed as related work but not directly compared on the same dataset." 131 }, 132 "baselines_contemporary": { 133 "applies": true, 134 "answer": true, 135 "justification": "LIBRO (Kang et al., 2023) is a recent and relevant baseline for BRT generation. SWE-Agent+ (Mündler et al., 2024) is discussed but not directly evaluated on the same dataset due to Google's proprietary setting." 136 }, 137 "ablation_study": { 138 "applies": true, 139 "answer": false, 140 "justification": "No ablation study is conducted to isolate the contribution of individual BRT Agent components (e.g., fine-tuned code-editing LLM, ReAct reasoning, code search). The improvement over LIBRO could be due to any combination of factors." 
141 }, 142 "multiple_metrics": { 143 "applies": true, 144 "answer": true, 145 "justification": "Multiple metrics are used: candidate BRTs, plausible BRTs, candidate-to-plausible rate (RQ1); number of bugs fixed, steps to fix, plausibility given BRT usage (RQ2); precision, recall, F1, MRR (RQ3)." 146 }, 147 "human_evaluation": { 148 "applies": true, 149 "answer": true, 150 "justification": "Manual inspection of plausible BRTs by two authors with a third resolving disagreements (§6.1.1). They classify BRTs as identical, semantically equivalent, valid with irrelevant additions, or invalid." 151 }, 152 "held_out_test_set": { 153 "applies": true, 154 "answer": true, 155 "justification": "The code-editing LLM's training data cutoff predates all bugs in the evaluation, preventing data leakage (§4.2.3). The 80-bug dataset was constructed independently." 156 }, 157 "per_category_breakdown": { 158 "applies": true, 159 "answer": true, 160 "justification": "Table 3 provides per-language breakdown of plausible BRTs across 7 programming languages. Figure 2 shows per-step action distribution." 161 }, 162 "failure_cases_discussed": { 163 "applies": true, 164 "answer": true, 165 "justification": "§6.1.1 discusses failure modes: LIBRO's main failure is build errors it cannot recover from. 11% of BRT Agent's plausible patches are invalid due to modifying existing tests. Table 5 shows termination reasons including steps exhausted (21%) and framework exceptions (7%)." 166 }, 167 "negative_results_reported": { 168 "applies": true, 169 "answer": true, 170 "justification": "The paper reports that BRT Agent fails on 72% of bugs (only 28% plausible BRTs). Dart achieves 0% for both techniques. 21% of runs exhaust the step limit. The agent occasionally hallucinates non-existent actions (§6.1.2)." 
171 } 172 }, 173 "claims_and_evidence": { 174 "abstract_claims_supported": { 175 "applies": true, 176 "answer": true, 177 "justification": "Abstract claims (28% vs 10% BRT rate, 30% more bugs with plausible fixes, EPR top-1 70%) are all supported by Tables 2-3, Figures 3-6 in the results sections." 178 }, 179 "causal_claims_justified": { 180 "applies": true, 181 "answer": false, 182 "justification": "The paper claims BRT Agent 'significantly outperforms' LIBRO but lacks ablation to identify which component causes the improvement. The claim that BRTs 'improve' APR is supported by controlled comparison but on only 23 bugs without statistical testing." 183 }, 184 "generalization_bounded": { 185 "applies": true, 186 "answer": true, 187 "justification": "The paper explicitly bounds findings to Google's internal environment (§7 External Validity) and acknowledges that 'the generalizability of our findings to other industrial settings requires further investigation.'" 188 }, 189 "alternative_explanations_discussed": { 190 "applies": true, 191 "answer": true, 192 "justification": "§7 discusses alternative explanations: implementation bias in LIBRO adaptation, LLM differences, randomness. §6.2 acknowledges RQ2 results are an upper bound due to BRT selection with ground truth fixes." 193 }, 194 "proxy_outcome_distinction": { 195 "applies": true, 196 "answer": false, 197 "justification": "The paper measures 'plausible BRT generation rate' (tests that fail on buggy code and pass on fixed code) and frames this as BRT Agent being effective for 'automated program repair.' The gap between plausible BRTs (the proxy — tests that exhibit fail-to-pass behavior) and truly correct BRTs (tests that validate the actual fix intent) is partially addressed via manual inspection (86% valid), but the broader claim of 'effective APR' from 28% success rate is not qualified as a proxy for debugging productivity." 
198 } 199 }, 200 "setup_transparency": { 201 "model_versions_specified": { 202 "applies": true, 203 "answer": false, 204 "justification": "The paper says 'a Gemini model fine-tuned on Google's internal code' and 'a publicly available Gemini' without specifying exact model versions or snapshot dates for either." 205 }, 206 "prompts_provided": { 207 "applies": true, 208 "answer": false, 209 "justification": "The paper describes prompt elements (meta task description, change description, bug report) but does not provide actual prompt text. Only one example description is given: 'Add a test case that asserts the function returns null when given an empty input' (§4.2.3)." 210 }, 211 "hyperparameters_reported": { 212 "applies": true, 213 "answer": true, 214 "justification": "§5.1.2: LIBRO temperature 0.7, top P 0.95, 50 runs per bug. BRT Agent temperature 0.2, top P 0.95, 20 runs per bug, 25 max steps. Three synthetic examples in system prompt." 215 }, 216 "scaffolding_described": { 217 "applies": true, 218 "answer": true, 219 "justification": "§4.2 describes BRT Agent's workflow in detail: initialization, ReAct reasoning loop, action set (Table 1: cat, code_search, edit, bazel test, finish), code-editing LLM integration, observation handling, iteration, and termination conditions." 220 }, 221 "data_preprocessing_documented": { 222 "applies": true, 223 "answer": true, 224 "justification": "§5.1.1 describes dataset construction: 80 production bugs from GITS, extracted via automated extraction and filtering plus manual curation. Each bug verified to have genuine fix. Dataset from concurrent work [30]." 225 } 226 }, 227 "limitations_and_scope": { 228 "limitations_section_present": { 229 "applies": true, 230 "answer": true, 231 "justification": "§7 'Threats to Validity' is a dedicated section covering internal, external, and construct validity threats." 
232 }, 233 "threats_to_validity_specific": { 234 "applies": true, 235 "answer": true, 236 "justification": "§7 discusses specific threats: the 80-bug dataset size limiting generalizability, implementation bias in LIBRO adaptation, specific mitigation of using same Gemini models, EPR as indirect measure of fix correctness, metrics not capturing readability/maintainability." 237 }, 238 "scope_boundaries_stated": { 239 "applies": true, 240 "answer": true, 241 "justification": "§7 External Validity explicitly states findings are limited to Google's internal environment and that 'the specific tools, processes, and codebase characteristics may differ significantly from those in other companies.'" 242 } 243 }, 244 "data_integrity": { 245 "raw_data_available": { 246 "applies": true, 247 "answer": false, 248 "justification": "No raw data is available. The 80-bug dataset, generated BRTs, and experimental traces are all from Google's proprietary systems and are not released." 249 }, 250 "data_collection_described": { 251 "applies": true, 252 "answer": true, 253 "justification": "§5.1.1 describes the dataset: 80 production bugs from GITS, reported and fixed by human developers, recent (since June 2024), spanning 7 languages, constructed via automated extraction/filtering and manual curation [30]." 254 }, 255 "recruitment_methods_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in a study sense. The bugs come from Google's issue tracker, not a recruited sample." 259 }, 260 "data_pipeline_documented": { 261 "applies": true, 262 "answer": true, 263 "justification": "§5.1.1 documents the pipeline: bugs extracted from GITS → automated filtering → manual curation (verifying fixes address root cause) → final 80-bug dataset. Each sample includes GITS issue, ground truth fix with oracle BRT, buggy files, and test file." 
264 } 265 }, 266 "conflicts_of_interest": { 267 "funding_disclosed": { 268 "applies": true, 269 "answer": false, 270 "justification": "No funding disclosure or acknowledgments section is present in the paper." 271 }, 272 "affiliations_disclosed": { 273 "applies": true, 274 "answer": true, 275 "justification": "Author affiliations are clearly listed: 6 authors from Google, 1 from UIUC, 1 from TU Wien. Footnote notes Cheng and Cito conducted research at Google." 276 }, 277 "funder_independent_of_outcome": { 278 "applies": true, 279 "answer": false, 280 "justification": "Google employees are evaluating Google's internal systems (BRT Agent, Passerine). Google has a direct interest in demonstrating their APR systems work well. The funder is not independent of the outcome." 281 }, 282 "financial_interests_declared": { 283 "applies": true, 284 "answer": false, 285 "justification": "No competing interests or financial interests statement is present in the paper." 286 } 287 }, 288 "contamination": { 289 "training_cutoff_stated": { 290 "applies": true, 291 "answer": true, 292 "justification": "§4.2.3 states 'the code-editing LLM's training data excludes all bugs, code changes, and BRTs used in our empirical evaluation—its training data cutoff predates the reporting of all bugs analyzed in this study.'" 293 }, 294 "train_test_overlap_discussed": { 295 "applies": true, 296 "answer": true, 297 "justification": "§4.2.3 explicitly addresses that the training data cutoff predates all bugs, preventing data leakage. The bugs are recent (since June 2024)." 298 }, 299 "benchmark_contamination_addressed": { 300 "applies": true, 301 "answer": true, 302 "justification": "§4.2.3 addresses contamination by confirming temporal separation: training data cutoff predates the reporting of all 80 bugs (since June 2024)." 303 } 304 }, 305 "human_studies": { 306 "pre_registered": { 307 "applies": false, 308 "answer": false, 309 "justification": "No human participants in an experimental study. 
The manual inspection of BRTs is an evaluation methodology, not a human subjects study." 310 }, 311 "irb_or_ethics_approval": { 312 "applies": false, 313 "answer": false, 314 "justification": "No human participants in the study." 315 }, 316 "demographics_reported": { 317 "applies": false, 318 "answer": false, 319 "justification": "No human participants in the study." 320 }, 321 "inclusion_exclusion_criteria": { 322 "applies": false, 323 "answer": false, 324 "justification": "No human participants in the study." 325 }, 326 "randomization_described": { 327 "applies": false, 328 "answer": false, 329 "justification": "No human participants in the study." 330 }, 331 "blinding_described": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants in the study." 335 }, 336 "attrition_reported": { 337 "applies": false, 338 "answer": false, 339 "justification": "No human participants in the study." 340 } 341 }, 342 "cost_and_practicality": { 343 "inference_cost_reported": { 344 "applies": true, 345 "answer": false, 346 "justification": "No inference cost, API cost, or latency is reported despite running LIBRO 50 times per bug and BRT Agent 20 times per bug across 80 bugs (4000 + 1600 runs total)." 347 }, 348 "compute_budget_stated": { 349 "applies": true, 350 "answer": false, 351 "justification": "No total computational budget, GPU hours, or API spend is stated despite significant compute usage." 352 } 353 }, 354 "experimental_rigor": { 355 "seed_sensitivity_reported": { 356 "applies": true, 357 "answer": false, 358 "justification": "Despite running 20 (BRT Agent) and 50 (LIBRO) runs per bug to account for stochasticity, no per-bug variance or seed sensitivity analysis is reported. Only aggregate percentages across all bugs." 359 }, 360 "number_of_runs_stated": { 361 "applies": true, 362 "answer": true, 363 "justification": "§5.1.2 explicitly states: 50 runs per bug for LIBRO, 20 runs per bug for BRT Agent." 
364 }, 365 "hyperparameter_search_budget": { 366 "applies": true, 367 "answer": false, 368 "justification": "The paper mentions 'small-scale experiments on different prompt structures, paraphrases, and input' for LIBRO prompt crafting (§4.1) but does not quantify the search budget or configurations tried." 369 }, 370 "best_config_selection_justified": { 371 "applies": true, 372 "answer": false, 373 "justification": "Temperature and top-P values (0.7/0.95 for LIBRO, 0.2/0.95 for BRT Agent) are stated as following prior work [20, 30] but no justification for why these are optimal or how they were selected." 374 }, 375 "multiple_comparison_correction": { 376 "applies": false, 377 "answer": false, 378 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 379 }, 380 "self_comparison_bias_addressed": { 381 "applies": true, 382 "answer": false, 383 "justification": "Google employees evaluate their own BRT Agent system against their adaptation of LIBRO. The paper does not acknowledge self-comparison bias despite the authors implementing both the system and the baseline adaptation." 384 }, 385 "compute_budget_vs_performance": { 386 "applies": true, 387 "answer": false, 388 "justification": "BRT Agent uses a reasoning LLM + code-editing LLM with up to 25 iterative steps, while LIBRO uses a single LLM call. This compute difference is not discussed or controlled for." 389 }, 390 "benchmark_construct_validity": { 391 "applies": true, 392 "answer": true, 393 "justification": "§7 Construct Validity discusses limitations of the BRT evaluation metrics, noting they 'may not fully capture all aspects of a BRT, such as its readability, maintainability.' Manual inspection is used to supplement automated metrics." 
394 }, 395 "scaffold_confound_addressed": { 396 "applies": true, 397 "answer": false, 398 "justification": "BRT Agent uses a multi-step agentic scaffold (ReAct reasoning + code-editing LLM + bazel test feedback loop) while LIBRO uses a single LLM call. The paper does not control for or discuss the scaffolding difference — improvements could stem from the iterative scaffold rather than the fine-tuned code-editing LLM. §7 mentions LLM differences but not the scaffolding confound." 399 } 400 }, 401 "data_leakage": { 402 "temporal_leakage_addressed": { 403 "applies": true, 404 "answer": true, 405 "justification": "§4.2.3 explicitly states the code-editing LLM's training data cutoff predates reporting of all bugs (since June 2024), addressing temporal leakage." 406 }, 407 "feature_leakage_addressed": { 408 "applies": true, 409 "answer": true, 410 "justification": "The experimental setup is transparent about what information each technique receives. LIBRO gets buggy files + test file; BRT Agent gets only buggy files. Ground truth test file is not given to BRT Agent (§5.1.2)." 411 }, 412 "non_independence_addressed": { 413 "applies": true, 414 "answer": false, 415 "justification": "No discussion of whether the 80 bugs are independent (e.g., from different projects, different codebases) or whether bugs from the same project could share structural similarities affecting results." 416 }, 417 "leakage_detection_method": { 418 "applies": true, 419 "answer": true, 420 "justification": "Temporal separation is used as a concrete leakage prevention method: training data cutoff predates all evaluated bugs (§4.2.3)." 
421 } 422 } 423 }, 424 "cited_papers": [ 425 { 426 "title": "Large Language Models are Few-shot Testers: Exploring LLM-based General Bug Reproduction", 427 "authors": [ 428 "Sungmin Kang", 429 "Juyeon Yoon", 430 "Shin Yoo" 431 ], 432 "year": 2023, 433 "relevance": "LIBRO — the primary baseline for BRT generation using LLMs, evaluated and compared against in this paper." 434 }, 435 { 436 "title": "Evaluating Agent-based Program Repair at Google", 437 "authors": [ 438 "Pat Rondon", 439 "Renyao Wei", 440 "José Cambronero", 441 "Jürgen Cito", 442 "Aaron Sun", 443 "Siddhant Sanyam", 444 "Michele Tufano", 445 "Satish Chandra" 446 ], 447 "year": 2025, 448 "relevance": "Passerine APR system at Google — the industrial APR system integrated with BRT Agent in this paper." 449 }, 450 { 451 "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering", 452 "authors": [ 453 "John Yang", 454 "Carlos E Jimenez", 455 "Alexander Wettig", 456 "Kilian Lieret", 457 "Shunyu Yao", 458 "Karthik Narasimhan", 459 "Ofir Press" 460 ], 461 "year": 2024, 462 "arxiv_id": "2405.15793", 463 "relevance": "SWE-Agent — a prominent LLM agent for software engineering that was adapted for BRT generation as SWE-Agent+." 464 }, 465 { 466 "title": "SWT-Bench: Testing and Validating Real-World Bug-Fixes with Code Agents", 467 "authors": [ 468 "Niels Mündler", 469 "Mark Niklas Mueller", 470 "Jingxuan He", 471 "Martin Vechev" 472 ], 473 "year": 2024, 474 "relevance": "Benchmark specifically for BRT generation evaluation in Python, compared SWE-Agent+ and LIBRO." 475 }, 476 { 477 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 478 "authors": [ 479 "Carlos E Jimenez", 480 "John Yang", 481 "Alexander Wettig", 482 "Shunyu Yao", 483 "Kexin Pei", 484 "Ofir Press", 485 "Karthik Narasimhan" 486 ], 487 "year": 2023, 488 "arxiv_id": "2310.06770", 489 "relevance": "SWE-bench — foundational benchmark for evaluating LLM agents on real-world software engineering tasks." 
490 }, 491 { 492 "title": "Evaluating Large Language Models Trained on Code", 493 "authors": [ 494 "Mark Chen", 495 "Jerry Tworek" 496 ], 497 "year": 2021, 498 "arxiv_id": "2107.03374", 499 "relevance": "Codex — foundational work on LLMs for code generation, underlying technology for many code agents." 500 }, 501 { 502 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 503 "authors": [ 504 "Shunyu Yao", 505 "Jeffrey Zhao", 506 "Dian Yu", 507 "Nan Du", 508 "Izhak Shafran", 509 "Karthik Narasimhan", 510 "Yuan Cao" 511 ], 512 "year": 2022, 513 "arxiv_id": "2210.03629", 514 "relevance": "ReAct framework used as the reasoning paradigm for BRT Agent's agentic workflow." 515 }, 516 { 517 "title": "CODAMOSA: Escaping Coverage Plateaus in Test Generation with Pre-trained Large Language Models", 518 "authors": [ 519 "Caroline Lemieux", 520 "Jeevana Priya Inala", 521 "Shuvendu K Lahiri", 522 "Siddhartha Sen" 523 ], 524 "year": 2023, 525 "relevance": "Hybrid approach combining evolutionary search with LLM code understanding for test generation." 526 }, 527 { 528 "title": "Autocoderover: Autonomous Program Improvement", 529 "authors": [ 530 "Yuntong Zhang", 531 "Haifeng Ruan", 532 "Zhiyu Fan", 533 "Abhik Roychoudhury" 534 ], 535 "year": 2024, 536 "relevance": "LLM agent for autonomous program repair, adapted for BRT generation in SWT-Bench evaluation." 537 }, 538 { 539 "title": "TDD-Bench Verified: Can LLMs Generate Tests for Issues Before They Get Resolved?", 540 "authors": [ 541 "Toufique Ahmed", 542 "Martin Hirzel", 543 "Rangeet Pan", 544 "Avraham Shinnar", 545 "Saurabh Sinha" 546 ], 547 "year": 2024, 548 "arxiv_id": "2412.02883", 549 "relevance": "Concurrent work on LLM-based BRT generation that also found selecting test files as input improves generation." 
550 }, 551 { 552 "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", 553 "authors": [ 554 "Lianmin Zheng", 555 "Wei-Lin Chiang" 556 ], 557 "year": 2023, 558 "relevance": "LLM-as-a-Judge methodology used in this paper for sampling plausible BRTs for RQ2." 559 } 560 ], 561 "engagement_factors": { 562 "practical_relevance": { 563 "score": 2, 564 "justification": "Describes an actionable agent-based approach for automated bug reproduction and repair, though the specific tool is Google-internal and not publicly available." 565 }, 566 "surprise_contrarian": { 567 "score": 1, 568 "justification": "Results confirm the expected advantage of agentic approaches over simpler prompting, with no major counterintuitive findings." 569 }, 570 "fear_safety": { 571 "score": 0, 572 "justification": "No safety, security, or risk angle discussed." 573 }, 574 "drama_conflict": { 575 "score": 0, 576 "justification": "No controversy or conflict; straightforward comparison of two internal techniques." 577 }, 578 "demo_ability": { 579 "score": 0, 580 "justification": "Entirely built on Google's proprietary codebase, fine-tuned models, and internal infrastructure with no public code or demo." 581 }, 582 "brand_recognition": { 583 "score": 3, 584 "justification": "Google-authored paper about Google's internal bug repair infrastructure using Gemini, hitting high brand recognition." 585 } 586 } 587 }