scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24569B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Agentic Bug Reproduction for Effective Automated Program Repair at Google",
      6     "authors": [
      7       "Runxiang Cheng",
      8       "Michele Tufano",
      9       "Jürgen Cito",
     10       "José Cambronero",
     11       "Pat Rondon",
     12       "Renyao Wei",
     13       "Aaron Sun",
     14       "Satish Chandra"
     15     ],
     16     "year": 2025,
     17     "venue": "arXiv.org",
     18     "arxiv_id": "2502.01821",
     19     "doi": "10.48550/arXiv.2502.01821"
     20   },
     21   "checklist": {
     22     "claims_and_evidence": {
     23       "abstract_claims_supported": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "All key abstract claims (28% vs 10% plausible BRT rate, 30% more bugs fixed with BRTs, 70% top-1 EPR precision) are directly supported by Table 2, Figure 3, and Figure 5 respectively.",
     27         "source": "haiku"
     28       },
     29       "causal_claims_justified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The claim that BRTs cause improved APR performance is tested on only 23 bugs with no statistical significance testing; the small sample makes causal inference inadequate despite the controlled within-subject comparison.",
     33         "source": "haiku"
     34       },
     35       "generalization_bounded": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "Section 7 explicitly acknowledges the study focuses exclusively on Google's internal environment and that generalizability to other industrial settings 'requires further investigation.'",
     39         "source": "haiku"
     40       },
     41       "alternative_explanations_discussed": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper controls for LLM differences but does not discuss whether BRT Agent's advantage over LIBRO stems from the agent scaffolding, code search, or the fine-tuned LLM—these factors are fully confounded with no ablation.",
     45         "source": "haiku"
     46       },
     47       "proxy_outcome_distinction": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper defines plausible BRTs (F→P behavior) as a proxy and acknowledges in threats to validity that this metric 'may not fully capture all aspects of a BRT, such as its readability or maintainability.'",
     51         "source": "haiku"
     52       }
     53     },
     54     "limitations_and_scope": {
     55       "limitations_section_present": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Section 7 'Threats to Validity' is a dedicated section covering Internal, External, and Construct validity with specific subsections.",
     59         "source": "haiku"
     60       },
     61       "threats_to_validity_specific": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Specific threats include the small 80-bug dataset limiting subgroup analysis, potential implementation bias in the LIBRO adaptation, LLM non-determinism, and Google-specific generalizability limits—these go beyond boilerplate.",
     65         "source": "haiku"
     66       },
     67       "scope_boundaries_stated": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The paper explicitly states findings are limited to Google's internal environment and that EPR is an indirect measure that may not always correlate with human-judged fix correctness.",
     71         "source": "haiku"
     72       }
     73     },
     74     "conflicts_of_interest": {
     75       "funding_disclosed": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No explicit funding disclosure statement appears anywhere in the paper.",
     79         "source": "haiku"
     80       },
     81       "affiliations_disclosed": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Author affiliations (UIUC, Google, TU Wien) are disclosed; a footnote clarifies that Cheng and Cito conducted the research at Google.",
     85         "source": "haiku"
     86       },
     87       "funder_independent_of_outcome": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "The majority of authors are Google employees evaluating Google's own internal tools (Passerine APR system, proprietary fine-tuned Gemini), creating a direct conflict of interest with the outcome.",
     91         "source": "haiku"
     92       },
     93       "financial_interests_declared": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No competing interests or financial interests declaration appears in the paper.",
     97         "source": "haiku"
     98       }
     99     },
    100     "scope_and_framing": {
    101       "key_terms_defined": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "BRT is formally defined in Section 2.1 with precise F→P behavior criteria; 'candidate BRT,' 'plausible BRT,' and EPR are all precisely defined in Section 5.2.",
    105         "source": "haiku"
    106       },
    107       "intended_contribution_clear": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Three explicit contributions are stated: BRT Agent system and comparison with LIBRO, assessment of BRT impact on APR (Passerine), and the EPR metric for fix selection.",
    111         "source": "haiku"
    112       },
    113       "engagement_with_prior_work": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Section 3 provides detailed comparison with LIBRO, SWE-Agent+, and LLM test generation literature, explicitly situating differences in industrial context and usefulness of generated BRTs.",
    117         "source": "haiku"
    118       }
    119     }
    120   },
    121   "type_checklist": {
    122     "empirical": {
    123       "artifacts": {
    124         "code_released": {
    125           "applies": true,
    126           "answer": false,
    127           "justification": "The system is built on proprietary Google infrastructure; no code is released and no promise of future release is mentioned.",
    128           "source": "haiku"
    129         },
    130         "data_released": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "The evaluation dataset is from Google's internal issue tracking system (GITS) and is not publicly available.",
    134           "source": "haiku"
    135         },
    136         "environment_specified": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "No environment specifications (requirements, Docker, etc.) are provided; the system depends on proprietary Google infrastructure.",
    140           "source": "haiku"
    141         },
    142         "reproduction_instructions": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "No reproduction instructions are provided; complete dependency on Google's internal infrastructure makes external reproduction impossible.",
    146           "source": "haiku"
    147         }
    148       },
    149       "statistical_methodology": {
    150         "confidence_intervals_or_error_bars": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "Main results (28% vs 10% plausible BRT rate, 70% top-1 EPR precision) are reported as point estimates without any confidence intervals or error bars.",
    154           "source": "haiku"
    155         },
    156         "significance_tests": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "No statistical significance tests are applied to any comparisons (BRT Agent vs LIBRO, with/without BRT for APR) despite making comparative claims on small samples.",
    160           "source": "haiku"
    161         },
    162         "effect_sizes_reported": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "Absolute differences with baseline context are provided: 28% vs 10% plausible BRTs, 17/23 vs 13/23 bugs fixed, precision@K values with K varying—sufficient to assess magnitude.",
    166           "source": "haiku"
    167         },
    168         "sample_size_justified": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "The 80-bug sample is acknowledged as a potential limitation but no power analysis or formal sample size justification is provided.",
    172           "source": "haiku"
    173         },
    174         "variance_reported": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "Despite running 20 runs per bug to account for LLM stochasticity, no variance, standard deviation, or confidence intervals are reported for aggregate metrics.",
    178           "source": "haiku"
    179         }
    180       },
    181       "evaluation_design": {
    182         "baselines_included": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "LIBRO, the state-of-the-art BRT generation approach, is adapted to Google's environment and used as the primary baseline for all BRT generation comparisons.",
    186           "source": "haiku"
    187         },
    188         "baselines_contemporary": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "LIBRO (ICSE 2023) is the most directly comparable recent approach; SWT-Bench (NeurIPS 2024) results for SWE-Agent+ are referenced for broader context.",
    192           "source": "haiku"
    193         },
    194         "ablation_study": {
    195           "applies": true,
    196           "answer": false,
    197           "justification": "No ablation study isolates the contribution of individual BRT Agent components (reasoning LLM, fine-tuned code-editing LLM, code search, ReAct scaffolding).",
    198           "source": "haiku"
    199         },
    200         "multiple_metrics": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "Multiple metrics are used: candidate BRT rate, plausible BRT rate, candidate-to-plausible rate, bugs fixed, steps to fix, and precision/recall/F1/MRR for EPR.",
    204           "source": "haiku"
    205         },
    206         "human_evaluation": {
    207           "applies": true,
    208           "answer": true,
    209           "justification": "Two authors manually inspect all plausible BRT patches against oracle BRTs for semantic equivalence, with a third author resolving disagreements.",
    210           "source": "haiku"
    211         },
    212         "held_out_test_set": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "The 80 production bugs with ground truth oracle BRTs serve as a held-out evaluation set; the code-editing LLM's training cutoff explicitly predates all evaluated bugs.",
    216           "source": "haiku"
    217         },
    218         "per_category_breakdown": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "Table 3 provides plausible BRT rates broken down by 7 programming languages (Java, C++, Go, Python, Kotlin, Dart, TypeScript) for both techniques.",
    222           "source": "haiku"
    223         },
    224         "failure_cases_discussed": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "Failure modes are discussed: LIBRO fails mainly via build errors it cannot recover from; BRT Agent modifies existing tests in 11% of cases; 21% of BRT Agent runs exhaust the step limit.",
    228           "source": "haiku"
    229         },
    230         "negative_results_reported": {
    231           "applies": true,
    232           "answer": true,
    233           "justification": "Dart achieves 0% plausible BRT rate for both LIBRO and BRT Agent; EPR recall limitations are quantified and discussed as a trade-off.",
    234           "source": "haiku"
    235         }
    236       },
    237       "setup_transparency": {
    238         "model_versions_specified": {
    239           "applies": true,
    240           "answer": false,
    241           "justification": "Models are described only as 'a Gemini model fine-tuned on Google's internal code' and 'a publicly available Gemini'—no version numbers or snapshot dates are given.",
    242           "source": "haiku"
    243         },
    244         "prompts_provided": {
    245           "applies": true,
    246           "answer": false,
    247           "justification": "Prompt structure is described at a high level (bug report + buggy file + test file) and the meta task description string is quoted, but full prompt text is not provided verbatim.",
    248           "source": "haiku"
    249         },
    250         "hyperparameters_reported": {
    251           "applies": true,
    252           "answer": true,
    253           "justification": "Temperature (0.7 for LIBRO, 0.2 for BRT Agent), top-P (0.95), number of runs (50 for LIBRO, 20 for BRT Agent), and step limit (25) are all explicitly reported.",
    254           "source": "haiku"
    255         },
    256         "scaffolding_described": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "Section 4.2 details BRT Agent's ReAct-based loop, its full action set (Table 1), change description generation process, and termination conditions with sufficient specificity.",
    260           "source": "haiku"
    261         },
    262         "data_preprocessing_documented": {
    263           "applies": true,
    264           "answer": false,
    265           "justification": "Dataset construction is described only as 'automated extraction and filtering phases as well as manual curation' with full details deferred to the concurrent Passerine paper [30].",
    266           "source": "haiku"
    267         }
    268       },
    269       "data_integrity": {
    270         "raw_data_available": {
    271           "applies": true,
    272           "answer": false,
    273           "justification": "The bug dataset is from Google's internal GITS and is not publicly accessible.",
    274           "source": "haiku"
    275         },
    276         "data_collection_described": {
    277           "applies": true,
    278           "answer": true,
    279           "justification": "Section 5.1.1 describes that bugs were human-reported, human-fixed, sourced from GITS since June 2024, across seven languages, with manual curation to ensure fixes address root causes.",
    280           "source": "haiku"
    281         },
    282         "recruitment_methods_described": {
    283           "applies": false,
    284           "answer": false,
    285           "justification": "No human participants were recruited; bugs are drawn from an internal issue tracker.",
    286           "source": "haiku"
    287         },
    288         "data_pipeline_documented": {
    289           "applies": true,
    290           "answer": false,
    291           "justification": "The full pipeline from collection to analysis is not documented; automated extraction and filtering details are deferred to the Passerine paper [30].",
    292           "source": "haiku"
    293         }
    294       },
    295       "contamination": {
    296         "training_cutoff_stated": {
    297           "applies": true,
    298           "answer": true,
    299           "justification": "Section 4.2.3 explicitly states the code-editing LLM's training data cutoff predates the reporting of all bugs analyzed in the study.",
    300           "source": "haiku"
    301         },
    302         "train_test_overlap_discussed": {
    303           "applies": true,
    304           "answer": true,
    305           "justification": "The paper explicitly states training data excludes all bugs, code changes, and BRTs in the evaluation set, 'preventing any potential data leakage.'",
    306           "source": "haiku"
    307         },
    308         "benchmark_contamination_addressed": {
    309           "applies": true,
    310           "answer": true,
    311           "justification": "Bugs are from Google's internal tracker since June 2024 and training cutoff is stated to predate all evaluation bugs, directly addressing contamination.",
    312           "source": "haiku"
    313         }
    314       },
    315       "human_studies": {
    316         "pre_registered": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No human participants were recruited for this study.",
    320           "source": "haiku"
    321         },
    322         "irb_or_ethics_approval": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human participants were recruited for this study.",
    326           "source": "haiku"
    327         },
    328         "demographics_reported": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human participants were recruited for this study.",
    332           "source": "haiku"
    333         },
    334         "inclusion_exclusion_criteria": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human participants were recruited for this study.",
    338           "source": "haiku"
    339         },
    340         "randomization_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No human participants were recruited for this study.",
    344           "source": "haiku"
    345         },
    346         "blinding_described": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human participants were recruited for this study.",
    350           "source": "haiku"
    351         },
    352         "attrition_reported": {
    353           "applies": false,
    354           "answer": false,
    355           "justification": "No human participants were recruited for this study.",
    356           "source": "haiku"
    357         }
    358       },
    359       "cost_and_practicality": {
    360         "inference_cost_reported": {
    361           "applies": true,
    362           "answer": false,
    363           "justification": "No inference cost or latency figures are reported despite running 1,600 BRT Agent runs (80 bugs × 20 runs) and 4,000 LIBRO calls.",
    364           "source": "haiku"
    365         },
    366         "compute_budget_stated": {
    367           "applies": true,
    368           "answer": false,
    369           "justification": "No compute budget or resource requirements are stated anywhere in the paper.",
    370           "source": "haiku"
    371         }
    372       }
    373     }
    374   },
    375   "claims": [
    376     {
    377       "claim": "BRT Agent achieves 28% plausible BRT generation rate vs 10% by adapted LIBRO on 80 Google production bugs",
    378       "evidence": "Table 2: BRT Agent 85% candidate / 28% plausible; LIBRO 41% candidate / 10% plausible",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Providing generated BRTs to Passerine results in ~30% more bugs with plausible fixes (74% vs 57%)",
    383       "evidence": "Figure 3: 17/23 bugs fixed with BRT vs 13/23 without on the 23-bug subset where BRT Agent succeeded",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "EPR correctly selects a plausible fix from 20 APR-generated candidates in 70% of cases at top-1 ranking",
    388       "evidence": "Figure 5: precision@1 = 0.7, MRR@1 = 0.7",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "67% of plausible BRTs generated by BRT Agent are semantically equivalent or identical to oracle BRTs",
    393       "evidence": "Manual inspection: 19% identical + 48% semantically equivalent = 67% of plausible BRT patches",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "BRT Agent generalizes across 6 of 7 programming languages; only Dart produces 0% results",
    398       "evidence": "Table 3 language breakdown showing non-zero rates for Java (28%), C++ (16%), Go (17%), Python (45%), Kotlin (50%), TypeScript (100%)",
    399       "supported": "strong"
    400     },
    401     {
    402       "claim": "Passerine takes fewer agent steps to generate plausible fixes when provided with BRTs",
    403       "evidence": "Figure 4 shows a leftward shift in step count distribution when BRT is provided as input",
    404       "supported": "moderate"
    405     }
    406   ],
    407   "methodology_tags": [
    408     "benchmark-eval",
    409     "case-study"
    410   ],
    411   "key_findings": "BRT Agent, combining a ReAct-based reasoning LLM with a proprietary fine-tuned code-editing LLM, achieves 28% plausible bug reproduction test generation on 80 Google production bugs—substantially outperforming adapted LIBRO (10%). Generated BRTs improve Google's APR system (Passerine) from fixing 57% to 74% of bugs on a 23-bug subset, with fewer agent steps required. The proposed Ensemble Pass Rate (EPR) metric achieves 70% top-1 precision for selecting correct fixes from pools of 20 APR-generated candidates. Both BRT Agent and LIBRO fail completely on Dart bugs, and 11% of BRT Agent's plausible patches are invalid due to unintended modification of existing tests.",
    412   "red_flags": [
    413     {
    414       "flag": "No statistical significance tests",
    415       "detail": "All comparisons (BRT Agent vs LIBRO, with/without BRT for APR) are reported as raw percentages without significance tests or confidence intervals despite the small sample sizes making chance effects plausible."
    416     },
    417     {
    418       "flag": "Tiny APR evaluation sample",
    419       "detail": "RQ2 and RQ3 are evaluated on only 23 bugs (those where BRT Agent happened to succeed), making the 30% improvement claim fragile and potentially inflated."
    420     },
    421     {
    422       "flag": "No ablation study",
    423       "detail": "The paper never isolates whether BRT Agent's advantage over LIBRO comes from the agent scaffolding, fine-tuned LLM, code search, or their combination—all factors are fully confounded."
    424     },
    425     {
    426       "flag": "Google-only, entirely non-reproducible evaluation",
    427       "detail": "All evaluation uses proprietary Google infrastructure, internal bugs, and internal LLMs; no external party can reproduce any result."
    428     },
    429     {
    430       "flag": "Unspecified model versions",
    431       "detail": "Models are described only as 'a Gemini model fine-tuned on Google's internal code' and 'a publicly available Gemini' without version numbers or snapshot dates."
    432     },
    433     {
    434       "flag": "Google employees evaluating Google systems",
    435       "detail": "Majority of authors are Google employees evaluating Google's own APR system (Passerine) and Google's proprietary LLMs with no independent validation."
    436     }
    437   ],
    438   "cited_papers": [
    439     {
    440       "title": "Large Language Models are Few-shot Testers: Exploring LLM-based General Bug Reproduction (LIBRO)",
    441       "relevance": "Primary baseline adapted and compared against in all BRT generation experiments"
    442     },
    443     {
    444       "title": "SWT-Bench: Testing and Validating Real-World Bug-Fixes with Code Agents",
    445       "relevance": "Most recent BRT generation benchmark; SWE-Agent+ results used for broader context comparison"
    446     },
    447     {
    448       "title": "Evaluating Agent-based Program Repair at Google (Passerine)",
    449       "relevance": "Concurrent work describing the APR system evaluated and the same 80-bug dataset"
    450     },
    451     {
    452       "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    453       "relevance": "Agent framework conceptually similar to BRT Agent; SWE-Agent+ is a direct point of comparison"
    454     },
    455     {
    456       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    457       "relevance": "Theoretical framework underlying BRT Agent's reasoning loop design"
    458     },
    459     {
    460       "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs",
    461       "relevance": "Standard benchmark used to evaluate LIBRO; reference point for comparing BRT generation performance"
    462     },
    463     {
    464       "title": "Swe-bench: Can Language Models Resolve Real-World GitHub Issues?",
    465       "relevance": "Major benchmark for evaluating code agents; provides context for the field's evaluation practices"
    466     },
    467     {
    468       "title": "Evaluating Diverse Large Language Models for Automatic and General Bug Reproduction",
    469       "relevance": "Extended LIBRO evaluation providing additional baseline context"
    470     }
    471   ],
    472   "engagement_factors": {
    473     "practical_relevance": {
    474       "score": 3,
    475       "justification": "Demonstrates industrial-scale BRT generation at Google with concrete improvement in APR effectiveness—directly actionable for engineering teams."
    476     },
    477     "surprise_contrarian": {
    478       "score": 1,
    479       "justification": "The headline result—an agent-based approach outperforming a few-shot baseline—is expected; the 0% Dart result and the EPR precision-recall trade-offs are modestly interesting."
    480     },
    481     "fear_safety": {
    482       "score": 0,
    483       "justification": "No AI safety or risk concerns; purely a software engineering productivity paper."
    484     },
    485     "drama_conflict": {
    486       "score": 0,
    487       "justification": "No controversy or conflict angle; straightforward industrial evaluation."
    488     },
    489     "demo_ability": {
    490       "score": 1,
    491       "justification": "The BRT generation concept is demonstrable in open-source analogues (SWE-agent, LIBRO) but the actual Google system requires proprietary infrastructure."
    492     },
    493     "brand_recognition": {
    494       "score": 3,
    495       "justification": "Google authorship, Google production bugs, and evaluation on Gemini models provide strong brand recognition for the HN/tech audience."
    496     }
    497   },
    498   "hn_data": {
    499     "threads": [
    500       {
    501         "hn_id": "43876276",
    502         "title": "Agentic Bug Reproduction for Effective Automated Program Repair at Google",
    503         "points": 2,
    504         "comments": 0,
    505         "url": "https://news.ycombinator.com/item?id=43876276",
    506         "created_at": "2025-05-03T01:54:39Z"
    507       },
    508       {
    509         "hn_id": "45599001",
    510         "title": "Agentic Bug Reproduction for Effective Automated Program Repair at Google",
    511         "points": 1,
    512         "comments": 0,
    513         "url": "https://news.ycombinator.com/item?id=45599001",
    514         "created_at": "2025-10-15T22:20:39Z"
    515       }
    516     ],
    517     "top_points": 2,
    518     "total_points": 3,
    519     "total_comments": 0
    520   }
    521 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs