scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (31062B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Agentic Bug Reproduction for Effective Automated Program Repair at Google",
      6     "authors": [
      7       "Runxiang Cheng",
      8       "Michele Tufano",
      9       "Jürgen Cito",
     10       "José Cambronero",
     11       "Pat Rondon",
     12       "Renyao Wei",
     13       "Aaron Sun",
     14       "Satish Chandra"
     15     ],
     16     "year": 2025,
     17     "venue": "arXiv.org",
     18     "arxiv_id": "2502.01821",
     19     "doi": "10.48550/arXiv.2502.01821"
     20   },
     21   "checklist": {
     22     "claims_and_evidence": {
     23       "abstract_claims_supported": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Abstract claims (28% vs 10% BRT rate, 30% more bugs with plausible fixes, EPR top-1 70%) are all supported by Tables 2-3, Figures 3-6 in the results sections.",
     27         "source": "opus"
     28       },
     29       "causal_claims_justified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The paper claims BRT Agent 'significantly outperforms' LIBRO but lacks ablation to identify which component causes the improvement. The claim that BRTs 'improve' APR is supported by controlled comparison but on only 23 bugs without statistical testing.",
     33         "source": "opus"
     34       },
     35       "generalization_bounded": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "The paper explicitly bounds findings to Google's internal environment (§7 External Validity) and acknowledges that 'the generalizability of our findings to other industrial settings requires further investigation.'",
     39         "source": "opus"
     40       },
     41       "alternative_explanations_discussed": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "§7 discusses alternative explanations: implementation bias in LIBRO adaptation, LLM differences, randomness. §6.2 acknowledges RQ2 results are an upper bound due to BRT selection with ground truth fixes.",
     45         "source": "opus"
     46       },
     47       "proxy_outcome_distinction": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper measures 'plausible BRT generation rate' (tests that fail on buggy code and pass on fixed code) and frames this as BRT Agent being effective for 'automated program repair.' The gap between plausible BRTs (the proxy — tests that exhibit fail-to-pass behavior) and truly correct BRTs (tests that validate the actual fix intent) is partially addressed via manual inspection (86% valid), but the broader claim of 'effective APR' from 28% success rate is not qualified as a proxy for debugging productivity.",
     51         "source": "opus"
     52       }
     53     },
     54     "limitations_and_scope": {
     55       "limitations_section_present": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "§7 'Threats to Validity' is a dedicated section covering internal, external, and construct validity threats.",
     59         "source": "opus"
     60       },
     61       "threats_to_validity_specific": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "§7 discusses specific threats: the 80-bug dataset size limiting generalizability, implementation bias in the LIBRO adaptation, the specific mitigation of using the same Gemini models, EPR as an indirect measure of fix correctness, and metrics not capturing readability/maintainability.",
     65         "source": "opus"
     66       },
     67       "scope_boundaries_stated": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "§7 External Validity explicitly states findings are limited to Google's internal environment and that 'the specific tools, processes, and codebase characteristics may differ significantly from those in other companies.'",
     71         "source": "opus"
     72       }
     73     },
     74     "conflicts_of_interest": {
     75       "funding_disclosed": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No funding disclosure or acknowledgments section is present in the paper.",
     79         "source": "opus"
     80       },
     81       "affiliations_disclosed": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Author affiliations are clearly listed: 6 authors from Google, 1 from UIUC, 1 from TU Wien. Footnote notes Cheng and Cito conducted research at Google.",
     85         "source": "opus"
     86       },
     87       "funder_independent_of_outcome": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "Google employees are evaluating Google's internal systems (BRT Agent, Passerine). Google has a direct interest in demonstrating their APR systems work well. The funder is not independent of the outcome.",
     91         "source": "opus"
     92       },
     93       "financial_interests_declared": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No competing interests or financial interests statement is present in the paper.",
     97         "source": "opus"
     98       }
     99     },
    100     "scope_and_framing": {
    101       "key_terms_defined": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "BRT is formally defined in Section 2.1 with the Fail-to-Pass (F→P) criterion, 'candidate BRT' and 'plausible BRT' are defined in Section 5.2.1, and EPR is defined in Section 5.2.3.",
    105         "source": "haiku"
    106       },
    107       "intended_contribution_clear": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Three explicit contributions are itemized at the end of the introduction: BRT Agent outperforming LIBRO, BRTs improving Passerine's fix generation, and the EPR metric for fix selection.",
    111         "source": "haiku"
    112       },
    113       "engagement_with_prior_work": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Section 3.3 explicitly contrasts with LIBRO and SWE-Agent+ by industrial context, usefulness dimension, and design differences; the paper also discusses concurrent work (Ahmed et al.) and explains how this work differs.",
    117         "source": "haiku"
    118       }
    119     }
    120   },
    121   "type_checklist": {
    122     "empirical": {
    123       "artifacts": {
    124         "code_released": {
    125           "applies": true,
    126           "answer": false,
    127           "justification": "No source code repository or archive is mentioned. The system is built on Google's internal infrastructure and no code is released.",
    128           "source": "opus"
    129         },
    130         "data_released": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "The dataset of 80 production bugs is from Google's internal issue tracking system (GITS) and is not publicly available.",
    134           "source": "opus"
    135         },
    136         "environment_specified": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "The paper mentions Google's internal development environment, Bazel build system, and fine-tuned Gemini models, but no specific environment specifications (versions, dependencies) are provided.",
    140           "source": "opus"
    141         },
    142         "reproduction_instructions": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "No reproduction instructions are provided. The entire system relies on Google's proprietary infrastructure.",
    146           "source": "opus"
    147         }
    148       },
    149       "statistical_methodology": {
    150         "confidence_intervals_or_error_bars": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "Results are reported as raw percentages (e.g., 28%, 10%) without confidence intervals or error bars despite small sample sizes.",
    154           "source": "opus"
    155         },
    156         "significance_tests": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "The paper claims BRT Agent 'significantly outperforms LIBRO' but provides no statistical significance tests. Comparisons are based on raw percentage differences.",
    160           "source": "opus"
    161         },
    162         "effect_sizes_reported": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "Effect sizes are conveyed through percentage improvements with baseline context: 28% vs 10% plausible BRT rate, 17/23 vs 13/23 bugs fixed, precision@1 = 0.7. The reader can assess magnitude.",
    166           "source": "opus"
    167         },
    168         "sample_size_justified": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "No justification for why 80 bugs were used. The paper acknowledges the small size as a threat to validity but does not provide power analysis or other justification.",
    172           "source": "opus"
    173         },
    174         "variance_reported": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "Despite running LIBRO 50 times and BRT Agent 20 times per bug, no variance or standard deviation across runs is reported. Only aggregate percentages are shown.",
    178           "source": "opus"
    179         }
    180       },
    181       "evaluation_design": {
    182         "baselines_included": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "LIBRO is adapted and used as a baseline comparison (§4.1, Table 2). SWE-Agent+ is discussed as related work but not directly compared on the same dataset.",
    186           "source": "opus"
    187         },
    188         "baselines_contemporary": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "LIBRO (Kang et al., 2023) is a recent and relevant baseline for BRT generation. SWE-Agent+ (Mündler et al., 2024) is discussed but not directly evaluated on the same dataset due to Google's proprietary setting.",
    192           "source": "opus"
    193         },
    194         "ablation_study": {
    195           "applies": true,
    196           "answer": false,
    197           "justification": "No ablation study is conducted to isolate the contribution of individual BRT Agent components (e.g., fine-tuned code-editing LLM, ReAct reasoning, code search). The improvement over LIBRO could be due to any combination of factors.",
    198           "source": "opus"
    199         },
    200         "multiple_metrics": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "Multiple metrics are used: candidate BRTs, plausible BRTs, candidate-to-plausible rate (RQ1); number of bugs fixed, steps to fix, plausibility given BRT usage (RQ2); precision, recall, F1, MRR (RQ3).",
    204           "source": "opus"
    205         },
    206         "human_evaluation": {
    207           "applies": true,
    208           "answer": true,
    209           "justification": "Manual inspection of plausible BRTs by two authors with a third resolving disagreements (§6.1.1). They classify BRTs as identical, semantically equivalent, valid with irrelevant additions, or invalid.",
    210           "source": "opus"
    211         },
    212         "held_out_test_set": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "The code-editing LLM's training data cutoff predates all bugs in the evaluation, preventing data leakage (§4.2.3). The 80-bug dataset was constructed independently.",
    216           "source": "opus"
    217         },
    218         "per_category_breakdown": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "Table 3 provides per-language breakdown of plausible BRTs across 7 programming languages. Figure 2 shows per-step action distribution.",
    222           "source": "opus"
    223         },
    224         "failure_cases_discussed": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "§6.1.1 discusses failure modes: LIBRO's main failure is build errors it cannot recover from. 11% of BRT Agent's plausible patches are invalid due to modifying existing tests. Table 5 shows termination reasons including steps exhausted (21%) and framework exceptions (7%).",
    228           "source": "opus"
    229         },
    230         "negative_results_reported": {
    231           "applies": true,
    232           "answer": true,
    233           "justification": "The paper reports that BRT Agent fails on 72% of bugs (only 28% plausible BRTs). Dart achieves 0% for both techniques. 21% of runs exhaust the step limit. The agent occasionally hallucinates non-existent actions (§6.1.2).",
    234           "source": "opus"
    235         }
    236       },
    237       "setup_transparency": {
    238         "model_versions_specified": {
    239           "applies": true,
    240           "answer": false,
    241           "justification": "The paper says 'a Gemini model fine-tuned on Google's internal code' and 'a publicly available Gemini' without specifying exact model versions or snapshot dates for either.",
    242           "source": "opus"
    243         },
    244         "prompts_provided": {
    245           "applies": true,
    246           "answer": false,
    247           "justification": "The paper describes prompt elements (meta task description, change description, bug report) but does not provide actual prompt text. Only one example description is given: 'Add a test case that asserts the function returns null when given an empty input' (§4.2.3).",
    248           "source": "opus"
    249         },
    250         "hyperparameters_reported": {
    251           "applies": true,
    252           "answer": true,
    253           "justification": "§5.1.2: LIBRO temperature 0.7, top P 0.95, 50 runs per bug. BRT Agent temperature 0.2, top P 0.95, 20 runs per bug, 25 max steps. Three synthetic examples in system prompt.",
    254           "source": "opus"
    255         },
    256         "scaffolding_described": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "§4.2 describes BRT Agent's workflow in detail: initialization, ReAct reasoning loop, action set (Table 1: cat, code_search, edit, bazel test, finish), code-editing LLM integration, observation handling, iteration, and termination conditions.",
    260           "source": "opus"
    261         },
    262         "data_preprocessing_documented": {
    263           "applies": true,
    264           "answer": true,
    265           "justification": "§5.1.1 describes dataset construction: 80 production bugs from GITS, extracted via automated extraction and filtering plus manual curation. Each bug verified to have genuine fix. Dataset from concurrent work [30].",
    266           "source": "opus"
    267         }
    268       },
    269       "data_integrity": {
    270         "raw_data_available": {
    271           "applies": true,
    272           "answer": false,
    273           "justification": "No raw data is available. The 80-bug dataset, generated BRTs, and experimental traces are all from Google's proprietary systems and are not released.",
    274           "source": "opus"
    275         },
    276         "data_collection_described": {
    277           "applies": true,
    278           "answer": true,
    279           "justification": "§5.1.1 describes the dataset: 80 production bugs from GITS, reported and fixed by human developers, recent (since June 2024), spanning 7 languages, constructed via automated extraction/filtering and manual curation [30].",
    280           "source": "opus"
    281         },
    282         "recruitment_methods_described": {
    283           "applies": false,
    284           "answer": false,
    285           "justification": "No human participants in a study sense. The bugs come from Google's issue tracker, not a recruited sample.",
    286           "source": "opus"
    287         },
    288         "data_pipeline_documented": {
    289           "applies": true,
    290           "answer": true,
    291           "justification": "§5.1.1 documents the pipeline: bugs extracted from GITS → automated filtering → manual curation (verifying fixes address root cause) → final 80-bug dataset. Each sample includes GITS issue, ground truth fix with oracle BRT, buggy files, and test file.",
    292           "source": "opus"
    293         }
    294       },
    295       "contamination": {
    296         "training_cutoff_stated": {
    297           "applies": true,
    298           "answer": true,
    299           "justification": "§4.2.3 states 'the code-editing LLM's training data excludes all bugs, code changes, and BRTs used in our empirical evaluation—its training data cutoff predates the reporting of all bugs analyzed in this study.'",
    300           "source": "opus"
    301         },
    302         "train_test_overlap_discussed": {
    303           "applies": true,
    304           "answer": true,
    305           "justification": "§4.2.3 explicitly addresses that the training data cutoff predates all bugs, preventing data leakage. The bugs are recent (since June 2024).",
    306           "source": "opus"
    307         },
    308         "benchmark_contamination_addressed": {
    309           "applies": true,
    310           "answer": true,
    311           "justification": "§4.2.3 addresses contamination by confirming temporal separation: training data cutoff predates the reporting of all 80 bugs (since June 2024).",
    312           "source": "opus"
    313         }
    314       },
    315       "human_studies": {
    316         "pre_registered": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No human participants in an experimental study. The manual inspection of BRTs is an evaluation methodology, not a human subjects study.",
    320           "source": "opus"
    321         },
    322         "irb_or_ethics_approval": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human participants in the study.",
    326           "source": "opus"
    327         },
    328         "demographics_reported": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human participants in the study.",
    332           "source": "opus"
    333         },
    334         "inclusion_exclusion_criteria": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human participants in the study.",
    338           "source": "opus"
    339         },
    340         "randomization_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No human participants in the study.",
    344           "source": "opus"
    345         },
    346         "blinding_described": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human participants in the study.",
    350           "source": "opus"
    351         },
    352         "attrition_reported": {
    353           "applies": false,
    354           "answer": false,
    355           "justification": "No human participants in the study.",
    356           "source": "opus"
    357         }
    358       },
    359       "cost_and_practicality": {
    360         "inference_cost_reported": {
    361           "applies": true,
    362           "answer": false,
    363           "justification": "No inference cost, API cost, or latency is reported despite running LIBRO 50 times per bug and BRT Agent 20 times per bug across 80 bugs (4000 + 1600 runs total).",
    364           "source": "opus"
    365         },
    366         "compute_budget_stated": {
    367           "applies": true,
    368           "answer": false,
    369           "justification": "No total computational budget, GPU hours, or API spend is stated despite significant compute usage.",
    370           "source": "opus"
    371         }
    372       },
    373       "experimental_rigor": {
    374         "seed_sensitivity_reported": {
    375           "applies": true,
    376           "answer": false,
    377           "justification": "Despite running 20 (BRT Agent) and 50 (LIBRO) runs per bug to account for stochasticity, no per-bug variance or seed sensitivity analysis is reported. Only aggregate percentages across all bugs.",
    378           "source": "opus"
    379         },
    380         "number_of_runs_stated": {
    381           "applies": true,
    382           "answer": true,
    383           "justification": "§5.1.2 explicitly states: 50 runs per bug for LIBRO, 20 runs per bug for BRT Agent.",
    384           "source": "opus"
    385         },
    386         "hyperparameter_search_budget": {
    387           "applies": true,
    388           "answer": false,
    389           "justification": "The paper mentions 'small-scale experiments on different prompt structures, paraphrases, and input' for LIBRO prompt crafting (§4.1) but does not quantify the search budget or configurations tried.",
    390           "source": "opus"
    391         },
    392         "best_config_selection_justified": {
    393           "applies": true,
    394           "answer": false,
    395           "justification": "Temperature and top-P values (0.7/0.95 for LIBRO, 0.2/0.95 for BRT Agent) are stated as following prior work [20, 30], but no justification is given for why these are optimal or how they were selected.",
    396           "source": "opus"
    397         },
    398         "multiple_comparison_correction": {
    399           "applies": false,
    400           "answer": false,
    401           "justification": "No statistical tests are performed, so multiple comparison correction is not applicable.",
    402           "source": "opus"
    403         },
    404         "self_comparison_bias_addressed": {
    405           "applies": true,
    406           "answer": false,
    407           "justification": "Google employees evaluate their own BRT Agent system against their adaptation of LIBRO. The paper does not acknowledge self-comparison bias despite the authors implementing both the system and the baseline adaptation.",
    408           "source": "opus"
    409         },
    410         "compute_budget_vs_performance": {
    411           "applies": true,
    412           "answer": false,
    413           "justification": "BRT Agent uses a reasoning LLM + code-editing LLM with up to 25 iterative steps, while LIBRO uses a single LLM call. This compute difference is not discussed or controlled for.",
    414           "source": "opus"
    415         },
    416         "benchmark_construct_validity": {
    417           "applies": true,
    418           "answer": true,
    419           "justification": "§7 Construct Validity discusses limitations of the BRT evaluation metrics, noting they 'may not fully capture all aspects of a BRT, such as its readability, maintainability.' Manual inspection is used to supplement automated metrics.",
    420           "source": "opus"
    421         },
    422         "scaffold_confound_addressed": {
    423           "applies": true,
    424           "answer": false,
    425           "justification": "BRT Agent uses a multi-step agentic scaffold (ReAct reasoning + code-editing LLM + bazel test feedback loop) while LIBRO uses a single LLM call. The paper does not control for or discuss the scaffolding difference — improvements could stem from the iterative scaffold rather than the fine-tuned code-editing LLM. §7 mentions LLM differences but not the scaffolding confound.",
    426           "source": "opus"
    427         }
    428       },
    429       "data_leakage": {
    430         "temporal_leakage_addressed": {
    431           "applies": true,
    432           "answer": true,
    433           "justification": "§4.2.3 explicitly states the code-editing LLM's training data cutoff predates reporting of all bugs (since June 2024), addressing temporal leakage.",
    434           "source": "opus"
    435         },
    436         "feature_leakage_addressed": {
    437           "applies": true,
    438           "answer": true,
    439           "justification": "The experimental setup is transparent about what information each technique receives. LIBRO gets buggy files + test file; BRT Agent gets only buggy files. Ground truth test file is not given to BRT Agent (§5.1.2).",
    440           "source": "opus"
    441         },
    442         "non_independence_addressed": {
    443           "applies": true,
    444           "answer": false,
    445           "justification": "No discussion of whether the 80 bugs are independent (e.g., from different projects, different codebases) or whether bugs from the same project could share structural similarities affecting results.",
    446           "source": "opus"
    447         },
    448         "leakage_detection_method": {
    449           "applies": true,
    450           "answer": true,
    451           "justification": "Temporal separation is used as a concrete leakage prevention method: training data cutoff predates all evaluated bugs (§4.2.3).",
    452           "source": "opus"
    453         }
    454       }
    455     }
    456   },
    457   "claims": [
    458     {
    459       "claim": "BRT Agent generates plausible BRTs for 28% of the 80 evaluated Google production bugs, compared to 10% by adapted LIBRO.",
    460       "evidence": "Table 2 reports candidate BRT rates of 85% vs 41% and plausible BRT rates of 28% vs 10% for BRT Agent vs LIBRO.",
    461       "supported": "strong"
    462     },
    463     {
    464       "claim": "Providing generated BRTs to Passerine results in 30% more bugs with plausible fixes (17/23 vs 13/23).",
    465       "evidence": "Figure 3 and Section 6.2 report 17/23 (74%) bugs fixed with BRT vs 13/23 (57%) without, a 31% relative increase.",
    466       "supported": "moderate"
    467     },
    468     {
    469       "claim": "EPR (Ensemble Pass Rate) correctly selects a plausible fix as top-1 ranked from a pool of 20 candidates in 70% of cases.",
    470       "evidence": "Figure 5 shows Precision@1 = 0.7 and MRR = 0.7 for Top-K selection via EPR.",
    471       "supported": "strong"
    472     },
    473     {
    474       "claim": "67% of plausible BRTs generated by BRT Agent are semantically equivalent or identical to oracle BRTs.",
    475       "evidence": "Section 6.1 manual inspection: 19% identical + 48% semantically equivalent = 67%, from two authors with third resolving disagreements.",
    476       "supported": "moderate"
    477     },
    478     {
    479       "claim": "Providing BRTs reduces the number of agent steps Passerine needs to generate a plausible fix.",
    480       "evidence": "Figure 4 shows a leftward shift in the step distribution when BRT is provided, but no specific mean or median reduction is quantified.",
    481       "supported": "weak"
    482     },
    483     {
    484       "claim": "BRT Agent generalizes across programming languages, generating plausible BRTs in 6 of the 7 evaluated languages (all but Dart).",
    485       "evidence": "Table 3 shows non-zero plausible BRT rates for Java (28%), C++ (16%), Go (17%), Python (45%), Kotlin (50%), TypeScript (100%); Dart is 0%.",
    486       "supported": "strong"
    487     }
    488   ],
    489   "methodology_tags": [
    490     "benchmark-eval",
    491     "case-study"
    492   ],
    493   "key_findings": "BRT Agent, an LLM-agent approach using a fine-tuned Gemini model for code editing, generates plausible Bug Reproduction Tests for 28% of 80 Google production bugs compared to 10% by LIBRO, with 67% of plausible BRTs semantically matching the oracle. Providing generated BRTs to Google's Passerine APR system increases the proportion of bugs receiving plausible fixes from 57% to 74% (on the 23-bug subset where BRTs were generated), suggesting BRTs provide actionable context that reduces debugging effort. The proposed Ensemble Pass Rate (EPR) metric achieves 70% precision in ranking a plausible fix first from a 20-candidate pool, offering a practical fix-selection mechanism when no existing test has failed. Both code and data remain proprietary to Google, severely limiting external reproducibility.",
    494   "red_flags": [
    495     {
    496       "flag": "Non-independent evaluators",
    497       "detail": "Six of eight authors are Google employees evaluating Google's own proprietary tools (Passerine, fine-tuned Gemini), creating a strong conflict of interest with no independent replication possible."
    498     },
    499     {
    500       "flag": "Small RQ2 sample",
    501       "detail": "The impact of BRTs on fix generation (RQ2) is evaluated on only 23 bugs — the subset where BRT Agent happened to succeed — making the 30% improvement claim highly sensitive to small-sample variance."
    502     },
    503     {
    504       "flag": "No statistical significance tests",
    505       "detail": "No significance tests or confidence intervals are reported for any comparison (e.g., 28% vs 10% on n=80, or 17/23 vs 13/23), making it impossible to assess whether results exceed noise."
    506     },
    507     {
    508       "flag": "Model versions unspecified",
    509       "detail": "The models are described only as 'a Gemini model fine-tuned on Google's internal code' and 'a publicly available Gemini', without version identifiers or snapshot dates, making results impossible to reproduce or compare."
    510     },
    511     {
    512       "flag": "LIBRO given different inputs",
    513       "detail": "LIBRO is given a pre-identified test file as input while BRT Agent is not, introducing a confound in the head-to-head comparison that is not analyzed via ablation."
    514     },
    515     {
    516       "flag": "No code or data released",
    517       "detail": "All code, models, and data are proprietary to Google; there is zero external reproducibility."
    518     }
    519   ],
    520   "cited_papers": [
    521     {
    522       "title": "Large Language Models are Few-shot Testers: Exploring LLM-based General Bug Reproduction",
    523       "relevance": "LIBRO is the primary baseline for BRT generation; this paper adapts and evaluates it in an industrial setting."
    524     },
    525     {
    526       "title": "SWT-Bench: Testing and Validating Real-World Bug-Fixes with Code Agents",
    527       "relevance": "Introduces SWT-Bench benchmark and SWE-Agent+ for BRT generation in Python; most closely related concurrent work."
    528     },
    529     {
    530       "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    531       "relevance": "SWE-Agent is adapted as SWE-Agent+ in SWT-Bench; represents the state-of-the-art agentic baseline for software engineering tasks."
    532     },
    533     {
    534       "title": "Evaluating Agent-based Program Repair at Google",
    535       "relevance": "Describes Passerine, the Google APR system whose performance is improved by BRTs in this paper; shares dataset and authors."
    536     },
    537     {
    538       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    539       "relevance": "Provides the ReAct framework underlying BRT Agent's reasoning-action loop."
    540     },
    541     {
    542       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    543       "relevance": "Foundational benchmark for LLM-based software engineering; SWT-Bench is built on top of it."
    544     },
    545     {
    546       "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs",
    547       "relevance": "Standard Java bug benchmark used by LIBRO; comparison point for BRT generation performance."
    548     },
    549     {
    550       "title": "AutoCodeRover: Autonomous Program Improvement",
    551       "relevance": "State-of-the-art APR agent evaluated in SWT-Bench alongside SWE-Agent; represents agent-based APR landscape."
    552     }
    553   ],
    554   "engagement_factors": {
    555     "practical_relevance": {
    556       "score": 2,
    557       "justification": "Describes an actionable agent-based approach for automated bug reproduction and repair, though the specific tool is Google-internal and not publicly available."
    558     },
    559     "surprise_contrarian": {
    560       "score": 1,
    561       "justification": "Results confirm the expected advantage of agentic approaches over simpler prompting, with no major counterintuitive findings."
    562     },
    563     "fear_safety": {
    564       "score": 0,
    565       "justification": "No safety, security, or risk angle discussed."
    566     },
    567     "drama_conflict": {
    568       "score": 0,
    569       "justification": "No controversy or conflict; straightforward comparison of two internal techniques."
    570     },
    571     "demo_ability": {
    572       "score": 0,
    573       "justification": "Entirely built on Google's proprietary codebase, fine-tuned models, and internal infrastructure with no public code or demo."
    574     },
    575     "brand_recognition": {
    576       "score": 3,
    577       "justification": "Google-authored paper about Google's internal bug repair infrastructure using Gemini, giving it high brand recognition."
    578     }
    579   },
    580   "hn_data": {
    581     "threads": [
    582       {
    583         "hn_id": "43876276",
    584         "title": "Agentic Bug Reproduction for Effective Automated Program Repair at Google",
    585         "points": 2,
    586         "comments": 0,
    587         "url": "https://news.ycombinator.com/item?id=43876276",
    588         "created_at": "2025-05-03T01:54:39Z"
    589       },
    590       {
    591         "hn_id": "45599001",
    592         "title": "Agentic Bug Reproduction for Effective Automated Program Repair at Google",
    593         "points": 1,
    594         "comments": 0,
    595         "url": "https://news.ycombinator.com/item?id=45599001",
    596         "created_at": "2025-10-15T22:20:39Z"
    597       }
    598     ],
    599     "top_points": 2,
    600     "total_points": 3,
    601     "total_comments": 0
    602   }
    603 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs