calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (16221B)
      1 {
      2   "calibration": {
      3     "paper_slug": "agentic-bug-reproduction-2025",
      4     "calibration_date": "2026-02-28",
      5     "sonnet_scan_date": "2026-02-28",
      6     "model": "opus",
      7     "agreement_rate": 1.0,
      8     "total_questions": 50,
      9     "agreements": 50,
     10     "disagreements": 0,
     11     "disagreement_details": []
     12   },
     13   "opus_checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No repository URL or code archive is provided anywhere in the paper. BRT Agent and the adapted LIBRO run on Google's internal infrastructure with no public release mentioned."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The 80-bug evaluation dataset comes from Google's internal issue tracking system (GITS) and is proprietary. No public dataset download link is provided."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No requirements.txt, Dockerfile, or dependency versions are provided. The paper references Google-internal infrastructure (Bazel, GITS) without specifying reproducible environment details."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided. The entire experimental setup relies on Google-internal tools, datasets, and models not accessible to outside researchers."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Results are reported as point estimates only (e.g., '28% plausible BRT generation rate', '70% precision at K=1'). Despite performing 20-50 runs per bug, no confidence intervals or error bars are reported."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No statistical significance tests are used when comparing BRT Agent (28%) vs LIBRO (10%) or any other comparison in the paper. No p-values, t-tests, or other hypothesis tests are reported."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Raw percentage differences are given (28% vs 10%, 30% more bugs with plausible fixes) but no formal effect size measures such as Cohen's d or odds ratios are reported."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The 80-bug dataset is acknowledged in §7 as 'a relatively small dataset' that 'could limit the generalizability of our results,' but no power analysis or quantitative justification for the sample size is provided."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Despite running LIBRO 50 times and BRT Agent 20 times per bug, no standard deviation, variance, IQR, or any spread measure is reported in Table 2, Table 3, or anywhere else. Only aggregate pass/fail rates are shown."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "LIBRO is used as the primary baseline comparison throughout the paper (Table 2, Table 3). SWE-Agent+ is discussed as related work but not directly evaluated on the same dataset."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "LIBRO (ICSE 2023, TSE 2024 extension) is a contemporary baseline in BRT generation. SWE-Agent+ (NeurIPS 2024) is also cited as state-of-the-art and discussed in Section 3."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No ablation study is conducted. Individual components of BRT Agent (fine-tuned code-editing LLM, agentic scaffolding, ReAct loop, code_search action) are not systematically isolated to measure their individual contributions."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "RQ1 uses Candidate BRTs, Plausible BRTs, and Candidate-to-Plausible rate. RQ3 uses Precision, Recall, F1-score, and MRR. Multiple metrics are used across the evaluation."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Section 6.1 describes manual inspection: 'two authors inspect the BRTs and a third author resolves any disagreement.' This constitutes human evaluation of the system's outputs."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Section 4.2.3 states the code-editing LLM's 'training data cutoff predates the reporting of all bugs analyzed in this study.' All bugs are from June 2024 or later, ensuring no training overlap."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Table 3 provides a breakdown of plausible BRT rates by programming language (Java, C++, Go, Python, Kotlin, Dart, TypeScript) for both LIBRO and BRT Agent."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 6.1 discusses failure modes: LIBRO's main failure is build errors from which it cannot recover. BRT Agent failures include step exhaustion (21%), irrelevant assertions (16%), invalid patches modifying existing tests (11%), and hallucinated non-existent actions."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Several negative results are reported: Dart achieves 0% plausible BRTs for both techniques; LIBRO drops from 33% on Defects4J to 10% on Google data; the agent hallucinates actions not in its action set; 11% of plausible BRT patches are invalid."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims are confirmed: 28% vs 10% (Table 2), 30% more bugs with plausible fixes (§6.2: 17/23 vs 13/23), 70% precision at top-1 (§6.3.1, Figure 5). All major abstract claims match the results."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper claims 'providing BRTs to Passerine results in 30% more bugs with plausible fixes' — a causal claim. The comparison is between runs with and without a BRT on the same 23 bugs (a selected subset where BRT Agent succeeded), but there is no randomization, no discussion of confounds, and no acknowledgment that the subset selection could bias results."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper explicitly bounds scope: 'our study focuses exclusively on Google's internal development environment' (§7) and 'the generalizability of our findings to other industrial settings requires further investigation.'"
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The Threats to Validity section (§7) lists methodological limitations but does not discuss alternative explanations for the observed results. For example, it does not consider whether the fine-tuned Gemini model alone (vs the agentic scaffolding) explains the improvement, or whether the 23-bug subset for RQ2 is biased."
    130       }
    131     },
    132     "setup_transparency": {
    133       "model_versions_specified": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper refers to 'a Gemini model fine-tuned on Google's internal code' and 'a publicly available Gemini' (§5.1.2) without specifying which Gemini version (e.g., 1.5 Pro, 2.0) or any snapshot/API date."
    137       },
    138       "prompts_provided": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The paper describes prompts in natural language (§4.2.3, §4.1) and mentions 'three synthetic BRT generation examples in the system prompt' but does not provide the actual prompt text. Only a meta task description fragment is quoted. The reader cannot reconstruct the prompts."
    142       },
    143       "hyperparameters_reported": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Section 5.1.2 reports temperature (0.7 for LIBRO, 0.2 for BRT Agent), top-P (0.95 for both), number of runs (50 for LIBRO, 20 for BRT Agent), and max steps (25 for BRT Agent)."
    147       },
    148       "scaffolding_described": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section 4.2 describes the ReAct loop, Table 1 details all available actions (cat, code_search, edit, bazel test, finish), the dual-LLM architecture (reasoning vs code-editing), the change description mechanism, and termination conditions."
    152       },
    153       "data_preprocessing_documented": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section 5.1.1 documents the dataset construction: automated extraction and filtering, followed by manual curation ensuring fixes address root causes. The paper references [30] for full pipeline details and explains the 80 vs 78 bug discrepancy in a footnote."
    157       }
    158     },
    159     "limitations_and_scope": {
    160       "limitations_section_present": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Section 7 'Threats to Validity' provides substantive discussion across internal validity, external validity, and construct validity."
    164       },
    165       "threats_to_validity_specific": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "The threats are specific to this study: small 80-bug dataset limiting generalizability, implementation bias from LIBRO adaptation, same Gemini model family for both techniques, LLM randomness, and EPR as an indirect proxy for fix correctness."
    169       },
    170       "scope_boundaries_stated": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "The paper states results are for Google's internal environment only, findings may not generalize to other industrial settings, and metrics 'may not fully capture all aspects of a BRT, such as its readability, maintainability' (§7)."
    174       }
    175     },
    176     "data_integrity": {
    177       "raw_data_available": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "The 80-bug evaluation dataset from Google's internal issue tracker (GITS) is proprietary and not publicly available. Raw data cannot be independently verified."
    181       },
    182       "data_collection_described": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section 5.1.1 describes data collection: bugs from GITS, reported and fixed by human developers, from diverse Google projects, all recent since June 2024, spanning 7 languages. Dataset constructed via automated extraction, filtering, and manual curation [30]."
    186       },
    187       "recruitment_methods_described": {
    188         "applies": false,
    189         "answer": false,
    190         "justification": "No human participants are involved. The data source is Google's internal issue tracker, not participant recruitment. Bug selection is automated and curated, not recruited."
    191       },
    192       "data_pipeline_documented": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "The pipeline is described: automated extraction → filtering → manual curation, with criteria (fixes verified to address root cause). A footnote explains the 80 vs 78 bug discrepancy with concurrent work [30]."
    196       }
    197     },
    198     "conflicts_of_interest": {
    199       "funding_disclosed": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "No acknowledgments section or funding statement is present. A footnote notes Cheng and Cito conducted research at Google, but no explicit funding disclosure exists."
    203       },
    204       "affiliations_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Author affiliations are listed: 6 of 8 authors are from Google. A footnote explicitly states Cheng and Cito conducted this research at Google. The corporate connection is transparent."
    208       },
    209       "funder_independent_of_outcome": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "Research conducted at Google, evaluating Google's own APR system (Passerine) and fine-tuned Gemini models on Google's proprietary data. Google has a direct financial interest in positive results for its tools."
    213       },
    214       "financial_interests_declared": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No competing interests statement, patent disclosures, or financial interest declarations appear in the paper."
    218       }
    219     },
    220     "contamination": {
    221       "training_cutoff_stated": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "Section 4.2.3 states the code-editing LLM's 'training data cutoff predates the reporting of all bugs analyzed in this study,' addressing the training-data boundary."
    225       },
    226       "train_test_overlap_discussed": {
    227         "applies": true,
    228         "answer": true,
    229         "justification": "Section 4.2.3 explicitly addresses this: 'the code-editing LLM's training data excludes all bugs, code changes, and BRTs used in our empirical evaluation—its training data cutoff predates the reporting of all bugs analyzed in this study, preventing any potential data leakage.'"
    230       },
    231       "benchmark_contamination_addressed": {
    232         "applies": true,
    233         "answer": true,
    234         "justification": "The evaluation uses internal Google bugs reported since June 2024, which postdate the training data cutoff of the code-editing LLM. Contamination risk is directly addressed in §4.2.3."
    235       }
    236     },
    237     "human_studies": {
    238       "pre_registered": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants. This is a software engineering evaluation study using automated tools on an internal bug dataset."
    242       },
    243       "irb_or_ethics_approval": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants involved. The study evaluates automated tools on an internal bug dataset."
    247       },
    248       "demographics_reported": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants. The manual inspectors (§6.1) are research team members, not recruited participants."
    252       },
    253       "inclusion_exclusion_criteria": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants. Bug selection criteria are covered under data collection."
    257       },
    258       "randomization_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participant randomization applicable. The study evaluates automated tools, not a controlled experiment with human subjects."
    262       },
    263       "blinding_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participant blinding applicable. Manual inspection in §6.1 uses two inspectors with a third for disagreements, but this is not a blinded study design."
    267       },
    268       "attrition_reported": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants involved in the study."
    272       }
    273     },
    274     "cost_and_practicality": {
    275       "inference_cost_reported": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "No API costs, token counts, or latency figures are reported. Running 80 bugs * 20 runs * up to 25 steps each with Gemini models represents substantial usage, but costs are not quantified."
    279       },
    280       "compute_budget_stated": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No total compute budget, GPU hours, or API spend is reported. The computational resources required for 50 LIBRO runs and 20 BRT Agent runs per bug on 80 bugs are not quantified."
    284       }
    285     }
    286   }
    287 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs