scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24728B)
      1 {
      2   "paper": {
      3     "title": "RepairAgent: An Autonomous, LLM-Based Agent for Program Repair",
      4     "authors": ["Islem Bouzenia", "Premkumar Devanbu", "Michael Pradel"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2403.17134"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "RepairAgent is the first autonomous LLM-based agent for program repair, fixing 164 bugs on Defects4J (including 39 not fixed by prior techniques) at a median cost of 14 cents per bug using GPT-3.5. Ablation studies show search tools, state machine guidance, and long-term memory each contribute significantly. The approach generalizes to GitBug-Java (13/100 correct fixes), though performance drops on multi-line and multi-file bugs.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "The paper states 'We will release the implementation of RepairAgent as open-source to foster future work' — this is a promise of future release, not an actual release with a URL."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The evaluation uses publicly available Defects4J and GitBug-Java datasets. The paper also mentions that LLM interaction logs are available for analysis."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper mentions Python 3.10, Docker, AutoGPT framework, and ANTLR, but does not provide a requirements.txt, Dockerfile, or detailed dependency listing sufficient to recreate the environment."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided. The implementation section (IV) describes tools used but not how to run the system."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Results are reported as point estimates (e.g., 164 bugs fixed, 14 cents per bug). No confidence intervals or error bars are provided despite the non-deterministic nature of LLM outputs."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper claims RepairAgent outperforms baselines (164 vs 162 for ChatRepair) but provides no statistical significance tests for these comparisons."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper reports absolute numbers of bugs fixed per approach and per project (Tables III, IV), giving context for comparisons (e.g., 164 vs 162 for ChatRepair, 90 vs 48 on Defects4Jv2)."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The full Defects4J (835 bugs) is used, which is standard, but the GitBug-Java subsample of 100/199 bugs and the ablation subsample of 100 bugs are not justified with any analysis."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No variance across runs is reported. The paper acknowledges 'non-deterministic output of LLMs' as a threat but does not report results across multiple runs."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Three baselines are compared: ChatRepair, ITER, and SelfAPR (Section V-A, Table III)."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "ChatRepair and ITER are described as 'the current state of the art' and are recent iterative LLM-based repair techniques."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Table VI presents ablation studies removing search tools, state machine, and long-term memory, plus testing realistic fault localization."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper reports plausible fixes, correct fixes, cost (tokens, dollars, time), and breakdowns by bug complexity (single-line, multi-line, multi-file)."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Correctness is determined by manual inspection: 'we manually determine whether the RepairAgent-generated fix is semantically consistent with the developer-created fix' (Section V-A)."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "GitBug-Java serves as a held-out evaluation with bugs discovered after GPT-3.5's training cutoff (2023 bugs vs January 2022 cutoff)."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Table III provides per-project breakdowns of results, and Table IV breaks down by bug complexity type (single-line, multi-line, multi-file)."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section VI-A discusses unfixed bugs, noting the agent 'sometimes suggests complex fixes for bugs that only require simple modifications' and struggles with multi-file bugs."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The ablation study reports configurations that perform worse. The GitBug-Java evaluation shows weaker performance on multi-line/multi-file bugs. The paper also notes RepairAgent fixes fewer single-line bugs than ChatRepair (115 vs 133)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims of 164 bugs fixed, 39 unique, 14 cents per bug, and 270K tokens are all supported by Tables III, IV, and Figure 9."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Causal claims from ablation studies (e.g., removing search tools halves effectiveness) use controlled single-variable manipulation on the same 100-bug sample (Table VI)."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title says 'Program Repair' broadly but results are only on Java bugs (Defects4J and GitBug-Java). The paper does not bound generalization to Java or to the specific bug types tested."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section VI-B discusses data leakage, missing test cases, fault localization accuracy, and non-deterministic LLM output as alternative explanations for the results."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper carefully distinguishes plausible fixes (pass tests) from correct fixes (semantically match developer fix), explicitly noting that plausible fixes 'are not necessarily correct' (Section V-B)."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper specifies 'GPT-3.5-0125 from OpenAI' (Section IV), which is a specific version identifier."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The prompt structure is described in detail (Table I, Sections III-C.1-8) but actual prompt text is not provided — only descriptions of what each section contains."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Some parameters are stated (40 cycles max, 30 fix variants, 12K token context, 4K token generation limit) but LLM sampling parameters (temperature, top-p) are not reported."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The agent scaffolding is described in extensive detail: the finite state machine (Figure 2), 14 tools (Table II), dynamic prompt format (Table I), middleware components, and the iterative cycle (Definition 1)."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "The GitBug-Java sampling procedure is described (random sample of 100/199, at least one and at most two per project). For Defects4J, the full dataset is used. Test output cleaning by the middleware is described."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section VI-B 'Threats to Validity and Limitations' provides substantive discussion of four specific threats."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Specific threats include: GPT-3.5 may have seen Defects4J (mitigated by GitBug-Java evaluation), Defects4J requires failing test cases which limits real-world applicability, fault localization dependency, and non-deterministic LLM output."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what it does NOT show. It does not bound its claims to Java, to bugs with failing test cases, or to the specific bug complexity levels where the approach works well."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The paper mentions LLM interaction logs are 'available for further analysis' but does not provide a URL or archive. Raw experimental data (per-bug results, logs) is not released."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Defects4J (835 bugs from 17 Java projects) and GitBug-Java (199 bugs from 55 projects, 100 sampled) are well-described standard benchmarks."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. The data sources are standard benchmarks (Defects4J, GitBug-Java)."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The pipeline from bug input through agent cycles to fix validation is documented. The correctness determination pipeline (syntactic match → manual semantic check) is described in Section V-A."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding information or acknowledgments section is present in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: University of Stuttgart (Bouzenia, Pradel) and UC Davis (Devanbu). Neither institution is affiliated with OpenAI, the vendor of the LLM used in the evaluation."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding is disclosed, so independence cannot be assessed. The paper uses OpenAI's GPT-3.5 but authors are not affiliated with OpenAI."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": true,
    229         "justification": "The paper states GPT-3.5's training data cutoff is January 2022, and uses this to justify the GitBug-Java experiment (2023 bugs)."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": true,
    234         "justification": "Section VI-B explicitly discusses data leakage: 'GPT-3.5 may have seen parts of the Java projects we evaluate on during training.' The GitBug-Java experiment with post-cutoff bugs is the mitigation."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "Defects4J was created in 2014, well before GPT-3.5's training cutoff, so its bugs and fixes may appear in the training data. The paper addresses this with the GitBug-Java evaluation (bugs from 2023, post-cutoff) and concludes 'RepairAgent is not much affected by potential data leakage.'"
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Section V-C and Figure 9 report median 270K tokens, 14 cents per bug, and 920 seconds median time. Distributions for fixed vs unfixed bugs are provided."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "Total API spend for the full evaluation is not stated. Per-bug costs are given but the total compute budget (e.g., total dollars spent, total machine hours for Docker test execution) is not reported."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No multiple-seed experiments are reported. The paper acknowledges LLM non-determinism as a threat but does not run multiple seeds."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The paper does not state how many runs produced the reported results. It appears to be a single run."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "Parameters like 40 cycles, 30 fix variants, and Levenshtein threshold 0.1 are stated but no search budget or justification for how these values were selected is provided."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Default configuration parameters (40 cycles, 30 variants) are used without justification for why these values were selected over alternatives."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": false,
    314         "answer": false,
    315         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors compare their system against baselines using 'patches provided by the authors of the respective approaches' (fair comparison), but do not acknowledge self-comparison bias or the possibility that their own system received more tuning attention."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "Section V-C compares costs across approaches: RepairAgent at 14 cents vs ChatRepair at 42 cents (or 14 cents adjusted), and 920s median vs ITER's 4.57 hours. The paper also notes RepairAgent generates 117 patches vs ITER's 1000."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The paper does not discuss whether Defects4J actually measures program repair capability in realistic settings. Threats mention missing test cases and fault localization but not whether the benchmark's bug distribution is representative."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "RepairAgent's scaffold is fundamentally different from baselines (agent with tools vs iterative prompting). The paper attributes improvements to the agent-based approach but does not disentangle scaffold effects from other design choices."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": true,
    342         "justification": "The paper explicitly addresses temporal leakage by noting GPT-3.5's January 2022 cutoff and evaluating on GitBug-Java (2023 bugs) to test for data leakage effects."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "The paper uses perfect fault localization by default, which provides information not available in real settings. While acknowledged and ablated (Table VI), it is not framed as feature leakage."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether Defects4J bugs share structural similarities, or whether projects that also appear in the LLM's training data create non-independence between the training data and the evaluated bugs."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": true,
    357         "justification": "The GitBug-Java evaluation uses a temporal split (bugs from 2023 vs January 2022 training cutoff) as a concrete method for detecting leakage effects, showing similar performance on single-line bugs."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "RepairAgent correctly fixes 164 bugs on Defects4J, including 39 not fixed by any prior technique.",
    364       "evidence": "Table III shows 164 correct fixes. Figure 6 shows the Venn diagram with 39 unique fixes.",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "RepairAgent establishes a new state of the art in program repair on Defects4J.",
    369       "evidence": "164 fixes vs ChatRepair's 162 (Table III). However, RepairAgent fixes fewer on Defects4Jv1.2 (74 vs 114) and more on v2 (90 vs 48).",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "The median cost is 14 cents per bug (270K tokens) using GPT-3.5.",
    374       "evidence": "Figure 9 shows token/cost distributions. Comparable to ChatRepair's adjusted cost.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "RepairAgent is not much affected by data leakage of Defects4J.",
    379       "evidence": "GitBug-Java evaluation (Table V) shows 9/19 single-line bugs fixed (similar rate), but only 4/81 multi-line/multi-file bugs. The complexity difference in GitBug-Java confounds the comparison.",
    380       "supported": "weak"
    381     },
    382     {
    383       "claim": "Search tools, state machine guidance, and long-term memory each contribute significantly to effectiveness.",
    384       "evidence": "Table VI ablation: removing search tools drops from 21 to 11 correct fixes; removing state machine drops to 14; single-cycle memory drops to 6.",
    385       "supported": "moderate"
    386     }
    387   ],
    388   "red_flags": [
    389     {
    390       "flag": "Perfect fault localization assumption",
    391       "detail": "The default evaluation assumes perfect fault localization (oracle providing exact buggy lines), which is unrealistic. The ablation with GZoltar shows a 25% drop in effectiveness and 81% cost increase, suggesting the main results overstate real-world applicability."
    392     },
    393     {
    394       "flag": "Single-run results with non-deterministic model",
    395       "detail": "Results appear to be from a single run of a non-deterministic LLM. The 164 vs 162 comparison with ChatRepair could easily be within noise, but no variance or multiple runs are reported."
    396     },
    397     {
    398       "flag": "Marginal improvement claimed as state of the art",
    399       "detail": "164 vs 162 bugs (1.2% improvement) is claimed as 'establishing a new state of the art.' Without statistical testing or multiple runs, this difference is not meaningful. RepairAgent actually fixes fewer bugs on Defects4Jv1.2 (74 vs 114)."
    400     },
    401     {
    402       "flag": "Confounded generalization claim",
    403       "detail": "The data leakage mitigation (GitBug-Java) is confounded by the dataset having more complex bugs (mean 577 modified tokens vs 381 for Defects4J), making it impossible to isolate the leakage effect from the complexity effect."
    404     }
    405   ],
    406   "cited_papers": [
    407     {
    408       "title": "Keep the conversation going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT",
    409       "authors": ["C. S. Xia", "L. Zhang"],
    410       "year": 2023,
    411       "relevance": "Primary baseline — iterative LLM-based repair without agent approach."
    412     },
    413     {
    414       "title": "Iter: Iterative neural repair for multi-location patches",
    415       "authors": ["H. Ye", "M. Monperrus"],
    416       "year": 2024,
    417       "relevance": "Baseline iterative repair approach for multi-location bugs."
    418     },
    419     {
    420       "title": "SelfAPR: Self-supervised program repair with test execution diagnostics",
    421       "authors": ["H. Ye", "M. Martinez", "X. Luo", "T. Zhang", "M. Monperrus"],
    422       "year": 2022,
    423       "relevance": "Self-supervised repair baseline using test execution feedback."
    424     },
    425     {
    426       "title": "A survey on large language model based autonomous agents",
    427       "authors": ["L. Wang", "C. Ma", "X. Feng"],
    428       "year": 2023,
    429       "relevance": "Survey of LLM-based autonomous agents — foundational context for agent-based SE."
    430     },
    431     {
    432       "title": "Augmented language models: a survey",
    433       "authors": ["G. Mialon", "R. Dessì", "M. Lomeli"],
    434       "year": 2023,
    435       "arxiv_id": "2302.07842",
    436       "relevance": "Survey of tool-augmented LLMs that inspired RepairAgent's approach."
    437     },
    438     {
    439       "title": "Toolformer: Language models can teach themselves to use tools",
    440       "authors": ["T. Schick", "J. Dwivedi-Yu"],
    441       "year": 2023,
    442       "arxiv_id": "2302.04761",
    443       "relevance": "Foundational work on LLMs using tools via APIs."
    444     },
    445     {
    446       "title": "Evaluating large language models trained on code",
    447       "authors": ["M. Chen"],
    448       "year": 2021,
    449       "arxiv_id": "2107.03374",
    450       "relevance": "Codex/HumanEval — foundational LLM code generation benchmark."
    451     },
    452     {
    453       "title": "AutoCodeRover: Autonomous program improvement",
    454       "authors": ["Y. Zhang", "H. Ruan", "Z. Fan", "A. Roychoudhury"],
    455       "year": 2024,
    456       "relevance": "Concurrent autonomous agent for program improvement, directly comparable approach."
    457     },
    458     {
    459       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    460       "authors": ["J. Yang", "C. E. Jimenez", "K. Lieret", "S. Yao"],
    461       "year": 2024,
    462       "relevance": "Concurrent agent-based approach for automated software engineering tasks."
    463     },
    464     {
    465       "title": "CodaMOSA: Escaping coverage plateaus in test generation with pre-trained large language models",
    466       "authors": ["C. Lemieux", "J. P. Inala", "S. K. Lahiri", "S. Sen"],
    467       "year": 2023,
    468       "relevance": "LLM-augmented test generation — related agent-like approach for SE."
    469     },
    470     {
    471       "title": "De-hallucinator: Iterative grounding for LLM-based code completion",
    472       "authors": ["A. Eghbali", "M. Pradel"],
    473       "year": 2024,
    474       "arxiv_id": "2401.01701",
    475       "relevance": "Addresses LLM hallucination in code generation, motivating RepairAgent's search tools."
    476     },
    477     {
    478       "title": "CIGAR: Cost-efficient program repair with LLMs",
    479       "authors": ["D. Hidvégi", "K. Etemadi", "S. Bobadilla", "M. Monperrus"],
    480       "year": 2024,
    481       "relevance": "Cost-efficient LLM repair approach, relevant to cost analysis comparisons."
    482     }
    483   ]
    484 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs