scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29674B)
      1 {
      2   "paper": {
      3     "title": "RelRepair: Enhancing Automated Program Repair by Retrieving Relevant Code",
      4     "authors": ["Shunyu Liu", "Guangdong Bai", "Mark Utting", "Guowei Yang"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2509.16701",
      8     "doi": "10.48550/arXiv.2509.16701"
      9   },
     10   "scan_version": 3,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "RelRepair uses retrieval-augmented generation to enhance LLM-based automated program repair by retrieving relevant function signatures and code snippets from the project codebase. The approach fixes 101 bugs in Defects4J V1.2 (112.8% improvement over base ChatGPT) and achieves 48.3% fix rate on ManySStuBs4J (up from 31.2% baseline). Query rewriting is critical, boosting total repairs from 20 to 54 in ablation. SnipRepair (code snippet retrieval) handles complex bugs more effectively than SigRepair (function signature retrieval), but at higher computational cost.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL, GitHub link, or archive link is provided anywhere in the paper. The implementation is described (Section 5.3) but no source code is released."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses two publicly available benchmarks: Defects4J V1.2 (Section 5.1, ref [25]) and ManySStuBs4J (ref [26]). Both are standard public datasets."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "Section 5.3 mentions Python, gpt-3.5, SentenceBERT, and CodeBERT, but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The algorithmic descriptions (Algorithms 1-3) describe the approach but not how to run the experiments."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results are reported as point estimates (e.g., 101 bugs fixed, 48.3% fix rate). No confidence intervals or error bars appear in any table or figure."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims RelRepair 'outperforms' baselines based solely on comparing raw counts (e.g., 101 vs 98 vs 76). No statistical significance tests are performed."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Percentage improvements with baseline context are reported: '112.8%' improvement over BaseChatGPT, '37.2%' improvement with ChatGPT-4, fix rate 'from 31.2% to 48.3%' (Section 6.1.2, 6.2.2)."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "For ManySStuBs4J, 30 bugs per category are selected 'to balance efficiency and representativeness' (Section 5.1), but no power analysis or statistical justification is given. For Defects4J, the standard benchmark is used without justification of sufficiency."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be from single experimental runs despite the stochastic nature of LLM sampling at temperature=1."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper compares against four state-of-the-art APR tools (ChatRepair, ThinkRepair, Mulpor, RAP-Gen) plus a BaseChatGPT baseline without retrieval (Section 5.2, Table 1)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include ThinkRepair (2024), Mulpor (2024), ChatRepair (2023), and RAP-Gen (2023), all published within 1-2 years of this work."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "RQ3 (Section 6.3) provides a thorough ablation: BaseRepair vs SigRepair vs SnipRepair (Table 4), query rewriting on/off (Table 5), parameter selection analysis (Figure 8), and resource consumption analysis (Figure 7)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The paper uses essentially one metric: number of correct patches (patches passing all tests and semantically equivalent to the actual fix). Section 5.2 states 'we adopt this widely used evaluation metric: correct patches.' Fix rate is just a normalization of the same count."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 5.2 and Section 7 describe manual validation: 'we carefully examined each generated patch to ensure accurate validation' and 'manual validation to identify plausible patches that are functionally equivalent to the actual fixes.'"
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Defects4J V1.2 and ManySStuBs4J are standard, established benchmarks not used for tuning the approach. Retrieval is done per-bug from the project codebase, not from the benchmark itself."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by project for Defects4J (Table 1, Table 3, Table 4) and by 16 bug categories for ManySStuBs4J (Table 6, Figure 6)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "The paper focuses on successes (e.g., Chart-10 example in Section 6.1.2). No systematic error analysis or discussion of where/why the approach fails is provided. The Venn diagram shows complementarity but not failure patterns."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 6.3.2 reports that SigRepair has limited effectiveness for complex bugs, query rewriting removal causes dramatic degradation (54→20 fixes, Table 5), and SigRepair saturates quickly despite increased patches (Figure 7)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims '101 bugs in Defects4J V1.2' matches Table 1, '17.1% improvement in ManySStuBs4J' and '48.3%' fix rate are supported in Section 6.1.2 and Table 6/Figure 6."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper makes causal claims that retrieval 'improves' repair. The ablation study (RQ3) systematically isolates each component (BaseRepair, SigRepair, SnipRepair) and query rewriting (Table 5), providing adequate controlled single-variable manipulation for these causal claims."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper tests only on Java benchmarks but makes broad claims: 'retrieving relevant code from the project codebase is essential for enhancing APR' (Section 9). The title 'Enhancing Automated Program Repair' does not specify Java. The abstract claims extend to 'APR tasks' generally."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The threats to validity section (Section 7) is brief and does not discuss alternative explanations for the observed improvements — e.g., whether simply providing more context tokens (regardless of relevance) would help, or whether the improvement comes from increased sampling budget rather than retrieval quality."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures 'correct patches' (passing all tests and semantically equivalent to the fix) and claims to fix bugs. The measurement matches the claim granularity — no proxy gap exists."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Section 5.3 says 'gpt-3.5 model from the ChatGPT family' without a specific version (e.g., gpt-3.5-turbo-0613). RQ2 uses 'ChatGPT-4', 'CodeLlama-13b', 'DeepSeek-33b' but without snapshot dates or API versions."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "Only a fragment of the prompt is shown: 'You are an expert in program repair' (Section 4.1.1). The full prompts used for BaseRepair, SigRepair query rewriting (promptSIG), and SnipRepair are described in natural language but not provided in full."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 5.3 reports temperature=1. Retrieval parameters are specified: K=25 for SigRepair, K=15 intra + K=15 inter for SnipRepair. Patch counts: 1 (BaseRepair), 20 (SigRepair), 300 (SnipRepair). Timeout: 5 hours. α=0.5, β=0.5 initial weights."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The three-stage pipeline (BaseRepair → SigRepair → SnipRepair) is thoroughly described with formal algorithms (Algorithms 1-3), a framework diagram (Figure 3), and detailed descriptions of each stage's query rewriting, indexing, and retrieval components (Sections 4.1-4.3)."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Sections 4.1.2 and 4.2.2 describe how retrieval datasets are constructed per bug: variable-based and file-based function selection for SigRepair, intra-file and inter-file function extraction for SnipRepair, with SentenceBERT and CodeBERT encoding. Section 5.1 describes the ManySStuBs4J sampling (30 per category)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7 'Threats to Validity' discusses internal threats (manual validation correctness) and external threats (representativeness of evaluation subjects)."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "Section 7 is brief (two short paragraphs) with fairly generic content. Internal threat: manual validation may be inaccurate (standard caveat). External threat: subjects may not be representative (standard caveat). No threats specific to this study's design choices are discussed."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what the results do NOT show. No mention that results are limited to Java, to single-function bugs, to settings with perfect fault localization, or that they do not demonstrate practical applicability beyond benchmark settings."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "Generated patches, retrieval results, and detailed per-bug outcomes are not released. Only aggregate counts appear in tables."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 5.1 describes Defects4J V1.2 (391 bugs, 6 Java projects, single-function focus yielding 255 bugs) and ManySStuBs4J (150,000+ bugs, 16 categories, 30 per category selected for 480 total). Table 2 provides detailed statistics."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data comes from standard public benchmarks (Defects4J V1.2, ManySStuBs4J)."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "For ManySStuBs4J, the paper says 'we select 30 bugs from each category' (Section 5.1) without explaining the selection method (random sampling? first 30? specific criteria?). The path from raw benchmark data to final evaluation results is incompletely documented."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source, acknowledgments section, or grant information is mentioned anywhere in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All four authors are clearly listed as affiliated with The University of Queensland, Brisbane, Australia. They do not evaluate a product from their own institution."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed, so independence cannot be assessed. The paper uses OpenAI's API but there is no indication of OpenAI funding."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement or financial disclosure appears in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper uses gpt-3.5 (and gpt-4, CodeLlama, DeepSeek in RQ2) but never states training data cutoff dates for any model."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "Defects4J V1.2 (published 2014) and ManySStuBs4J (published 2020) are both well-known benchmarks that likely appear in GPT-3.5's training data. This potential overlap is never discussed."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "Both benchmarks were published years before GPT-3.5's training cutoff. The paper does not discuss contamination risk despite using models that could have memorized solutions from these widely-studied benchmarks."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study. The paper evaluates automated tools on software benchmarks."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Figure 7 shows per-bug API costs (up to $0.27 for 300 patches in SnipRepair). Section 5.3 mentions a 5-hour timeout and 'average repair time is significantly lower, typically under 20 minutes.'"
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "While per-bug costs are shown in Figure 7, the total computational budget for the full experimental campaign (API spend across all 735+ bugs, total GPU time for embeddings) is not reported."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "LLM sampling uses temperature=1 (stochastic), but results are reported from what appears to be a single experimental run with no seed sensitivity analysis or multiple-seed results."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Section 5.3 explicitly states the number of candidate patches per stage: 1 for BaseRepair, 20 for SigRepair, up to 300 for SnipRepair. The SigRepair process is repeated 20 times (Algorithm 2)."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Figure 8 shows parameter sensitivity curves for K values in SigRepair and SnipRepair, but the total compute spent on search and the number of configurations tried across the full parameter space are not reported."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "Figure 8 shows performance vs. parameter values with clear diminishing-returns justification for chosen K=25 (SigRepair) and K=15 (SnipRepair-Intra and SnipRepair-Inter). Selection is based on performance plateaus."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The paper compares against multiple baselines across multiple projects and bug categories but performs no statistical tests at all, let alone corrections for multiple comparisons."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors implement and evaluate their own system against baselines. For some baselines they 'reference reported outcomes from prior studies' (Section 5.2), but they do not acknowledge the systematic bias of authors evaluating their own method."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Figure 7 directly plots the number of correct patches against API cost for both SigRepair and SnipRepair, showing the cost-performance tradeoff explicitly."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper uses Defects4J V1.2 and ManySStuBs4J without discussing whether these benchmarks adequately measure real-world APR capability. No discussion of construct validity or comparison with alternative evaluation approaches."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "When comparing against ThinkRepair, RAP-Gen, and other baselines, different scaffolding/retrieval strategies are used. The paper does not disentangle whether improvements come from the retrieval mechanism vs. the additional context/sampling budget. RQ2 partially addresses this by testing multiple LLMs with the same scaffold."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "Defects4J V1.2 (2014) and ManySStuBs4J (2020) both predate GPT-3.5's training cutoff, so solutions to these bugs may be in its training data. This temporal leakage risk is not discussed."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "The paper assumes perfect fault localization (Section 6.1.1), providing statement-level fault information. While this follows prior work conventions, the paper does not discuss how this information leakage affects real-world applicability."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether training data and test data are independent. The LLMs may have seen Defects4J bugs and their fixes during pre-training."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention methods (canary strings, membership inference, n-gram overlap analysis, decontamination) are used."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "RelRepair fixes 101 bugs in Defects4J V1.2, outperforming all compared LLM-based APR approaches.",
    365       "evidence": "Table 1 shows 101 total fixes vs. ThinkRepair (98), ChatRepair (76), Mulpor (70), RAP-Gen (42), and BaseChatGPT (47). Detailed per-project breakdown in Table 1 (Section 6.1.2).",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "RelRepair uniquely fixes 28 bugs not fixed by any other compared tool.",
    370       "evidence": "Venn diagram (Figure 4) shows 28 unique fixes when compared to RAP-Gen, Mulpor, and ThinkRepair on Defects4J V1.2 (Section 6.1.2). ChatRepair is excluded from the comparison.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "RelRepair achieves a 48.3% fix rate on ManySStuBs4J, a 17.1 percentage-point improvement over the 31.2% baseline.",
    375       "evidence": "Section 6.1.2 and Table 6/Figure 6 report fix rates per bug category and overall. Evaluation uses exact-match comparison against ground-truth fixes.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "RelRepair with ChatGPT-4 fixes 120 bugs, a 37.2% improvement over base ChatGPT-4 (86 bugs).",
    380       "evidence": "Table 3 (Section 6.2.2) shows results across four LLMs with and without RelRepair on Defects4J V1.2.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Query rewriting is critical: removing it drops total fixes from 54 to 20.",
    385       "evidence": "Table 5 (Section 6.3.2) shows SigRepair drops from 15 to 4 fixes and SnipRepair drops from 39 to 16 fixes without query rewriting.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "SnipRepair is more effective than SigRepair for complex bugs.",
    390       "evidence": "Table 4 shows SnipRepair fixes 39 bugs vs. SigRepair's 15 on Defects4J V1.2. Section 6.3.2 analyzes per-category differences on ManySStuBs4J showing SnipRepair excels on categories requiring deeper understanding.",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "No statistical tests or uncertainty quantification",
    397       "detail": "All comparisons are based on raw counts without any statistical tests, confidence intervals, or error bars. The difference between RelRepair (101) and ThinkRepair (98) is 3 bugs — without statistical testing, this difference may not be meaningful, especially with stochastic LLM sampling at temperature=1."
    398     },
    399     {
    400       "flag": "Benchmark contamination risk ignored",
    401       "detail": "Defects4J V1.2 (2014) and ManySStuBs4J (2020) are both well-known benchmarks likely present in GPT-3.5/4 training data. The paper never discusses whether the LLMs may have memorized solutions, which could inflate results for all approaches but especially the baseline."
    402     },
    403     {
    404       "flag": "Perfect fault localization assumption",
    405       "detail": "All experiments assume perfect statement-level fault localization (Section 6.1.1). This significantly simplifies the problem and inflates results compared to real-world settings where fault localization is itself a hard problem."
    406     },
    407     {
    408       "flag": "ManySStuBs4J exact-match evaluation",
    409       "detail": "For ManySStuBs4J, 'correctness of generated fixes is assessed solely by directly comparing the generated fix to the provided ground-truth solution; only exact matches are considered correct' (Section 6.1.1). This may severely undercount functionally correct but syntactically different patches."
    410     },
    411     {
    412       "flag": "Selective baseline comparison",
    413       "detail": "ChatRepair is excluded from the Venn diagram comparison (Figure 4) due to 'unavailability of its generated patches on GitHub.' This omits a key baseline from the uniqueness analysis while still including its numbers in Table 1."
    414     },
    415     {
    416       "flag": "No code released for a tool paper",
    417       "detail": "Despite proposing a concrete tool/system with algorithms and implementation details, no source code repository is provided, preventing independent verification or reproduction."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "Keep the conversation going: Fixing 162 out of 337 bugs for $0.42 each using chatgpt",
    423       "authors": ["C. S. Xia", "L. Zhang"],
    424       "year": 2023,
    425       "arxiv_id": "2304.00385",
    426       "relevance": "LLM-based APR using ChatGPT's conversational interface for iterative repair, direct baseline in this evaluation."
    427     },
    428     {
    429       "title": "ThinkRepair: Self-directed automated program repair",
    430       "authors": ["X. Yin", "C. Ni", "S. Wang", "Z. Li", "L. Zeng", "X. Yang"],
    431       "year": 2024,
    432       "relevance": "Chain-of-Thought-based APR technique, strongest baseline achieving 98 fixes on Defects4J V1.2."
    433     },
    434     {
    435       "title": "RAP-Gen: Retrieval-augmented patch generation with CodeT5 for automatic program repair",
    436       "authors": ["W. Wang", "Y. Wang", "S. Joty", "S. C. Hoi"],
    437       "year": 2023,
    438       "relevance": "Retrieval-augmented APR using historical bug-fix pairs, closest related approach to RelRepair."
    439     },
    440     {
    441       "title": "One size does not fit all: Multi-granularity patch generation for better automated program repair",
    442       "authors": ["B. Lin", "S. Wang", "M. Wen", "L. Chen", "X. Mao"],
    443       "year": 2024,
    444       "relevance": "Multi-granularity LLM-based APR approach, baseline in Defects4J comparison."
    445     },
    446     {
    447       "title": "Automated program repair in the era of large pre-trained language models",
    448       "authors": ["C. S. Xia", "Y. Wei", "L. Zhang"],
    449       "year": 2023,
    450       "relevance": "Foundational study on LLM-based APR establishing experimental conventions followed by this paper."
    451     },
    452     {
    453       "title": "AutoCodeRover: Autonomous program improvement",
    454       "authors": ["Y. Zhang", "H. Ruan", "Z. Fan", "A. Roychoudhury"],
    455       "year": 2024,
    456       "arxiv_id": "2404.05427",
    457       "relevance": "Autonomous agentic APR integrating AST analysis, represents agentic approach to program repair."
    458     },
    459     {
    460       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    461       "authors": ["P. Lewis", "E. Perez", "A. Piktus"],
    462       "year": 2020,
    463       "relevance": "Foundational RAG paper that RelRepair's retrieval strategy is based on."
    464     },
    465     {
    466       "title": "CURE: Code-aware neural machine translation for automatic program repair",
    467       "authors": ["N. Jiang", "T. Lutellier", "L. Tan"],
    468       "year": 2021,
    469       "relevance": "Code-aware NMT approach to APR, representative of learning-based APR methods."
    470     },
    471     {
    472       "title": "Code Llama: Open foundation models for code",
    473       "authors": ["B. Roziere", "J. Gehring", "F. Gloeckle"],
    474       "year": 2023,
    475       "arxiv_id": "2308.12950",
    476       "relevance": "Open-source code LLM used as one of four models in RelRepair's multi-LLM evaluation (RQ2)."
    477     },
    478     {
    479       "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs",
    480       "authors": ["R. Just", "D. Jalali", "M. D. Ernst"],
    481       "year": 2014,
    482       "relevance": "Primary evaluation benchmark for APR research, used by RelRepair and all baselines."
    483     },
    484     {
    485       "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning",
    486       "authors": ["C. S. Xia", "L. Zhang"],
    487       "year": 2022,
    488       "relevance": "Zero-shot LLM APR establishing experimental conventions followed in this paper."
    489     },
    490     {
    491       "title": "How often do single-statement bugs occur? The ManySStuBs4J dataset",
    492       "authors": ["R.-M. Karampatsis", "C. Sutton"],
    493       "year": 2020,
    494       "relevance": "Large-scale single-statement bug dataset used as second evaluation benchmark in this paper."
    495     }
    496   ],
    497   "engagement_factors": {
    498     "practical_relevance": {
    499       "score": 2,
    500       "justification": "RAG for APR is a practically useful technique that practitioners could adapt, but no code is released for direct use."
    501     },
    502     "surprise_contrarian": {
    503       "score": 0,
    504       "justification": "Confirming that providing relevant context helps LLMs generate better code patches is expected and aligns with conventional wisdom about RAG."
    505     },
    506     "fear_safety": {
    507       "score": 0,
    508       "justification": "No safety or security concerns raised; the paper focuses on bug fixing, not adversarial or harmful applications."
    509     },
    510     "drama_conflict": {
    511       "score": 0,
    512       "justification": "No controversy, no challenges to existing benchmarks or claims by others."
    513     },
    514     "demo_ability": {
    515       "score": 0,
    516       "justification": "No code repository, demo, or installable tool is released."
    517     },
    518     "brand_recognition": {
    519       "score": 1,
    520       "justification": "Uses ChatGPT/GPT-4 which are well-known, but the authors are from an academic lab without high brand recognition."
    521     }
    522   }
    523 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs