scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27729B)
      1 {
      2   "paper": {
      3     "title": "Fixing Rust Compilation Errors using LLMs",
      4     "authors": ["Pantazis Deligiannis", "Akash Lal", "Nikita Mehrotra", "Aseem Rastogi"],
      5     "year": 2023,
      6     "venue": "ICSE 2025",
      7     "arxiv_id": "2308.05177",
      8     "doi": "10.1109/ICSE55347.2025.00022"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": false,
     17         "justification": "The paper says 'We plan to open-source both our dataset as well as the implementation of RustAssistant' (Section 1) and 'We plan to release our dataset to enable further research' (Section 8). These are promises of future release, not actual releases."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "Same as above — 'We plan to release our dataset' is a promise, not a release. No download link or repository URL is provided."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No environment specifications, requirements files, or dependency versions are listed. The Rust compiler version (1.67.1) is mentioned for benchmark creation but not as a reproducibility specification."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions are provided. The algorithm is described at a high level but no commands, scripts, or README instructions are included."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "All results are reported as point estimates (e.g., '92.59%', '73.63%') with no confidence intervals or error bars."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper claims GPT-4 'performs better than GPT-3.5' and compares prompt variants, but no statistical significance tests are reported."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Results are reported with baseline context — e.g., GPT-3.5 at 52.96% vs GPT-4 at 92.22% on micro-benchmarks (Table 1), and absolute numbers (143/270 vs 249/270). The Clippy comparison shows '2.4x more errors' fixed. Sufficient context to judge magnitude."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No justification is given for the sample sizes (270 micro-benchmarks, 50 SO questions, 182 commits). The 50 SO questions are described as 'the first 50 most relevant questions' with no power analysis."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be single-run numbers, even though generation is stochastic (temperature=0.2, top_p=1, N=5 completions), so run-to-run variation is possible."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "GPT-3.5 serves as a baseline against GPT-4. For Clippy errors, Clippy's own auto-fix is used as a baseline (Table 6). Prompt ablation variants (P0-P4) provide internal baselines."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "GPT-3.5 and GPT-4 were state-of-the-art LLMs at the time of writing (August 2023). Clippy auto-fix is the standard tool baseline."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Section 5 RQ2 presents ablation studies: five prompt variants (P0-P4, Table 5), varying number of completions (N=1 vs N=5), and toggling error grouping on/off (Table 4)."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Multiple metrics are used: Fix% (programs fixed), plus breakdown into Format errors, Build errors, and Test failures (Tables 1-2). For top-100 crates: both Commits% and Errors% (Table 3)."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "RQ4 (Section 5) involves qualitative manual examination of all 134 fixed commits from the top-100 crates, categorized into 4 semantic correctness categories (Table 7). The paper states 'a structured consensus-based manual evaluation involving multiple evaluators' (Section 6)."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "The three datasets (micro-benchmarks, SO, top-100 crates) are separate evaluation sets. Micro-benchmarks and SO have independent unit tests. No tuning was done on the test data — the prompt was fixed across evaluations."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Figure 5 shows fix rate broken down by error category (Syntax, Type, Generics, Traits, Ownership, Lifetime). Table 6 breaks Clippy results by category (Complexity, Pedantic, Style). Table 7 categorizes fix quality."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 5 discusses specific failure modes: format errors, build errors, test failures. Concrete examples are given (e.g., the f64 bitwise operator example where GPT-4 uses wrong constant). Loop/stuck behavior and .toml editing limitations are discussed."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "P0 (basic prompt) achieves only ~10% accuracy (Table 4). Turning off error grouping significantly drops performance. GPT-4 N=5 on SO (72%) is slightly worse than N=1 (74%), an unexpected negative result. The paper reports cases where LLM gets stuck in fix/undo loops."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The abstract claims 'peak accuracy of roughly 74% on real-world compilation errors' which matches Table 3 (73.63% commits fixed). The claim of 'high accuracy' is supported by the micro-benchmark result of 92.59%."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Causal claims like 'the changelog format improves accuracy' are supported by controlled ablation studies (P0 vs P4 in Tables 4-5) with single-variable manipulation. The claim that 'error grouping helps' is supported by toggling it on/off."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The paper bounds its scope: 'We focus on fixing code-related issues' (Section 2.1), explicitly excludes .toml configuration changes and unsafe code. Results are presented separately for each dataset. The title says 'Rust Compilation Errors' not 'all software bugs'."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Section 6 (Threats to Validity) discusses data contamination, the subjectivity of semantic correctness evaluation, and construct validity of handwritten test cases. The paper considers whether GPT-4's performance could be due to memorization."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper clearly distinguishes what it measures: compilation success (objective) vs semantic correctness (subjective). Section 2.1 explicitly states 'The former is an objective criterion while the latter is subjective.' RQ4 specifically addresses the gap between passing the compiler and producing correct fixes."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper says 'GPT-3.5-turbo (300B parameters)' and 'GPT-4' without specific version snapshots (e.g., gpt-4-0613). No API version dates are provided."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Figure 4 provides the full RustAssistant prompt template with all sections: preamble, error context, instructions, and detailed changelog format instructions with examples. The actual template text is reproduced verbatim."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Section 5 reports: frequency_penalty=0, presence_penalty=0, top_p=1, temperature=0.2, maximum output length=800 tokens."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Algorithm 1 provides the full RustAssistant pipeline: compilation → error extraction → prompt construction → LLM invocation → changelog parsing → patch application → recompilation loop. Error grouping, termination heuristics, and best-completion selection are all described in detail (Section 3)."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 4 documents dataset construction: micro-benchmarks cover 270/506 error codes with explicit exclusion criteria, SO questions filtered for relevance with stated criteria, top-100 crates commits identified by cloning and building locally with out-of-scope error filtering."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 6 'Threats to Validity' provides a dedicated, substantive discussion of internal and external threats."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 6 discusses specific threats: data contamination for the specific repositories used, the subjective nature of semantic correctness evaluation mitigated by consensus-based multi-evaluator review, construct validity of handwritten tests, and the limitation that Clippy evaluation only checks clippy passing (not test suite)."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 2.1 explicitly states scope boundaries: only .rs files (not .toml), no unsafe code, no foreign function interop errors. Section 6 acknowledges generalizability may vary across different datasets."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The dataset is not released. Only aggregate results are shown. The promise to release is not fulfilled in the paper."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 4 describes collection in detail: micro-benchmarks written per error code referencing the Rust catalog, SO questions manually scraped with filtering criteria, top-100 crates from crates.io with commit history examination and local compilation."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants. Data sources are public repositories, Stack Overflow, and compiler error codes."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The pipeline from raw data to analysis is documented: for SO, filtering criteria are stated; for top-100 crates, commits are cloned, built, filtered for in-scope errors (182 found); for micro-benchmarks, 270/506 error codes selected with exclusion criteria stated. Numbers at each stage are provided."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding disclosure or acknowledgments section is present in the paper."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "All four authors are listed as Microsoft Research employees with their locations (Redmond and Bengaluru)."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "All authors are Microsoft Research employees. Microsoft has commercial interest in AI-powered developer tools (GitHub Copilot). The paper evaluates OpenAI models (GPT-3.5, GPT-4), and Microsoft is a major investor in OpenAI. The funder is not independent of the outcome."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement is present. Microsoft's investment in OpenAI and its commercial developer tool products (GitHub Copilot, VS Code) create potential financial interests that are not declared."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No training data cutoff dates are stated for GPT-3.5 or GPT-4. The paper acknowledges contamination risk in Section 6 but does not state when training data was collected."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "Section 6 explicitly discusses this: 'it might be possible that fixes to the compilation issues that we mined from open source might have already been included in the training data.' They argue the fixes 'were never presented online in the form of a fix or alongside the corresponding compiler error' though the fixed code may appear in later repository versions."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "While the paper discusses contamination conceptually (Section 6), it does not use any concrete detection method. The SO questions and top-100 crate commits were publicly available before GPT-4's training. The paper acknowledges 'There is no ideal way to completely remove contamination' but does not apply temporal splits or detection methods."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants. The study evaluates LLM performance on code benchmarks."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No API costs, token counts, or per-example costs are reported despite the tool making multiple LLM calls per error (up to 15 iterations shown in Figures 6-7) with N=5 completions each."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No total API spend, token consumption, or wall-clock time for the evaluation is reported."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Temperature is set to 0.2 (not 0) and top_p=1, so outputs are not fully deterministic. No results across multiple seeds or runs are reported."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "It is not stated how many times the full evaluation was run. Results appear to be from a single run."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The hyperparameters (temperature=0.2, max_tokens=800, window=±50 lines) appear chosen but no search budget or justification for these values is provided."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "The prompt ablation study (P0-P4 in Table 5) systematically varies features and reports all configurations. The N=1 vs N=5 variation is also reported for all datasets. Selection of best prompt is transparent."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No statistical tests are performed at all, let alone corrections for multiple comparisons across the many comparisons made (2 models × 2 N values × 3 datasets × 5 prompt variants)."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The authors evaluate their own system (RustAssistant) and do not acknowledge potential bias in their implementation of the pipeline or their manual evaluation of fix quality."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "GPT-4 with N=5 uses roughly 5x the API calls of N=1, but no cost-performance tradeoff analysis is presented. The iterations (Figures 6-7) show compute scaling but not cost."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "The paper discusses what its benchmarks test: micro-benchmarks test isolated error fixing, SO tests harder real-world patterns, and top-100 crates test multi-error real-world scenarios. Section 2.1 distinguishes compilation correctness (objective) from semantic correctness (subjective)."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": true,
    333         "answer": true,
    334         "justification": "The scaffold (RustAssistant pipeline) is fixed across model comparisons (GPT-3.5 vs GPT-4). The ablation study explicitly varies scaffold components (error grouping, prompt format) as independent variables."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "The SO questions and open-source crate commits predate GPT-4's training. The paper acknowledges contamination risk but does not analyze temporal overlap between benchmark creation dates and model training periods."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "The tool provides compiler error messages (including suggestions like 'help: consider annotating with...') directly to the LLM. This is by design, but the paper does not discuss whether this constitutes feature leakage relative to a real developer scenario."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether the micro-benchmarks or SO examples overlap with or are similar to code in the LLMs' training data beyond the general contamination acknowledgment."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No concrete leakage detection method is used. The paper only discusses contamination conceptually and notes 'There is no ideal way to completely remove contamination.'"
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "RustAssistant achieves 92.59% fix rate on micro-benchmarks with GPT-4 (N=5)",
    363       "evidence": "Table 1: 250/270 micro-benchmarks fixed. Unit tests verify semantic correctness.",
    364       "supported": "strong"
    365     },
    366     {
    367       "claim": "RustAssistant achieves 73.63% commit fix rate on top-100 Rust crates with GPT-4 (N=5)",
    368       "evidence": "Table 3: 134/182 commits fixed, 846/925 individual errors fixed (91.46%).",
    369       "supported": "moderate"
    370     },
    371     {
    372       "claim": "GPT-4 significantly outperforms GPT-3.5 for fixing Rust compilation errors",
    373       "evidence": "Tables 1-3 consistently show GPT-4 outperforming GPT-3.5 across all datasets (e.g., 92.59% vs 73.70% on micro-benchmarks, 73.63% vs 35.71% on top-100 crates).",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "The changelog prompt format is critical, raising accuracy from ~10% to ~74%",
    378       "evidence": "Table 4 shows P0 (basic) at 9.89% vs P4 (full changelog) at roughly 74%. Table 5 shows a comparable gap: P1 at 10.74% vs P4 at 73.70% on micro-benchmarks with GPT-3.5.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "RustAssistant fixes 2.4x more Clippy errors than Clippy's own auto-fix",
    383       "evidence": "Table 6: 259/346 (74.86%) vs 109/346 (31.50%).",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Most GPT-4 fixes on real-world code are semantically correct",
    388       "evidence": "Table 7: Of 134 fixed commits, only 9 have different runtime behavior. 55 are unambiguous, 41 match developer fix, 29 are non-matching but same behavior.",
    389       "supported": "moderate"
    390     }
    391   ],
    392   "methodology_tags": ["benchmark-eval"],
    393   "key_findings": "RustAssistant, an LLM-based tool for fixing Rust compilation errors, achieves 92.59% on micro-benchmarks and 73.63% on real-world commits from top-100 Rust crates using GPT-4. The changelog prompt format is critical — accuracy drops to ~10% without it. Ablation studies show each prompt component (line prefixes, localization, fix description) contributes incrementally. Manual inspection of 134 fixed commits found only 9 with potentially altered runtime behavior.",
    394   "red_flags": [
    395     {
    396       "flag": "Company evaluating commercially-aligned technology",
    397       "detail": "All authors are Microsoft Research employees. Microsoft is a major investor in OpenAI (whose GPT-3.5/GPT-4 are evaluated) and sells AI-powered developer tools (GitHub Copilot). This conflict is not acknowledged."
    398     },
    399     {
    400       "flag": "No statistical tests on any comparison",
    401       "detail": "Claims of superiority (GPT-4 vs GPT-3.5, prompt variants) are based purely on comparing point estimates with no significance tests, confidence intervals, or variance reporting."
    402     },
    403     {
    404       "flag": "Single-run results with stochastic components",
    405       "detail": "Temperature=0.2 and N=5 completions introduce randomness, but results appear to be from a single run with no reproducibility analysis."
    406     },
    407     {
    408       "flag": "Unreleased artifacts despite promises",
    409       "detail": "The paper promises to release both the dataset and tool implementation, but as of the paper text, neither is released. No repository URL is provided."
    410     },
    411     {
    412       "flag": "No cost reporting despite expensive pipeline",
    413       "detail": "The tool may call GPT-4 dozens of times per error (up to 15 iterations × 5 completions = 75 API calls per error) but no cost analysis is provided."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Automated Repair of Programs from Large Language Models",
    419       "authors": ["Zhiyu Fan", "Xiang Gao", "Martin Mirchev", "Abhik Roychoudhury", "Shin Hwei Tan"],
    420       "year": 2023,
    421       "doi": "10.1109/ICSE48619.2023.00128",
    422       "relevance": "Evaluates LLMs (Codex) for automated program repair on Java defects, directly comparable approach."
    423     },
    424     {
    425       "title": "Practical Program Repair in the Era of Large Pre-trained Language Models",
    426       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    427       "year": 2022,
    428       "arxiv_id": "2210.14179",
    429       "relevance": "Extensive study of 9 pre-trained language models for APR across three languages, establishes that larger models perform better."
    430     },
    431     {
    432       "title": "Automated Program Repair in the Era of Large Pre-trained Language Models",
    433       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    434       "year": 2023,
    435       "doi": "10.1109/ICSE48619.2023.00129",
    436       "relevance": "Experiments with GPT-3 series, CodeT5, InCoder on Defects4J and other benchmarks for LLM-based APR."
    437     },
    438     {
    439       "title": "InferFix: End-to-End Program Repair with LLMs",
    440       "authors": ["Matthew Jin", "Syed Shahriar", "Michele Tufano", "Xin Shi", "Shuai Lu", "Neel Sundaresan", "Alexey Svyatkovskiy"],
    441       "year": 2023,
    442       "arxiv_id": "2303.07263",
    443       "relevance": "Uses fine-tuned LLM to fix static analysis errors (CodeQL), related approach of using formal tool feedback with LLMs."
    444     },
    445     {
    446       "title": "Examining Zero-Shot Vulnerability Repair with Large Language Models",
    447       "authors": ["Hammond Pearce", "Benjamin Tan", "Baleegh Ahmad", "Ramesh Karri", "Brendan Dolan-Gavitt"],
    448       "year": 2023,
    449       "doi": "10.1109/SP46215.2023.10179420",
    450       "relevance": "Large-scale study of LLMs for zero-shot cybersecurity bug repair, evaluates multiple models."
    451     },
    452     {
    453       "title": "Can OpenAI's Codex Fix Bugs?: An evaluation on QuixBugs",
    454       "authors": ["Julian Aron Prenner", "Hlib Babii", "Romain Robbes"],
    455       "year": 2022,
    456       "doi": "10.1145/3524459.3527351",
    457       "relevance": "Early evaluation of Codex for bug fixing on Java and Python, establishes LLM capability for APR."
    458     },
    459     {
    460       "title": "Repair Is Nearly Generation: Multilingual Program Repair with LLMs",
    461       "authors": ["Harshit Joshi", "José Pablo Cambronero Sánchez", "Sumit Gulwani", "Vu Le", "Ivan Radicek", "Gust Verbruggen"],
    462       "year": 2022,
    463       "arxiv_id": "2208.11640",
    464       "relevance": "RING system uses retrieval-augmented few-shot prompting to fix syntactic errors in multiple languages."
    465     },
    466     {
    467       "title": "Revisiting the Plastic Surgery Hypothesis via Large Language Models",
    468       "authors": ["Chunqiu Steven Xia", "Yifeng Ding", "Lingming Zhang"],
    469       "year": 2023,
    470       "arxiv_id": "2303.10494",
    471       "relevance": "FitRepair combines LLMs with plastic surgery hypothesis and fine-tuning strategies for APR."
    472     },
    473     {
    474       "title": "GPT-4 Technical Report",
    475       "authors": ["OpenAI"],
    476       "year": 2023,
    477       "arxiv_id": "2303.08774",
    478       "relevance": "Technical report for GPT-4, one of the two LLMs evaluated in this study."
    479     },
    480     {
    481       "title": "Retrieval-Based Prompt Selection for Code-Related Few-Shot Learning",
    482       "authors": ["Noor Nashid", "Mifta Sintaha", "Ali Mesbah"],
    483       "year": 2023,
    484       "doi": "10.1109/ICSE48619.2023.00205",
    485       "relevance": "Retrieval-based prompt engineering for code tasks, relevant to the prompting techniques used in RustAssistant."
    486     },
    487     {
    488       "title": "DeepFix: Fixing Common C Language Errors by Deep Learning",
    489       "authors": ["Rahul Gupta", "Soham Pal", "Aditya Kanade", "Shirish K. Shevade"],
    490       "year": 2017,
    491       "relevance": "Early deep learning approach to fixing compilation errors, predecessor to LLM-based APR."
    492     }
    493   ]
    494 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs