scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26903B)
      1 {
      2   "paper": {
      3     "title": "The Art of Repair: Optimizing Iterative Program Repair with Instruction-Tuned Models",
      4     "authors": [
      5       "Fernando Vallecillos Ruiz",
      6       "Max Hort",
      7       "Leon Moonen"
      8     ],
      9     "year": 2025,
     10     "venue": "EASE 2025 (29th International Conference on Evaluation and Assessment in Software Engineering)",
     11     "arxiv_id": "2505.02931",
     12     "doi": "10.5281/zenodo.15294695"
     13   },
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Section 7 (Data Availability) states: 'The replicability package for this work is available online' with a Zenodo DOI link (https://doi.org/10.5281/zenodo.15294695). The package includes source code, generated patches, and fine-tuned models."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The benchmarks used (HumanEval-Java and Defects4J) are publicly available. Additionally, the replication package includes generated patches for both benchmarks by all models. The fine-tuning data comes from a public source (Zhu et al. 2021, ref [58])."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions using the LUMI supercomputer and eX3 infrastructure but does not provide a requirements.txt, Dockerfile, conda environment, or detailed library versions. No environment specification section is present in the paper text."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Section 7 states the replication package includes 'the source code required to replicate the experiments presented in this work.' The paper also provides detailed descriptions of the pipeline, prompt templates, strategies, and evaluation metrics in Section 3, which together with the replication package constitute reproduction instructions."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Results in Tables 1 and 2 and Figure 4 report only point estimates (number of problems with plausible patches). No confidence intervals, error bars, or uncertainty measures are provided."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper makes comparative claims (e.g., '78% improvement', 'decreased from 107... to 100') but does not use any statistical significance tests. All comparisons are based on raw number differences."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports percentage improvements with baseline context. For example, 'the performance of CodeLlama increased the number of fixed problems to 107 (78% improvement)' from a base of 60. Tables 1 and 2 provide absolute numbers with baselines, and Figure 4 shows differences relative to Strategy A."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification is given for the benchmark sizes (164 bugs in HumanEval-Java, 217 in Defects4J). No power analysis or discussion of whether these sample sizes are adequate for the claims made."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The paper uses beam search (deterministic decoding), so there is only one run per configuration. No variance, standard deviation, or spread measures are reported. While deterministic decoding means identical re-runs would yield the same results, there is no assessment of sensitivity to hyperparameter choices."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper includes base (non-fine-tuned) models as baselines and compares FFT and LoRA fine-tuning approaches. Strategy A (non-iterative) serves as the baseline for iterative strategies. Results are compared across multiple model configurations."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The models used (Llama 3.1, DeepSeek-Coder, CodeLlama) are contemporary. The paper compares against and discusses related work including RepairLLaMA (2024), MORepair (2024), and Li et al. (2024). The baselines are recent and relevant."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper systematically varies: (1) fine-tuning technique (FFT vs LoRA vs base), (2) dataset size (1K, 30K, 65K), and (3) generation strategy (7 strategies varying iterations and outputs). These variations function as ablations showing the contribution of each factor."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The paper uses a single primary metric: 'number of problems for which at least one plausible patch is generated.' While position analysis and manual correctness verification add depth, these are not separate evaluation metrics in the traditional sense."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 3.6 reports: 'we manually inspected 3,298 plausible patches. Of these, 3,167 were confirmed to be correct, while 131 were found to be overfitting to the test suite.' This constitutes human evaluation of the system's outputs."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The evaluation benchmarks (HumanEval-Java and Defects4J) are separate from the fine-tuning data (GitHub commits from Zhu et al. 2021). The paper explicitly discusses data leakage risk and uses HumanEval-Java specifically because of its recency to reduce contamination risk."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by model (3 models), fine-tuning approach (base, FFT, LoRA), dataset size (1K, 30K, 65K), benchmark (HumanEval-Java, Defects4J), and strategy (A through G). Tables 1 and 2 and Figure 4 provide detailed per-configuration results."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses where approaches fail: overfitting with larger datasets, diminishing returns with too many iterations for fine-tuned models, and the reduction in iterative capability after fine-tuning. Section 3.6 reports 131 patches overfitting to tests. The Venn diagrams (Figure 3) show problems only solvable by specific strategies."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Several negative results are reported: (1) increasing dataset size beyond 1K sometimes decreases performance (Finding 2); (2) fine-tuned models show reduced effectiveness with iterations (Finding 6); (3) Strategy G dropped for Defects4J due to excessive resource demands with diminishing returns; (4) 131 of 3,298 inspected patches were overfitting."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims 'improvements of up to 78% in the number of plausible patches' — supported by Table 1 (CodeLlama from 60 to 107). Claims about diminishing returns with larger datasets are supported by Tables 1-2. Claims about iterative benefits for base models are supported by Figure 4 and RQ2 results."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper makes causal claims through controlled single-variable manipulations: varying dataset size while holding other factors constant, varying strategy while holding model constant, etc. The ablation-style design (systematically varying one factor at a time) provides adequate support for claims like 'fine-tuning reduces iterative capability.'"
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper tests only on Java with two benchmarks but states in the conclusion: 'This work does not only advance the field of APR, but it also provides practical guidance for the deployment of LLMs in complex pipelines.' The threats section says 'The insights in this paper should generalize to arbitrary programming languages,' which is an unbounded generalization claim from Java-only experiments."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper discusses multiple alternative explanations for the performance decline with larger fine-tuning datasets (Section 4.1.1): data quality, overfitting, and limited model capacity, systematically ruling some out. For fine-tuned models' reduced iterative ability, it discusses over-specialization vs. reduced zero-shot flexibility as competing explanations."
    131       }
    132     },
    133     "setup_transparency": {
    134       "model_versions_specified": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The paper specifies exact model variants with HuggingFace links: 'meta-llama/Llama-3.1-8B-Instruct' (footnote 1), 'meta-llama/CodeLlama-7b-Instruct-hf' (footnote 3), 'deepseek-ai/deepseek-coder-6.7b-instruct' (footnote 4). These are specific, versioned model identifiers."
    138       },
    139       "prompts_provided": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 3.4 provides the exact prompt templates used: the initial repair prompt, the failed-test feedback prompt, and the compilation-error feedback prompt. The templates include the actual text with placeholder variables clearly defined."
    143       },
    144       "hyperparameters_reported": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper mentions using 'beam-based search decoding strategy without stochastic sampling' but does not report specific hyperparameters such as beam width, temperature, max tokens, learning rate for fine-tuning, number of training epochs, batch size, or LoRA rank/alpha. The threats section mentions 'standardized hyperparameters' but does not list them."
    148       },
    149       "scaffolding_described": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 3.4 describes the iterative pipeline in detail: how inputs are formatted with bug delimiters, how outputs are parsed, the validation phase (plausible/wrong/timeout/uncompilable), feedback extraction (failed test code or compilation errors), and how feedback is incorporated into subsequent iterations. Section 3.5 details the seven generation strategies."
    153       },
    154       "data_preprocessing_documented": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section 3.3 describes the fine-tuning data source (143,666 samples from GitHub Java projects, ref [58]), the subset creation (1K, 30K, 65K samples), and the benchmark selection (217 single-hunk bugs from Defects4J v2.0, 164 bugs from HumanEval-Java). Section 3.4 explains how inputs are formatted with bug delimiters."
    158       }
    159     },
    160     "limitations_and_scope": {
    161       "limitations_section_present": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 5 is a dedicated 'Threats to validity' section discussing internal, external, and construct validity threats in substantive detail."
    165       },
    166       "threats_to_validity_specific": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The threats section discusses specific issues: data leakage from Defects4J into pretraining data (mitigated by using HumanEval-Java), hyperparameter selection bias, limitation to Java benchmarks, plausibility vs. correctness gap (with 131/3,298 overfitting patches quantified), and the stopping criterion of 10 patches."
    170       },
    171       "scope_boundaries_stated": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "While the threats section mentions Java-only benchmarks, the paper does not explicitly state what the results do NOT show. The conclusion makes broad claims ('provides practical guidance for the deployment of LLMs in complex pipelines') without explicitly bounding the scope. No explicit list of what was not tested or what claims are not being made."
    175       }
    176     },
    177     "data_integrity": {
    178       "raw_data_available": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 7 states the replication package includes 'the generated patches for both benchmarks by all models used.' Section 3.6 confirms that 'all generated patch files, the code including the seed used to randomly sample these patches, and manual assessments are released in our replication package.'"
    182       },
    183       "data_collection_described": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 3.3 describes the fine-tuning data source (Zhu et al. 2021, 143,666 single-hunk fixes from GitHub Java projects) and the benchmarks (Defects4J v2.0 with 217 single-hunk bugs, HumanEval-Java with 164 bugs). The data sources and their characteristics are clearly documented."
    187       },
    188       "recruitment_methods_described": {
    189         "applies": false,
    190         "answer": false,
    191         "justification": "No human participants were recruited. The study uses publicly available benchmarks and models. NA."
    192       },
    193       "data_pipeline_documented": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 3.4 documents the full pipeline from input (buggy functions with delimiters) through prompting, output parsing, validation (compile/test), feedback extraction, and iterative refinement. The pipeline steps are described in sufficient detail with specific templates and outcome categories."
    197       }
    198     },
    199     "conflicts_of_interest": {
    200       "funding_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The Acknowledgments section lists funding from the Research Council of Norway (secureIT project, IKTPLUSS #288787), the European Union Horizon Europe Marie Sklodowska-Curie Actions (#101151798), and computing resources from eX3 (#270053) and LUMI supercomputer via Sigma2."
    204       },
    205       "affiliations_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "All three authors are affiliated with Simula Research Laboratory, Oslo, Norway, as stated in the paper header. They are not evaluating their own company's products."
    209       },
    210       "funder_independent_of_outcome": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Funding comes from the Research Council of Norway and EU Horizon Europe, which are public funding bodies with no financial stake in the outcome of APR research. The evaluated models (Llama, CodeLlama, DeepSeek-Coder) are not from the authors' institution or funders."
    214       },
    215       "financial_interests_declared": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No competing interests or financial interests statement is present in the paper. The absence of such a disclosure is noted; there is no section addressing potential conflicts."
    219       }
    220     },
    221     "contamination": {
    222       "training_cutoff_stated": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "The paper does not state the training data cutoff dates for any of the three models used (Llama 3.1, CodeLlama, DeepSeek-Coder). This is relevant because Defects4J has been publicly available since 2014."
    226       },
    227       "train_test_overlap_discussed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "Section 5 explicitly discusses this: 'One key internal threat is the potential data leakage from the benchmarks into the pretraining data of the models, specially for older widely-known benchmarks like Defects4J.' They mitigate by using HumanEval-Java and cite Ramos et al. (2025) on LLM memorization of bug benchmarks."
    231       },
    232       "benchmark_contamination_addressed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "The paper explicitly addresses contamination: HumanEval-Java is used because of 'its recency, it reduces the risk of data leakage in the pre-training' (Section 3.3). The threats section discusses this risk for Defects4J and cites work showing Llama3.1 has 'less susceptibility to memorization' (ref [62])."
    236       }
    237     },
    238     "human_studies": {
    239       "pre_registered": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in the study. The manual inspection of patches is a verification step, not a human subjects study."
    243       },
    244       "irb_or_ethics_approval": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in the study."
    248       },
    249       "demographics_reported": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in the study."
    253       },
    254       "inclusion_exclusion_criteria": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study."
    258       },
    259       "randomization_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "blinding_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       },
    269       "attrition_reported": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in the study."
    273       }
    274     },
    275     "cost_and_practicality": {
    276       "inference_cost_reported": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "The paper discusses GPU memory requirements qualitatively ('up to 200GB of GPU memory after repeated iterations') but does not report inference cost, wall-clock time, tokens consumed, or cost per bug. This is notable since the paper motivates developer-centric efficiency."
    280       },
    281       "compute_budget_stated": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "The paper mentions using the LUMI supercomputer and eX3 infrastructure but does not state total GPU hours, training time, or total computational budget for the experiments."
    285       }
    286     }
    287   },
    288   "claims": [
    289     {
    290       "claim": "Fine-tuning with only 1K samples can improve plausible patch generation by up to 78% over base models.",
    291       "evidence": "Table 1 shows CodeLlama FFT(1K) generates plausible patches for 107 problems vs. 60 for the base model on HumanEval-Java (Section 4.1.1).",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Increasing fine-tuning dataset size beyond certain thresholds leads to diminishing or declining performance, likely due to overfitting.",
    296       "evidence": "Tables 1-2 show CodeLlama FFT decreasing from 107 (1K) to 100 (65K) on HumanEval-Java, and DeepSeek-Coder from 129 (1K) to 122 (65K). Similar patterns on Defects4J (Section 4.1.1).",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Base models consistently benefit from iterative feedback strategies while fine-tuned models show reduced effectiveness with iterations.",
    301       "evidence": "Figure 4 shows base models gaining 4 to 46 problems with iterative strategies, while fine-tuned models lose up to 34 problems with high iteration counts (Section 4.2).",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "There are unique problems that only iterative base models can solve and that fine-tuned models cannot.",
    306       "evidence": "Figure 3 Venn diagrams show 10 unique problems (12% of solutions) on HumanEval-Java and 26 unique problems (19%) on Defects4J solvable only by iterative base models (Section 4.2.1).",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "Complex problems benefit more from iterative refinement than simpler problems.",
    311       "evidence": "Defects4J (more complex) shows larger gains from iterative strategies than HumanEval-Java. For example, Llama3.1 Base goes from 28 (Strategy A) to 74 (Strategy F) on Defects4J — a 164% increase (Section 4.2.2).",
    312       "supported": "strong"
    313     },
    314     {
    315       "claim": "The insights generalize to arbitrary programming languages.",
    316       "evidence": "Section 5 (Threats to validity) asserts this but experiments are conducted only on Java benchmarks. No cross-language evaluation is provided.",
    317       "supported": "weak"
    318     }
    319   ],
    320   "methodology_tags": [
    321     "benchmark-eval"
    322   ],
    323   "key_findings": "The paper shows that fine-tuning instruction-tuned LLMs with as few as 1K samples can improve automatic program repair by up to 78%, but larger datasets often yield diminishing returns due to overfitting. Base models benefit substantially from iterative feedback strategies, while fine-tuned models perform best with fewer iterations on simpler tasks. Complex benchmarks (Defects4J) amplify the benefits of iterative refinement. The study identifies model-specific optimal strategies and reveals that some problems are uniquely solvable only through iterative base-model approaches.",
    324   "red_flags": [
    325     {
    326       "flag": "No statistical significance tests",
    327       "detail": "All comparative claims are made by comparing raw numbers without statistical tests. Although beam search decoding is deterministic, other sources of variation remain in the pipeline (e.g., the selection of subsets for fine-tuning and benchmark characteristics), so the absence of significance tests makes it hard to distinguish signal from noise in the differences reported."
    328     },
    329     {
    330       "flag": "No uncertainty quantification",
    331       "detail": "Results are single-point estimates with no confidence intervals or error bars. The use of deterministic beam search reduces but does not eliminate concerns — different random seeds for fine-tuning data subsets could yield different results."
    332     },
    333     {
    334       "flag": "Missing hyperparameter details",
    335       "detail": "Critical hyperparameters like beam width, learning rate, number of training epochs, batch size, and LoRA rank/alpha are not reported in the paper. The threats section mentions 'standardized hyperparameters' but the actual values are absent."
    336     },
    337     {
    338       "flag": "Unbounded generalization claims",
    339       "detail": "The paper claims insights 'should generalize to arbitrary programming languages' and provides 'practical guidance for the deployment of LLMs in complex pipelines' despite testing only on Java with two benchmarks."
    340     }
    341   ],
    342   "cited_papers": [
    343     {
    344       "title": "Automated Program Repair in the Era of Large Pre-Trained Language Models",
    345       "authors": ["C. S. Xia", "Y. Wei", "L. Zhang"],
    346       "year": 2023,
    347       "relevance": "Foundational LLM-based APR study at ICSE, directly relevant to evaluating LLM capabilities for code repair."
    348     },
    349     {
    350       "title": "RepairAgent: An Autonomous, LLM-Based Agent for Program Repair",
    351       "authors": ["I. Bouzenia", "P. Devanbu", "M. Pradel"],
    352       "year": 2024,
    353       "arxiv_id": "2403.17134",
    354       "relevance": "Agentic APR approach using LLMs with tool integration, directly relevant to agentic AI for software engineering."
    355     },
    356     {
    357       "title": "RepairLLaMA: Efficient Representations and Fine-Tuned Adapters for Program Repair",
    358       "authors": ["A. Silva", "S. Fang", "M. Monperrus"],
    359       "year": 2024,
    360       "arxiv_id": "2312.15698",
    361       "relevance": "LoRA-based fine-tuning for APR showing PEFT outperforming full fine-tuning, relevant to LLM adaptation for code tasks."
    362     },
    363     {
    364       "title": "A Comprehensive Evaluation of Parameter-Efficient Fine-Tuning on Automated Program Repair",
    365       "authors": ["G. Li", "C. Zhi", "J. Chen", "J. Han", "S. Deng"],
    366       "year": 2024,
    367       "arxiv_id": "2406.05639",
    368       "relevance": "Evaluates PEFT techniques for APR, finding diminishing returns with dataset size — directly comparable to this paper's findings."
    369     },
    370     {
    371       "title": "Multi-Objective Fine-Tuning for Enhanced Program Repair with LLMs",
    372       "authors": ["B. Yang", "H. Tian", "J. Ren"],
    373       "year": 2024,
    374       "arxiv_id": "2404.12636",
    375       "relevance": "MORepair: multi-objective fine-tuning of instruction LLMs for APR, a direct competitor approach."
    376     },
    377     {
    378       "title": "Self-Refine: Iterative Refinement with Self-Feedback",
    379       "authors": ["A. Madaan"],
    380       "year": 2023,
    381       "relevance": "Foundational work on LLM self-refinement through iterative feedback, central to the iterative repair paradigm studied here."
    382     },
    383     {
    384       "title": "Conversational Automated Program Repair",
    385       "authors": ["C. S. Xia", "L. Zhang"],
    386       "year": 2023,
    387       "arxiv_id": "2301.13246",
    388       "relevance": "Pioneered conversational APR with LLMs using test feedback, directly motivating this paper's iterative approach."
    389     },
    390     {
    391       "title": "Keep the Conversation Going: Fixing 162 out of 337 Bugs for $0.42 Each Using ChatGPT",
    392       "authors": ["C. S. Xia", "L. Zhang"],
    393       "year": 2023,
    394       "arxiv_id": "2304.00385",
    395       "relevance": "Reports cost-effective conversational APR with ChatGPT, relevant to both LLM capability evaluation and cost-effectiveness."
    396     },
    397     {
    398       "title": "CigaR: Cost-efficient Program Repair with LLMs",
    399       "authors": ["D. Hidvégi", "K. Etemadi", "S. Bobadilla", "M. Monperrus"],
    400       "year": 2024,
    401       "arxiv_id": "2402.06598",
    402       "relevance": "Cost-efficient LLM-based APR, relevant to practical deployment constraints of AI programming tools."
    403     },
    404     {
    405       "title": "Are Large Language Models Memorizing Bug Benchmarks?",
    406       "authors": ["D. Ramos", "C. Mamede", "K. Jain"],
    407       "year": 2025,
    408       "arxiv_id": "2411.13323",
    409       "relevance": "Directly addresses benchmark contamination in LLM-based APR, a key methodological concern for code generation evaluations."
    410     },
    411     {
    412       "title": "Teaching Large Language Models to Self-Debug",
    413       "authors": ["X. Chen", "M. Lin", "N. Schärli", "D. Zhou"],
    414       "year": 2023,
    415       "arxiv_id": "2304.05128",
    416       "relevance": "Self-debugging paradigm for LLMs using execution feedback, foundational for iterative code repair approaches."
    417     },
    418     {
    419       "title": "RLEF: Grounding Code LLMs in Execution Feedback with Reinforcement Learning",
    420       "authors": ["J. Gehring", "K. Zheng", "J. Copet"],
    421       "year": 2024,
    422       "relevance": "Uses reinforcement learning with execution feedback to train code LLMs, relevant to training approaches for code generation."
    423     }
    424   ]
    425 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs