scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27367B)
      1 {
      2   "paper": {
      3     "title": "Assessing the Latent Automated Program Repair Capabilities of Large Language Models using Round-Trip Translation",
      4     "authors": [
      5       "Fernando Vallecillos Ruiz",
      6       "Anastasiia Grishina",
      7       "Max Hort",
      8       "Leon Moonen"
      9     ],
     10     "year": 2024,
     11     "venue": "ACM Transactions on Software Engineering and Methodology (TOSEM)",
     12     "arxiv_id": "2401.07994",
     13     "doi": "10.1145/3771922"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The paper provides a Zenodo replication package: 'The replication package is available for download from Zenodo: https://doi.org/10.5281/zenodo.10500593' (footnote 1, Section 1). The code for RTT and results are released."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "All four benchmarks (QuixBugs, HumanEval-Java, Defects4J v1.2, Defects4J v2.0) are publicly available. The paper also releases their manual assessment of over 5,000 patches alongside the replication package (Section 5.2.2)."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Section 4.9 specifies hardware: '3 NVIDIA V100 GPUs or 2 NVIDIA A100 GPUs' and '32-Core AMD EPYC 7601 CPU with 2TB RAM.' Model versions and hyperparameters are in Tables 1 and 2. Specific model URLs are provided in footnotes 4-10. However, no requirements.txt or Dockerfile is mentioned in the paper itself; the environment details are described in prose and via model links."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The paper provides a Zenodo replication package (footnote 1) and states 'We release the code for RTT and results obtained to ensure replication and verification of our work.' The pipeline is described in detail in Sections 4.3-4.6 with exact prompts, hyperparameters, and postprocessing steps."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper reports standard deviations (e.g., Table 3 and Table 4 with '± STD') but does not report confidence intervals or error bars. Standard deviation is reported as a spread measure but no formal confidence intervals are constructed."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper makes comparative claims (e.g., 'a strong correlation is observed,' 'larger models consistently provide better results') and reports Pearson's r correlations but does not perform statistical significance tests (no p-values, no hypothesis tests) for any comparisons between models or approaches."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "The paper reports absolute counts of bugs fixed with baseline context (e.g., 'plausible patches for 100 of 164 bugs with GPT-4,' '97 were found to be correct,' '46 bugs that were missed by LLMs specifically fine-tuned for APR'). Tables 3-7 provide full breakdowns. Pearson's r correlation values are reported (0.60, 0.77, 0.22, 0.15)."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No justification is given for the choice of 10 runs, the selection of 9 models, or the use of these specific 4 benchmarks beyond stating they are 'diverse' and 'widely used.' No power analysis is discussed."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Standard deviation is reported across 10 runs for all open-source models in Tables 3, 4, and 7 (e.g., 'SantaCoder: 31.4 ± 2.0'). The paper also reports Any Run and Every Run aggregations to show variance."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The paper compares against prior work by Jiang et al. [25], including base models, fine-tuned models, and DL-based APR techniques (CURE, RewardRepair, Recoder, KNOD) in Tables 5 and 6. Figure 4 directly compares GPT-4 RTT against 10 LLMs from prior work."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The baselines from Jiang et al. [25] are from ICSE 2023 and include models like InCoder-6.7B and fine-tuned variants. The paper also discusses related concurrent work using GPT-3.5 with 200 patches [74]. The comparison is against reasonably recent APR methods."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper systematically varies multiple components: intermediate representation (PL vs NL, Section 5.1 vs 5.2), temperature settings (Section 5.3, Table 8), model sizes (Tables 3-4), and specific intermediate programming languages (C#, C++, Python). These variations function as ablations of the RTT pipeline."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Section 4.7 lists several metrics, including compilability, plausibility, test pass rate, exact match, BLEU, and CodeBLEU. Results are reported across multiple metrics in Sections 5.1-5.4."
     85       },
     86       "human_evaluation": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Section 4.8 describes manual assessment of correctness for HumanEval-Java patches. Table 7 reports manually verified correct fixes. Over 5,000 patches were manually assessed. Section 5.5 provides qualitative analysis of generated patches."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "The benchmarks have fixed test suites. HumanEval-Java was specifically constructed after training of most models to mitigate data leakage (Section 4.2). The paper explicitly discusses this: 'HumanEval-Java was not available during training of most the LLMs used in this study.'"
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Results are broken down per benchmark (QuixBugs, HumanEval-Java, Defects4J v1.2, v2.0), per model, per intermediate language, and per temperature setting. Tables 3-8 and Figures 3-8 provide detailed breakdowns."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Section 5.5 provides detailed qualitative analysis of failure cases: misleading method names, common logical problems (e.g., CORRECT_PARENTHESIS generating checks for multiple bracket types), linked list handling failures, repetitive token generation by smaller models. Section 5.4.6 discusses limitations."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper reports that RTT through PL yields very low plausibility rates (Table 3, at most 5 bugs). The paper explicitly states 'RTT does not outperform state-of-the-art NMT and cloze-style fine-tuned models for APR' (Section 8). Section 5.4.6 discusses RTT diluting coding style."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract claims 'plausible patches for 100 of 164 bugs with GPT-4 on HumanEval-Java' (Table 4), '97 are found to be correct' (Table 7), and '46 bugs that were missed by LLMs specifically fine-tuned for APR' (Section 5.2.1). The abstract also notes limitations, consistent with Sections 5.4.6 and 8."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The paper's causal claim is that RTT can fix bugs via 'regression toward the mean' (Section 3.1). This is framed as a hypothesis and tested empirically. The ablation-style experiments (varying intermediate languages, temperatures, model sizes) provide controlled evidence. The paper is appropriately cautious, stating 'we hypothesize' rather than making strong causal assertions."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Section 6 (Threats to Validity) explicitly bounds generalization: 'we focused our evaluation on single-hunk bugs, therefore, effectiveness may not transfer to more complex multi-hunk or multi-file bugs.' The paper notes results are limited to Java and to nine transformer-based models. External validity threats are specifically discussed."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "Section 6 discusses data leakage as an alternative explanation ('there is a risk that they were used during training'), mitigated by HumanEval-Java and low exact match rates. The paper also discusses that test-suite-based plausibility may overfit to tests (construct validity). Section 5.5 discusses data leakage for specific examples (e.g., is_prime method possibly memorized)."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Footnote 3 specifies exact API model versions: 'gpt-3.5-turbo-1106, gpt-4-0613, and gpt-4o-mini-2024-07-18.' Open-source models are specified by name and parameter count (e.g., 'PLBART-base (140M)', 'InCoder 1.3B and 6.7B', 'StarCoderBase 15.5B') with links to specific model pages in footnotes 4-10."
    139       },
    140       "prompts_provided": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Section 4.5 provides exact prompt text for both GPT models (system message and user prompts for forward and backward translation) and open-source models (infilling prompt templates with actual format). The prompts include the actual text sent to models."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Table 2 reports all hyperparameters: number of beams, temperature for both legs, and top-p for all nine models. Section 4.4 explains the rationale for each setting. Section 5.3 provides a temperature sweep analysis."
    149       },
    150       "scaffolding_described": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "The RTT pipeline is not agentic scaffolding — it is a two-step translation pipeline without feedback loops, memory, or tool use. The pipeline is fully described but does not constitute agentic scaffolding."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 4.3 describes the four-step pipeline (preprocessing, translation, postprocessing, evaluation). Section 4.6 details postprocessing for method signature overwriting (Figure 2). The preprocessing includes extracting buggy functions, adding prefixes/suffixes/masks, and removing newlines (Step 1)."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section 6 ('Threats to Validity') provides a dedicated, substantive discussion of four threat categories: internal, construct, external, and conclusion validity, structured following Wohlin et al. [70]. Section 5.4.6 also discusses limitations of RTT for APR."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The threats are specific to this study: data leakage risk for specific benchmarks, plausibility vs. correctness distinction, single-hunk bug limitation, GPT models run only once due to seed unavailability, manual correctness assessment limited to HumanEval-Java due to resource constraints (Section 4.8), and cost constraints for GPT-4 temperature sweep."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "The paper explicitly states: 'we focused our evaluation on single-hunk bugs, therefore, effectiveness may not transfer to more complex multi-hunk or multi-file bugs' (Section 6). It states RTT was only applied to Java. It notes 'we applied the approach using only nine transformer-based models' and that extending requires models meeting RTT requirements (Section 6)."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The Zenodo replication package (https://doi.org/10.5281/zenodo.10500593) contains the code and results. The manual assessment of over 5,000 patches is released (Section 5.2.2). Results were also reported to Weights & Biases (Section 4.7)."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "The benchmarks are well-established and described in Section 4.2 (QuixBugs: 40 bugs, HumanEval-Java: 164 bugs, Defects4J v1.2: 130 single-hunk bugs, Defects4J v2.0: 89 single-hunk bugs). The generation procedure (5 forward translations x 5 backward translations = 25 candidate patches per bug, 10 runs with different seeds) is described in Section 4.3."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": false,
    191         "answer": false,
    192         "justification": "No human participants were recruited. The manual assessment was done by the paper's authors (reviewers examining patches). Standard benchmarks were used rather than recruited data."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The full pipeline is documented in Section 4.3 (four steps: preprocessing, round-trip translation, postprocessing, evaluation). Section 4.6 describes how the testability of generated patches is ensured. The pipeline handles edge cases (context window overflow for one bug with StarCoderBase, Section 4.6)."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "The Acknowledgments section lists funding: Research Council of Norway (secureIT project, IKTPLUSS #288787), European Union Horizon Europe Marie Sklodowska-Curie Actions (#101151798), eX3 infrastructure (#270053), and Sigma2/LUMI supercomputer access."
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "All four authors are affiliated with Simula Research Laboratory, Norway. Affiliations are listed on the first page. No author is affiliated with any of the model providers being evaluated (OpenAI, Meta, etc.)."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Funding comes from public research councils (Research Council of Norway, EU Horizon Europe) that have no financial stake in whether RTT-based program repair succeeds or fails."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No competing interests statement or financial interests declaration is present in the paper."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "The paper does not state explicit training data cutoff dates for the models. It notes that 'HumanEval-Java was not available during training of most the LLMs used in this study' and that 'HumanEval-Java was constructed after the training of any of the models used in this work (except GPT-4o-mini)' (Section 6), but no specific cutoff dates are provided."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "Section 6 discusses data leakage: 'there is a risk that they were used during training.' The paper uses HumanEval-Java specifically to mitigate this. Section 5.1 notes only 0.03% exact match rate across generated patches. Section 5.5 discusses specific cases where data leakage may explain model behavior (is_prime method)."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": true,
    235         "answer": true,
    236         "justification": "The paper explicitly addresses this: HumanEval-Java was constructed after the training of all models used in the study except GPT-4o-mini. For older benchmarks, the paper notes that only '0.03% of the generated candidate patches' are exact matches (Section 6). The qualitative analysis in Section 5.5 examines whether models memorized benchmarks, concluding they did not copy-paste answers."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants in the study. The manual patch assessment was performed by the paper's authors as expert reviewers, not as human subjects research."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants. The study evaluates LLMs on code benchmarks."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in the study."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in the study."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in the study."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in the study."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in the study."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": true,
    279         "answer": true,
    280         "justification": "Footnote 13 states: 'The approximate cost for the single run using GPT-4 was ~140USD.' Section 5.3 notes GPT-4 was not included in the temperature sweep because 'it is significantly more expensive than any other model.'"
    281       },
    282       "compute_budget_stated": {
    283         "applies": true,
    284         "answer": true,
    285         "justification": "Section 4.9 specifies hardware: '3 NVIDIA V100 GPUs or 2 NVIDIA A100 GPUs' for patch generation and '32-Core AMD EPYC 7601 CPU with 2TB RAM' for test validation. LUMI supercomputer access is acknowledged. However, total GPU hours or total API spend across all experiments are not explicitly stated."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "RTT through English generates plausible patches for 100 of 164 bugs with GPT-4 on HumanEval-Java, and 97 are correct in manual assessment.",
    292       "evidence": "Table 4 shows GPT-4 achieves 100 plausible patches on HumanEval-Java in a single run. Table 7 shows 97 manually verified correct patches. Section 5.2.2 describes the manual assessment methodology.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "RTT uniquely generates plausible patches for 46 bugs not fixed by LLMs fine-tuned for APR.",
    297       "evidence": "Section 5.2.1, Figure 4, and Table 6 show the comparison. The 46 unique bugs are from the union across all RTT models and benchmarks compared to results from Jiang et al. [25].",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "There is a strong correlation between model size and plausibility rate (Pearson's r = 0.77).",
    302       "evidence": "Section 5.2 reports Pearson's r = 0.77 for model size vs plausibility rate and r = 0.60 for model size vs compilable patches, excluding models with undisclosed sizes (GPT models).",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "RTT does not outperform state-of-the-art NMT and cloze-style fine-tuned models for APR overall.",
    307       "evidence": "Table 5 shows fine-tuned InCoder-6.7B achieves 56/38/24/70 plausible patches on the four benchmarks, while RTT with the same model achieves 10/8/14/41 (Table 4, Any Run). Section 8 explicitly states this limitation.",
    308       "supported": "strong"
    309     },
    310     {
    311       "claim": "Temperature affects RTT repair performance, with different optimal temperatures for different models.",
    312       "evidence": "Table 8 shows GPT-3.5 peaks at T=1.0 (130 total plausible patches) while GPT-4o-mini peaks at T=0.6 (138 total). Section 5.3 provides detailed analysis.",
    313       "supported": "strong"
    314     },
    315     {
    316       "claim": "The paper hypothesizes that RTT fixes bugs through regression toward the mean, where LLMs replace infrequent buggy patterns with frequent bug-free patterns.",
    317       "evidence": "Section 3.1 presents the theoretical motivation based on software naturalness (Ray et al. [54]). The empirical results support the plausibility of this hypothesis but do not definitively prove causation. CodeBLEU analysis (Section 5.4.4) shows patches regress toward similar distributions.",
    318       "supported": "moderate"
    319     }
    320   ],
    321   "methodology_tags": [
    322     "benchmark-eval"
    323   ],
    324   "key_findings": "The paper presents the first systematic evaluation of round-trip translation (RTT) with LLMs for automated program repair, testing 9 LLMs across 4 Java benchmarks. GPT-4 using natural language as an intermediate representation achieves 100 plausible patches (97 correct) on 164 HumanEval-Java bugs, and RTT uniquely fixes 46 bugs missed by fine-tuned APR models. RTT through programming languages yields very low fix rates, indicating the intermediate representation must differ sufficiently from the source. While RTT does not outperform state-of-the-art fine-tuned approaches overall, it offers complementary bug-fixing capabilities at no fine-tuning cost.",
    325   "red_flags": [
    326     {
    327       "flag": "No statistical significance tests",
    328       "detail": "The paper reports Pearson correlations and compares models based on raw counts but never performs statistical significance tests. Claims like 'a strong correlation is observed' and comparative statements between models lack formal statistical backing."
    329     },
    330     {
    331       "flag": "GPT models run only once",
    332       "detail": "GPT-3.5, GPT-4, and GPT-4o-mini were each run only once (due to no seed control at the time and cost constraints), while open-source models were run 10 times. This makes the GPT results less reliable as single-run outcomes, yet GPT-4 is the paper's top-performing model."
    333     },
    334     {
    335       "flag": "Manual assessment limited to one benchmark",
    336       "detail": "Correctness was manually verified only for HumanEval-Java. For QuixBugs and Defects4J, only plausibility (test pass rate) is reported, which the paper acknowledges 'has inherent limitations, such as ignoring potential overfitting to test code' (Section 6)."
    337     }
    338   ],
    339   "cited_papers": [
    340     {
    341       "title": "Impact of Code Language Models on Automated Program Repair",
    342       "authors": ["Nan Jiang", "Kevin Liu", "Thibaud Lutellier", "Lin Tan"],
    343       "year": 2023,
    344       "doi": "10.1109/icse48619.2023.00125",
    345       "relevance": "Primary comparison baseline; evaluated LLMs for APR on the same four benchmarks with NMT and cloze-style approaches."
    346     },
    347     {
    348       "title": "Evaluating Large Language Models Trained on Code",
    349       "authors": ["Mark Chen", "Jerry Tworek"],
    350       "year": 2021,
    351       "arxiv_id": "2107.03374",
    352       "relevance": "Introduced HumanEval benchmark (basis for HumanEval-Java used in this study) and Codex for code generation."
    353     },
    354     {
    355       "title": "Automated Program Repair in the Era of Large Pre-Trained Language Models",
    356       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    357       "year": 2023,
    358       "doi": "10.1109/icse48619.2023.00129",
    359       "relevance": "Evaluated LLMs for APR showing that generating more patches improves results; relevant comparison for RTT sampling strategy."
    360     },
    361     {
    362       "title": "Keep the Conversation Going: Fixing 162 out of 337 Bugs for $0.42 Each Using ChatGPT",
    363       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    364       "year": 2023,
    365       "arxiv_id": "2304.00385",
    366       "relevance": "Demonstrated conversational APR with ChatGPT achieving high fix rates; relevant comparison for RTT's cost-effectiveness."
    367     },
    368     {
    369       "title": "Less Training, More Repairing Please: Revisiting Automated Program Repair via Zero-Shot Learning",
    370       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    371       "year": 2022,
    372       "doi": "10.1145/3540250.3549101",
    373       "relevance": "Explored zero-shot LLMs for cloze-style APR; related approach that RTT builds upon conceptually."
    374     },
    375     {
    376       "title": "Fully Autonomous Programming Using Iterative Multi-Agent Debugging with Large Language Models",
    377       "authors": ["Anastasiia Grishina", "Vadim Liventsev", "Aki Härmä", "Leon Moonen"],
    378       "year": 2025,
    379       "doi": "10.1145/3719351",
    380       "relevance": "Iterative multi-agent debugging framework from the same group; represents agentic approach to program repair that RTT could complement."
    381     },
    382     {
    383       "title": "Reflexion: An Autonomous Agent with Dynamic Memory and Self-Reflection",
    384       "authors": ["Noah Shinn", "Beck Labash", "Ashwin Gopinath"],
    385       "year": 2023,
    386       "arxiv_id": "2303.11366",
    387       "relevance": "Self-reflection framework for LLM agents; relevant to discussion of iterative debugging approaches that RTT could integrate with."
    388     },
    389     {
    390       "title": "Teaching Large Language Models to Self-Debug",
    391       "authors": ["Xinyun Chen", "Maxwell Lin", "Nathanael Schärli", "Denny Zhou"],
    392       "year": 2023,
    393       "arxiv_id": "2304.05128",
    394       "relevance": "LLM self-debugging approach relevant to understanding RTT's place in the broader landscape of iterative repair methods."
    395     },
    396     {
    397       "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions",
    398       "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan", "Brendan Dolan-Gavitt", "Ramesh Karri"],
    399       "year": 2022,
    400       "doi": "10.1109/sp46214.2022.9833571",
    401       "relevance": "Assessed security vulnerabilities in LLM-generated code; relevant to RTT's limitation of potentially introducing security flaws."
    402     },
    403     {
    404       "title": "GPT-4 Technical Report",
    405       "authors": ["OpenAI"],
    406       "year": 2023,
    407       "arxiv_id": "2303.08774",
    408       "relevance": "Technical report for GPT-4, the best-performing model in the RTT evaluation."
    409     },
    410     {
    411       "title": "StarCoder: May the Source Be with You!",
    412       "authors": ["Raymond Li", "Loubna Ben Allal"],
    413       "year": 2023,
    414       "arxiv_id": "2305.06161",
    415       "relevance": "Open-source code LLM used in the RTT evaluation; relevant to understanding open LLMs for code tasks."
    416     },
    417     {
    418       "title": "A Survey of Learning-based Automated Program Repair",
    419       "authors": ["Quanjun Zhang", "Chunrong Fang", "Yuxiang Ma", "Weisong Sun", "Zhenyu Chen"],
    420       "year": 2023,
    421       "arxiv_id": "2301.03270",
    422       "relevance": "Survey of learning-based APR methods providing context for where RTT fits in the APR landscape."
    423     }
    424   ]
    425 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs