scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30595B)
      1 {
      2   "paper": {
      3     "title": "Enhancing Automated Program Repair through Fine-tuning and Prompt Engineering",
      4     "authors": [
      5       "Rishov Paul",
      6       "Md. Mohib Hossain",
      7       "Mohammed Latif Siddiq",
      8       "Masum Hasan",
      9       "Anindya Iqbal",
     10       "Joanna C. S. Santos"
     11     ],
     12     "year": 2023,
     13     "venue": "arXiv",
     14     "arxiv_id": "2304.07840"
     15   },
     16   "scan_version": 3,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "Fine-tuned PLBART and CodeT5 models significantly outperform prior baselines on two code review-based program repair datasets, with CodeT5 achieving up to 25.65% improvement in Top-5 accuracy on the Tufano dataset. Zero-shot and few-shot prompting with GPT-3.5-Turbo and Code-DaVinci-Edit-001 shows competitive but mixed results, with heuristic post-processing substantially improving accuracy. Manual developer analysis reveals that even the best models fulfill code review intentions in only ~53-59% of cases, indicating language models are not yet practical for automated program repair.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "A Zenodo replication package is provided at https://doi.org/10.5281/zenodo.8122636, described as containing 'all the scripts used to gather the data and results.'"
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Both datasets used (Tufano et al. and Review4Repair) are publicly available from prior work, and the replication package on Zenodo includes the processed data."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper mentions using an 'NVIDIA GeForce RTX 2070-8GB GPU' but does not provide library versions, requirements.txt, Dockerfile, or other dependency specifications needed to recreate the environment."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper does not include step-by-step reproduction instructions. The Zenodo replication package contains scripts, but the paper itself provides no README-like instructions or 'Reproducing Results' section."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "All results in Tables II and III are reported as point estimates (e.g., '25.28%', '29.82%') with no confidence intervals or error bars."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper claims models 'outperform' baselines and reports improvements like '+10.23%' but never applies statistical significance tests (no p-values, t-tests, or other tests)."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Table II reports absolute improvements over baselines (e.g., '+5.69', '+10.23') alongside both the baseline and model absolute values, providing sufficient context to assess effect magnitude."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "For RQ3 (manual analysis), the paper justifies sample sizes: 'we randomly collected 314 test samples from Tufano et al. and 340 test samples from Review4Repair datasets in order to achieve a 95% confidence interval and 5% error of margin.'"
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No variance, standard deviation, or results across multiple runs are reported. All results appear to be from single experimental runs."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The paper compares against R4R CC (Review4Repair baseline) and Tufano 2-encoder (Tufano et al. baseline), both from the respective prior works."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The baselines are from the original dataset papers (Review4Repair 2022 and Tufano et al. 2021), which are the most relevant and recent prior work on these specific datasets."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "No ablation study is conducted. The paper compares different models but does not ablate components within any approach (e.g., removing pre-training, removing code review input) to isolate their contributions."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The paper uses five metrics: Top-1 Accuracy, Top-5 Accuracy, Top-10 Accuracy (exact match at different beam sizes), BLEU-4, and CodeBLEU."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "RQ3 involves two software developers manually scoring repaired code from all five models based on whether the generated fix fulfills the intention of the code review, with Cohen's Kappa for inter-rater agreement."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Both datasets use explicit train/validation/test splits. The Tufano dataset has 13,756/1,719/1,719 splits. The Review4Repair dataset was reorganized to 53,198/2,956/2,955 splits."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Figure 2 provides per-category breakdowns across three fix categories (Insert, Delete, Update) for both datasets, showing each model's performance in each category."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section V.A discusses specific failure observations from developers: vague reviews ('nice'), context-dependent reviews ('check my previous comment'), and ground truth misalignment with focus scope. Section III-B5 details common failure patterns in LLM outputs."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper reports several negative findings: few-shot prompting does not consistently outperform zero-shot (Table III, Review4Repair after heuristics), LLM accuracy is 'not satisfactory' overall, and manual analysis shows models fulfill review intentions in only ~40-59% of cases."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract claims pre-trained models 'notably outperformed each of the previous models' (supported by Table II) and that 'the practical application of using LLMs in the context of automated program repair is still a long way off' (supported by RQ3 results showing <60% review fulfillment)."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The conclusion states 'this boost in accuracy is due to mostly the learned parameters of the model rather than the architecture itself,' a causal claim about the source of improvement. No ablation separating architecture from pre-trained weights is conducted to support this. The comparison conflates model architecture, pre-training corpus, and fine-tuning effects."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The title 'Enhancing Automated Program Repair through Fine-tuning and Prompt Engineering' is broad, but results are limited to Java code from two code review datasets. The abstract discusses 'automated program repair' without bounding to Java or code review-based repair. The threats to validity section acknowledges the single-language limitation but the title and abstract do not."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Section VI discusses data leakage as an alternative explanation for GPT-3.5 performance: 'One possible reason might be that these LLMs were also trained with our aforementioned datasets.' It also discusses hyperparameter sensitivity and dataset quality issues (vague reviews) as confounding factors."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper explicitly distinguishes between automated metrics (exact match, BLEU, CodeBLEU) and actual review fulfillment (RQ3 manual analysis), acknowledging that exact match may miss valid alternative fixes. RQ3 was specifically designed to address this proxy gap."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "GPT-3.5-Turbo is referenced without a version/snapshot date (e.g., 'gpt-3.5-turbo-0301'). Code-DaVinci-Edit-001 is a specific model name. PLBART and CodeT5 reference their original papers but not specific checkpoint versions."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Listing 2 provides the full prompt structure for both zero-shot and few-shot prompting, including the actual command text ('Refactor the Buggy Code using the Review without comments') and system role content ('You are a coding assistant. You generate only the source code.'). Fill values come from public datasets."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Temperature=0, top_p=1, frequency_penalty=0, presence_penalty=0 for LLMs. For fine-tuning: beam sizes (1, 5, 10), epochs (11, 12, 45), batch size=4, accumulated gradient steps=8, input length=512, target length=200, patience=10."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. The paper evaluates models via direct fine-tuning and single-turn prompting."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section III-A1 describes preprocessing in detail: concatenation of buggy code and review with special tokens, removal of extra spaces and newlines, handling of samples exceeding 512 tokens (57+6 removed), and reorganization of Review4Repair into 90/5/5 splits."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section VI 'Threats to Validity' is a dedicated section discussing internal and external validity threats."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "The paper discusses specific threats: hyperparameter search space limitations for this specific architecture, confinement to Java as the only programming language, and potential data leakage in GPT-3.5-Turbo due to training cutoff dates relative to dataset publication."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The paper explicitly states 'the datasets consisted of only Java codes and respective code reviews in the English language; hence, our focus was confined to a single programming language' and notes that 'the coverage of our findings is limited.'"
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "Both datasets are publicly available from prior work (Tufano et al. and Review4Repair), and the Zenodo replication package provides the processed data and scripts."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section III-A1 describes both datasets: Tufano et al. contains 17,194 samples from Gerrit and GitHub, Review4Repair contains 56,211 training and 2,961 test samples from Gerrit. Preprocessing steps are detailed."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "For RQ3, two developers are described as having 'one year of industry experience in a Fortune 500 company' but no details on how they were recruited or selected, or whether this introduces bias."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The pipeline from raw datasets through preprocessing (tokenization, length filtering, split reorganization), model input creation, output generation, heuristic post-processing, and evaluation is documented across Sections III-A and III-B, including counts of removed samples (57+6 over 512 tokens)."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding source is mentioned anywhere in the paper. There is no acknowledgments section listing grants or sponsors."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are listed: BUET (Bangladesh), University of Notre Dame, and University of Rochester. None of the authors are affiliated with companies whose products are being evaluated (OpenAI)."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No funding is disclosed, so independence of funders cannot be assessed. The absence of a funding statement does not confirm the work is unfunded."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests statement or financial interest disclosure is present in the paper. Two authors list IQVIA email addresses but this affiliation is not discussed in relation to the research."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": true,
    236         "justification": "The paper states 'The knowledge cut-off of these two models is September 2021' for GPT-3.5-Turbo and Code-DaVinci-Edit-001 in Section VI."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": true,
    241         "justification": "Section VI discusses potential overlap: 'One possible reason might be that these LLMs were also trained with our aforementioned datasets. As a result, there might be a data leakage.' They note the Tufano dataset was published before September 2021 while Review4Repair was published after."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": true,
    246         "justification": "The paper acknowledges contamination risk: the Tufano dataset was published before the models' training cutoff and could be in training data. They note 'As these models are black-box, there is no way we can verify if there is data leakage for these datasets.'"
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "No pre-registration is mentioned for the manual analysis study (RQ3)."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": true,
    257         "answer": false,
    258         "justification": "No IRB or ethics board approval is mentioned for the developer evaluation study."
    259       },
    260       "demographics_reported": {
    261         "applies": true,
    262         "answer": true,
    263         "justification": "The two developers are described as having 'one year of industry experience in a Fortune 500 company and significant involvement in the code review process in software development.'"
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": true,
    267         "answer": false,
    268         "justification": "No inclusion or exclusion criteria are stated for selecting the two developer evaluators. No screening process is described."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "This is a rating/evaluation task, not an experimental study with conditions assigned to participants. Randomization of participant assignment to conditions is not applicable."
    274       },
    275       "blinding_described": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "The paper does not describe whether the two developers were blinded to which model produced each repair. Knowing the model source could introduce bias in their ratings."
    279       },
    280       "attrition_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "Attrition is not explicitly reported. While results imply both raters completed all evaluations, there is no explicit statement of how many started vs. finished."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "No API costs are reported for GPT-3.5-Turbo or Code-DaVinci-Edit-001 usage, despite making API calls for all test samples across multiple prompting strategies."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "The paper mentions using an 'NVIDIA GeForce RTX 2070-8GB GPU' but does not state GPU hours, total training time, or total API spend."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No mention of multiple random seeds. Results appear to be from single training runs for the fine-tuned models."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "The number of experimental runs is not stated. It appears each model was trained once per configuration. For API calls, temperature=0 makes output largely (though not strictly) deterministic, but this is not explicitly discussed as justification for single runs."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "The paper mentions experimenting with 'various numbers of epochs' and different beam sizes but does not report the total number of configurations tried or the compute spent on hyperparameter search."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": true,
    317         "justification": "Epoch selection is justified via validation performance: 'we used 11 epochs because we found that the model's performance remains unchanged after 11 epochs' with patience=10. CodeT5 was fine-tuned for 45 epochs 'based on observing validation losses.'"
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "Many comparisons are made (multiple models × 2 datasets × multiple metrics) but no correction for multiple comparisons is applied. No statistical tests are used at all."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors fine-tune their own models and compare against baseline results from other papers without acknowledging the systematic bias of evaluating one's own system."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "The paper does not compare performance relative to compute budget. Fine-tuning PLBART/CodeT5 and calling OpenAI APIs have very different compute costs, but this is not discussed."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "The paper does not discuss whether exact match on these specific code review datasets actually measures automated program repair capability, or whether the datasets are representative of real-world repair scenarios."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "No scaffolding is involved. Models are evaluated via direct fine-tuning or single-turn prompting."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": true,
    349         "justification": "Section VI discusses temporal leakage: the Tufano et al. dataset was published before the models' September 2021 training cutoff, meaning solutions could be in training data. The Review4Repair dataset was published after the cutoff."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether the evaluation setup provides information that would not be available in real-world usage (e.g., the <START>/<END> tags marking the exact buggy location)."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No discussion of whether train and test examples share structural similarities (e.g., from the same repositories, same authors, or near-duplicate code patterns)."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "The paper acknowledges leakage risk but explicitly states 'As these models are black-box, there is no way we can verify if there is data leakage' without applying any detection method (canary strings, n-gram overlap, membership inference)."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "Fine-tuned PLBART and CodeT5 significantly outperform baseline models on both code review-based APR datasets.",
    371       "evidence": "Table II shows CodeT5 achieves +10.23% Top-1 accuracy on Review4Repair and +21.12% on Tufano et al. PLBART achieves +5.69% and +20.82% respectively.",
    372       "supported": "moderate"
    373     },
    374     {
    375       "claim": "CodeT5 generally outperforms PLBART across most metrics and fix categories.",
    376       "evidence": "Table II and Figure 2 show CodeT5 achieves higher accuracy in Insert and Update categories on both datasets, and higher Top-1 accuracy overall.",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "The accuracy boost comes primarily from pre-trained weights rather than architecture.",
    381       "evidence": "The conclusion claims this but no ablation separating architecture from pre-training is conducted. The evidence is the observation that both models (different architectures) improve over baselines.",
    382       "supported": "weak"
    383     },
    384     {
    385       "claim": "Heuristic post-processing substantially improves zero-shot GPT-3.5-Turbo accuracy.",
    386       "evidence": "Table III shows accuracy improves from 6.9% to 22.06% on Review4Repair and from 17.86% to 30.13% on Tufano et al. after applying heuristics.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Code-DaVinci-Edit-001 achieves state-of-the-art on some metrics without fine-tuning.",
    391       "evidence": "Table III shows it achieves the best CodeBLEU (88.42%) on Review4Repair and best accuracy (40.70%) and CodeBLEU (88.63%) on Tufano et al.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Language models are not yet capable of practically fulfilling code review intentions in automated repair.",
    396       "evidence": "Table IV (RQ3) shows the best models fulfill review intentions in only 52.65% (Review4Repair, CodeT5) and 58.92% (Tufano, Code-DaVinci-Edit-001) of cases.",
    397       "supported": "moderate"
    398     }
    399   ],
    400   "red_flags": [
    401     {
    402       "flag": "No statistical significance tests",
    403       "detail": "All comparative claims ('outperforms', 'significant improvement') are based on comparing raw accuracy numbers without any statistical significance tests, despite multiple models and datasets being compared."
    404     },
    405     {
    406       "flag": "Very small evaluator pool for manual analysis",
    407       "detail": "Only two developers evaluated the repaired code for RQ3. While Cohen's Kappa is reported (0.51-0.68, i.e., moderate to substantial agreement on the Landis-Koch scale), two raters is minimal for a manual evaluation study. No blinding is described."
    408     },
    409     {
    410       "flag": "Single-run results without variance",
    411       "detail": "All fine-tuning results appear to be from single experimental runs with no variance or seed sensitivity analysis. Neural network training can produce substantially different results across random seeds."
    412     },
    413     {
    414       "flag": "Acknowledged but unmitigated data leakage risk",
    415       "detail": "The paper acknowledges that GPT-3.5-Turbo may have been trained on the Tufano dataset (published before its September 2021 cutoff) but does not apply any detection or mitigation method, making the GPT results potentially inflated."
    416     },
    417     {
    418       "flag": "Causal claim without adequate evidence",
    419       "detail": "The paper claims the performance boost is 'due to mostly the learned parameters of the model rather than the architecture itself' without conducting an ablation study that isolates pre-trained weights from architectural differences."
    420     }
    421   ],
    422   "cited_papers": [
    423     {
    424       "title": "Unified pre-training for program understanding and generation",
    425       "authors": ["W. Ahmad", "S. Chakraborty", "B. Ray", "K.-W. Chang"],
    426       "year": 2021,
    427       "relevance": "PLBART model used in this study; pre-trained model for code understanding and generation tasks."
    428     },
    429     {
    430       "title": "CodeT5: Identifier-aware unified pre-trained encoder-decoder models for code understanding and generation",
    431       "authors": ["Y. Wang", "W. Wang", "S. Joty", "S. C. Hoi"],
    432       "year": 2021,
    433       "relevance": "CodeT5 model used in this study; pre-trained encoder-decoder for code tasks."
    434     },
    435     {
    436       "title": "Review4Repair: Code review aided automatic program repairing",
    437       "authors": ["F. Huq", "M. Hasan", "M. M. A. Haque", "S. Mahbub", "A. Iqbal", "T. Ahmed"],
    438       "year": 2022,
    439       "relevance": "One of the two datasets and baseline models used; demonstrates code review improves APR."
    440     },
    441     {
    442       "title": "Towards automating code review activities",
    443       "authors": ["R. Tufano", "L. Pascarella", "M. Tufano", "D. Poshyvanyk", "G. Bavota"],
    444       "year": 2021,
    445       "doi": "10.1109/ICSE43902.2021.00027",
    446       "relevance": "One of the two datasets and baseline models used; demonstrated transformer models for code review-based repair."
    447     },
    448     {
    449       "title": "Evaluating large language models trained on code",
    450       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    451       "year": 2021,
    452       "arxiv_id": "2107.03374",
    453       "relevance": "Codex paper; Code-DaVinci-Edit-001 used in this study derives from Codex."
    454     },
    455     {
    456       "title": "Language models are few-shot learners",
    457       "authors": ["T. B. Brown", "B. Mann", "N. Ryder"],
    458       "year": 2020,
    459       "arxiv_id": "2005.14165",
    460       "relevance": "GPT-3 paper; GPT-3.5-Turbo used in this study derives from GPT-3 family."
    461     },
    462     {
    463       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    464       "authors": ["Z. Feng", "D. Guo", "D. Tang"],
    465       "year": 2020,
    466       "relevance": "Pre-trained code model; relevant to understanding pre-trained model capabilities for code tasks."
    467     },
    468     {
    469       "title": "SequenceR: Sequence-to-sequence learning for end-to-end program repair",
    470       "authors": ["Z. Chen", "S. Kommrusch", "M. Tufano"],
    471       "year": 2021,
    472       "relevance": "Sequence-to-sequence approach for automated program repair; directly relevant to APR methodology."
    473     },
    474     {
    475       "title": "CoCoNut: Combining context-aware neural translation models using ensemble for program repair",
    476       "authors": ["T. Lutellier", "V. H. Pham", "L. Pang", "Y. Li", "M. Wei", "L. Tan"],
    477       "year": 2020,
    478       "relevance": "Neural machine translation approach for APR using ensemble methods."
    479     },
    480     {
    481       "title": "Asleep at the keyboard? Assessing the security of GitHub Copilot's code contributions",
    482       "authors": ["H. Pearce", "B. Ahmad", "B. Tan", "B. Dolan-Gavitt", "R. Karri"],
    483       "year": 2022,
    484       "doi": "10.1109/SP46214.2022.00057",
    485       "relevance": "Security evaluation of LLM-generated code; relevant to understanding quality and safety of AI code generation."
    486     },
    487     {
    488       "title": "GitHub Copilot AI pair programmer: Asset or liability?",
    489       "authors": ["A. Moradi Dakhel", "V. Majdinasab", "A. Nikanjam", "F. Khomh"],
    490       "year": 2023,
    491       "relevance": "Evaluates correctness of Copilot-generated code; relevant to LLM code generation quality assessment."
    492     },
    493     {
    494       "title": "Exploring the effectiveness of large language models in generating unit tests",
    495       "authors": ["M. L. Siddiq", "J. C. S. Santos", "R. H. Tanvir"],
    496       "year": 2023,
    497       "relevance": "Uses zero-shot prompting with GPT-3.5-Turbo for code generation tasks; directly relevant methodology."
    498     },
    499     {
    500       "title": "Extracting training data from large language models",
    501       "authors": ["N. Carlini", "F. Tramèr", "E. Wallace"],
    502       "year": 2020,
    503       "relevance": "Data leakage and training data extraction from LLMs; relevant to contamination concerns in LLM evaluation."
    504     }
    505   ],
    506   "engagement_factors": {
    507     "practical_relevance": {
    508       "score": 2,
    509       "justification": "Provides concrete fine-tuning and prompt engineering recipes for code repair with publicly available tools and datasets."
    510     },
    511     "surprise_contrarian": {
    512       "score": 1,
    513       "justification": "The finding that even the best models still fail to fulfill review intentions in roughly 41-47% of cases is mildly surprising given the hype around LLMs for code tasks."
    514     },
    515     "fear_safety": {
    516       "score": 0,
    517       "justification": "No safety, security, or AI risk concerns raised."
    518     },
    519     "drama_conflict": {
    520       "score": 0,
    521       "justification": "No controversy or conflict with established claims."
    522     },
    523     "demo_ability": {
    524       "score": 1,
    525       "justification": "Zenodo replication package with scripts is available, but no live demo or pip-installable tool."
    526     },
    527     "brand_recognition": {
    528       "score": 1,
    529       "justification": "Uses GPT-3.5-Turbo (recognizable) but the authors and institutions are not widely known."
    530     }
    531   }
    532 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs