scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (35259B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Enhancing Automated Program Repair through Fine-tuning and Prompt Engineering",
      6     "authors": [
      7       "Rishov Paul",
      8       "Md. Mohib Hossain",
      9       "Mohammed Latif Siddiq",
     10       "Masum Hasan",
     11       "Anindya Iqbal",
     12       "Joanna C. S. Santos"
     13     ],
     14     "year": 2023,
     15     "venue": "arXiv",
     16     "arxiv_id": "2304.07840",
     17     "doi": null
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract claims pre-trained models 'notably outperformed each of the previous models' (supported by Table II) and that 'the practical application of using LLMs in the context of automated program repair is still a long way off' (supported by RQ3 results showing <60% review fulfillment).",
     25         "source": "opus"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The conclusion states 'this boost in accuracy is due to mostly the learned parameters of the model rather than the architecture itself,' a causal claim about the source of improvement. No ablation separating architecture from pre-trained weights is conducted to support this. The comparison conflates model architecture, pre-training corpus, and fine-tuning effects.",
     31         "source": "opus"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The title 'Enhancing Automated Program Repair through Fine-tuning and Prompt Engineering' is broad, but results are limited to Java code from two code review datasets. The abstract discusses 'automated program repair' without bounding to Java or code review-based repair. The threats to validity section acknowledges the single-language limitation but the title and abstract do not.",
     37         "source": "opus"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Section VI discusses data leakage as an alternative explanation for GPT-3.5 performance: 'One possible reason might be that these LLMs were also trained with our aforementioned datasets.' It also discusses hyperparameter sensitivity and dataset quality issues (vague reviews) as confounding factors.",
     43         "source": "opus"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper explicitly distinguishes between automated metrics (exact match, BLEU, CodeBLEU) and actual review fulfillment (RQ3 manual analysis), acknowledging that exact match may miss valid alternative fixes. RQ3 was specifically designed to address this proxy gap.",
     49         "source": "opus"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section VI 'Threats to Validity' is a dedicated section discussing internal and external validity threats.",
     57         "source": "opus"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "The paper discusses specific threats: hyperparameter search space limitations for this specific architecture, confinement to Java as the only programming language, and potential data leakage in GPT-3.5-Turbo due to training cutoff dates relative to dataset publication.",
     63         "source": "opus"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper explicitly states 'the datasets consisted of only Java codes and respective code reviews in the English language; hence, our focus was confined to a single programming language' and notes that 'the coverage of our findings is limited.'",
     69         "source": "opus"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding source is mentioned anywhere in the paper. There is no acknowledgments section listing grants or sponsors.",
     77         "source": "opus"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations are listed: BUET (Bangladesh), University of Notre Dame, and University of Rochester. None of the authors are affiliated with companies whose products are being evaluated (OpenAI).",
     83         "source": "opus"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No funding is disclosed, so independence of funders cannot be assessed. The absence of a funding statement does not confirm the work is unfunded.",
     89         "source": "opus"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests statement or financial interest disclosure is present in the paper. Two authors list IQVIA email addresses but this affiliation is not discussed in relation to the research.",
     95         "source": "opus"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section II (Background) explicitly defines all key terms — automated program repair, code review, LLMs, zero-shot learning, and few-shot learning — with concrete examples and citations.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section I lists six explicit contributions including validation of LLM-based code repair, PLBART vs. CodeT5 comparison, zero/few-shot investigation, manual analysis, and a replication package.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section VII (Related Works) substantively engages with prior APR approaches (SemFix, Getafix, SequenceR, DeepFix, CoCoNut) and directly positions contributions relative to Review4Repair and Tufano et al.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": true,
    125           "justification": "A Zenodo replication package is provided at https://doi.org/10.5281/zenodo.8122636, described as containing 'all the scripts used to gather the data and results.'",
    126           "source": "opus"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "Both datasets used (Tufano et al. and Review4Repair) are publicly available from prior work, and the replication package on Zenodo includes the processed data.",
    132           "source": "opus"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "The paper mentions using an 'NVIDIA GeForce RTX 2070-8GB GPU' but does not provide library versions, requirements.txt, Dockerfile, or other dependency specifications needed to recreate the environment.",
    138           "source": "opus"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "The paper does not include step-by-step reproduction instructions. The Zenodo replication package contains scripts, but the paper itself provides no README-like instructions or 'Reproducing Results' section.",
    144           "source": "opus"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "All results in Tables II and III are reported as point estimates (e.g., '25.28%', '29.82%') with no confidence intervals or error bars.",
    152           "source": "opus"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "The paper claims models 'outperform' baselines and reports improvements like '+10.23%' but never applies statistical significance tests (no p-values, t-tests, or other tests).",
    158           "source": "opus"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Table II reports absolute improvements over baselines (e.g., '+5.69', '+10.23') alongside both the baseline and model absolute values, providing sufficient context to assess effect magnitude.",
    164           "source": "opus"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": true,
    169           "justification": "For RQ3 (manual analysis), the paper justifies sample sizes: 'we randomly collected 314 test samples from Tufano et al. and 340 test samples from Review4Repair datasets in order to achieve a 95% confidence interval and 5% error of margin.'",
    170           "source": "opus"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "No variance, standard deviation, or results across multiple runs are reported. All results appear to be from single experimental runs.",
    176           "source": "opus"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "The paper compares against R4R CC (Review4Repair baseline) and Tufano 2-encoder (Tufano et al. baseline), both from the respective prior works.",
    184           "source": "opus"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "The baselines are from the original dataset papers (Review4Repair 2022 and Tufano et al. 2021), which are the most relevant and recent prior work on these specific datasets.",
    190           "source": "opus"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": false,
    195           "justification": "No ablation study is conducted. The paper compares different models but does not ablate components within any approach (e.g., removing pre-training, removing code review input) to isolate their contributions.",
    196           "source": "opus"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "The paper uses five metrics: Top-1 Accuracy, Top-5 Accuracy, Top-10 Accuracy (exact match at different beam sizes), BLEU-4, and CodeBLEU.",
    202           "source": "opus"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "RQ3 involves two software developers manually scoring repaired code from all five models based on whether the generated fix fulfills the intention of the code review, with Cohen's Kappa for inter-rater agreement.",
    208           "source": "opus"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "Both datasets use explicit train/validation/test splits. The Tufano dataset has 13,756/1,719/1,719 splits. The Review4Repair dataset was reorganized to 53,198/2,956/2,955 splits.",
    214           "source": "opus"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Figure 2 provides per-category breakdowns across three fix categories (Insert, Delete, Update) for both datasets, showing each model's performance in each category.",
    220           "source": "opus"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Section V.A discusses specific failure observations from developers: vague reviews ('nice'), context-dependent reviews ('check my previous comment'), and ground truth misalignment with focus scope. Section III-B5 details common failure patterns in LLM outputs.",
    226           "source": "opus"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "The paper reports several negative findings: few-shot prompting does not consistently outperform zero-shot (Table III, Review4Repair after heuristics), LLM accuracy is 'not satisfactory' overall, and manual analysis shows models fulfill review intentions in only ~40-59% of cases.",
    232           "source": "opus"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": false,
    239           "justification": "GPT-3.5-Turbo is referenced without a version/snapshot date (e.g., 'gpt-3.5-turbo-0301'). Code-DaVinci-Edit-001 is a specific model name. PLBART and CodeT5 reference their original papers but not specific checkpoint versions.",
    240           "source": "opus"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Listing 2 provides the full prompt structure for both zero-shot and few-shot prompting, including the actual command text ('Refactor the Buggy Code using the Review without comments') and system role content ('You are a coding assistant. You generate only the source code.'). Fill values come from public datasets.",
    246           "source": "opus"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "Temperature=0, top_p=1, frequency_penalty=0, presence_penalty=0 for LLMs. For fine-tuning: beam sizes (1, 5, 10), epochs (11, 12, 45), batch size=4, accumulated gradient steps=8, input length=512, target length=200, patience=10.",
    252           "source": "opus"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "No agentic scaffolding is used. The paper evaluates models via direct fine-tuning and single-turn prompting.",
    258           "source": "opus"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Section III-A1 describes preprocessing in detail: concatenation of buggy code and review with special tokens, removal of extra spaces and newlines, handling of samples exceeding 512 tokens (57+6 removed), and reorganization of Review4Repair into 90/5/5 splits.",
    264           "source": "opus"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": true,
    271           "justification": "Both datasets are publicly available from prior work (Tufano et al. and Review4Repair), and the Zenodo replication package provides the processed data and scripts.",
    272           "source": "opus"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Section III-A1 describes both datasets: Tufano et al. contains 17,194 samples from Gerrit and GitHub, Review4Repair contains 56,211 training and 2,961 test samples from Gerrit. Preprocessing steps are detailed.",
    278           "source": "opus"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": true,
    282           "answer": false,
    283           "justification": "For RQ3, two developers are described as having 'one year of industry experience in a Fortune 500 company' but no details on how they were recruited or selected, or whether this introduces bias.",
    284           "source": "opus"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "The pipeline from raw datasets through preprocessing (tokenization, length filtering, split reorganization), model input creation, output generation, heuristic post-processing, and evaluation is documented across Sections III-A and III-B, including counts of removed samples (57+6 over 512 tokens).",
    290           "source": "opus"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": true,
    297           "justification": "The paper states 'The knowledge cut-off of these two models is September 2021' for GPT-3.5-Turbo and Code-DaVinci-Edit-001 in Section VI.",
    298           "source": "opus"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": true,
    303           "justification": "Section VI discusses potential overlap: 'One possible reason might be that these LLMs were also trained with our aforementioned datasets. As a result, there might be a data leakage.' They note the Tufano dataset was published before September 2021 while Review4Repair was published after.",
    304           "source": "opus"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": true,
    309           "justification": "The paper acknowledges contamination risk: the Tufano dataset was published before the models' training cutoff and could be in training data. They note 'As these models are black-box, there is no way we can verify if there is data leakage for these datasets.'",
    310           "source": "opus"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": true,
    316           "answer": false,
    317           "justification": "No pre-registration is mentioned for the manual analysis study (RQ3).",
    318           "source": "opus"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": true,
    322           "answer": false,
    323           "justification": "No IRB or ethics board approval is mentioned for the developer evaluation study.",
    324           "source": "opus"
    325         },
    326         "demographics_reported": {
    327           "applies": true,
    328           "answer": true,
    329           "justification": "The two developers are described as having 'one year of industry experience in a Fortune 500 company and significant involvement in the code review process in software development.'",
    330           "source": "opus"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": true,
    334           "answer": false,
    335           "justification": "No inclusion or exclusion criteria are stated for selecting the two developer evaluators. No screening process is described.",
    336           "source": "opus"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "This is a rating/evaluation task, not an experimental study with conditions assigned to participants. Randomization of participant assignment to conditions is not applicable.",
    342           "source": "opus"
    343         },
    344         "blinding_described": {
    345           "applies": true,
    346           "answer": false,
    347           "justification": "The paper does not describe whether the two developers were blinded to which model produced each repair. Knowing the model source could introduce bias in their ratings.",
    348           "source": "opus"
    349         },
    350         "attrition_reported": {
    351           "applies": true,
    352           "answer": false,
    353           "justification": "Attrition is not explicitly reported. While results imply both raters completed all evaluations, there is no explicit statement of how many started vs. finished.",
    354           "source": "opus"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "No API costs are reported for GPT-3.5-Turbo or Code-DaVinci-Edit-001 usage, despite making API calls for all test samples across multiple prompting strategies.",
    362           "source": "opus"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "The paper mentions using an 'NVIDIA GeForce RTX 2070-8GB GPU' but does not state GPU hours, total training time, or total API spend.",
    368           "source": "opus"
    369         }
    370       },
    371       "experimental_rigor": {
    372         "seed_sensitivity_reported": {
    373           "applies": true,
    374           "answer": false,
    375           "justification": "No mention of multiple random seeds. Results appear to be from single training runs for the fine-tuned models.",
    376           "source": "opus"
    377         },
    378         "number_of_runs_stated": {
    379           "applies": true,
    380           "answer": false,
    381           "justification": "The number of experimental runs is not stated. It appears each model was trained once per configuration. For API calls, temperature=0 makes output largely (though not strictly) deterministic, but this is not explicitly discussed as justification for single runs.",
    382           "source": "opus"
    383         },
    384         "hyperparameter_search_budget": {
    385           "applies": true,
    386           "answer": false,
    387           "justification": "The paper mentions experimenting with 'various numbers of epochs' and different beam sizes but does not report the total number of configurations tried or the compute spent on hyperparameter search.",
    388           "source": "opus"
    389         },
    390         "best_config_selection_justified": {
    391           "applies": true,
    392           "answer": true,
    393           "justification": "Epoch selection is justified via validation performance: 'we used 11 epochs because we found that the model's performance remains unchanged after 11 epochs' with patience=10. CodeT5 was fine-tuned for 45 epochs 'based on observing validation losses.'",
    394           "source": "opus"
    395         },
    396         "multiple_comparison_correction": {
    397           "applies": true,
    398           "answer": false,
    399           "justification": "Many comparisons are made (multiple models × 2 datasets × multiple metrics) but no correction for multiple comparisons is applied. No statistical tests are used at all.",
    400           "source": "opus"
    401         },
    402         "self_comparison_bias_addressed": {
    403           "applies": true,
    404           "answer": false,
    405           "justification": "The authors fine-tune their own models and compare against baseline results from other papers without acknowledging the systematic bias of evaluating one's own system.",
    406           "source": "opus"
    407         },
    408         "compute_budget_vs_performance": {
    409           "applies": true,
    410           "answer": false,
    411           "justification": "The paper does not compare performance relative to compute budget. Fine-tuning PLBART/CodeT5 and calling OpenAI APIs have very different compute costs, but this is not discussed.",
    412           "source": "opus"
    413         },
    414         "benchmark_construct_validity": {
    415           "applies": true,
    416           "answer": false,
    417           "justification": "The paper does not discuss whether exact match on these specific code review datasets actually measures automated program repair capability, or whether the datasets are representative of real-world repair scenarios.",
    418           "source": "opus"
    419         },
    420         "scaffold_confound_addressed": {
    421           "applies": false,
    422           "answer": false,
    423           "justification": "No scaffolding is involved. Models are evaluated via direct fine-tuning or single-turn prompting.",
    424           "source": "opus"
    425         }
    426       },
    427       "data_leakage": {
    428         "temporal_leakage_addressed": {
    429           "applies": true,
    430           "answer": true,
    431           "justification": "Section VI discusses temporal leakage: the Tufano et al. dataset was published before the models' September 2021 training cutoff, meaning solutions could be in training data. The Review4Repair dataset was published after the cutoff.",
    432           "source": "opus"
    433         },
    434         "feature_leakage_addressed": {
    435           "applies": true,
    436           "answer": false,
    437           "justification": "No discussion of whether the evaluation setup provides information that would not be available in real-world usage (e.g., the <START>/<END> tags marking the exact buggy location).",
    438           "source": "opus"
    439         },
    440         "non_independence_addressed": {
    441           "applies": true,
    442           "answer": false,
    443           "justification": "No discussion of whether train and test examples share structural similarities (e.g., from the same repositories, same authors, or near-duplicate code patterns).",
    444           "source": "opus"
    445         },
    446         "leakage_detection_method": {
    447           "applies": true,
    448           "answer": false,
    449           "justification": "The paper acknowledges leakage risk but explicitly states 'As these models are black-box, there is no way we can verify if there is data leakage' without applying any detection method (canary strings, n-gram overlap, membership inference).",
    450           "source": "opus"
    451         }
    452       }
    453     }
    454   },
    455   "claims": [
    456     {
    457       "claim": "Fine-tuned PLBART and CodeT5 significantly outperform prior baselines on code review-guided automated program repair",
    458       "evidence": "Table II: CodeT5 improves Top-10 accuracy by 24.72pp on Tufano et al. dataset; PLBART by 20.41pp. On Review4Repair, both improve by ~9-10pp over R4R CC baseline.",
    459       "supported": "strong"
    460     },
    461     {
    462       "claim": "CodeT5 outperforms PLBART primarily due to better NL/PL comprehension captured in pre-training",
    463       "evidence": "Authors attribute the difference to CodeT5's identifier-aware pre-training, but no ablation study is provided to isolate this factor from architectural differences.",
    464       "supported": "weak"
    465     },
    466     {
    467       "claim": "Zero-shot GPT-3.5-Turbo substantially underperforms fine-tuned models on exact match accuracy",
    468       "evidence": "Table III: Zero-shot GPT-3.5-Turbo with heuristics achieves 22.06% (R4R) vs. CodeT5's 29.82%; 31.70% (Tufano) vs. CodeT5's 33.28%.",
    469       "supported": "moderate"
    470     },
    471     {
    472       "claim": "Heuristic post-processing improves GPT-3.5-Turbo exact match accuracy by 12-16 percentage points",
    473       "evidence": "Table III shows accuracy improvement from 6.9% to 22.06% (+15.16pp) on R4R and 17.86% to 31.70% (+13.84pp) on Tufano after applying five heuristics.",
    474       "supported": "strong"
    475     },
    476     {
    477       "claim": "Code-DaVinci-Edit-001 achieves competitive or best performance without any fine-tuning",
    478       "evidence": "Table III: Code-DaVinci-Edit-001 achieves 40.70% accuracy on Tufano et al. (best among all models tested) and highest CodeBLEU on both datasets without task-specific fine-tuning.",
    479       "supported": "strong"
    480     },
    481     {
    482       "claim": "Language models fail to fulfill code review intentions in over 40% of cases across all models",
    483       "evidence": "Table IV: Best fulfillment rates are 58.92% (Code-DaVinci-Edit-001 on Tufano) and 52.65% (CodeT5 on R4R), implying 41-47% failure rates even for best-performing models.",
    484       "supported": "strong"
    485     },
    486     {
    487       "claim": "GPT-3.5-Turbo's strong performance may be inflated by training data contamination",
    488       "evidence": "Section VI acknowledges that the Tufano et al. dataset predates the September 2021 knowledge cutoff, but contamination cannot be verified given the models' black-box nature.",
    489       "supported": "weak"
    490     }
    491   ],
    492   "methodology_tags": [
    493     "benchmark-eval",
    494     "qualitative"
    495   ],
    496   "key_findings": "Fine-tuned PLBART and CodeT5 substantially outperform prior seq2seq baselines on code-review-guided automated program repair, with CodeT5 improving Top-10 accuracy by up to 25 percentage points on the Tufano et al. dataset. Zero-shot GPT-3.5-Turbo underperforms fine-tuned models on exact match but benefits significantly (~15pp) from heuristic post-processing, while Code-DaVinci-Edit-001 achieves competitive or best results without fine-tuning. Manual developer evaluation reveals that even best-performing models fulfill code review intentions in fewer than 60% of cases, demonstrating that automated program repair guided by code review remains far from production-ready despite benchmark accuracy improvements. Contamination is acknowledged but cannot be ruled out for the black-box LLMs, since the Tufano et al. dataset predates their September 2021 training cutoff.",
    497   "red_flags": [
    498     {
    499       "flag": "No significance tests for main comparisons",
    500       "detail": "Tables II and III report only point estimates with no statistical significance tests for primary accuracy comparisons, despite claiming models 'significantly' outperform baselines."
    501     },
    502     {
    503       "flag": "Single-run results without variance",
    504       "detail": "All accuracy metrics are single-run point estimates with no variance, standard deviation, or confidence intervals reported, making result reliability unknown."
    505     },
    506     {
    507       "flag": "Unverifiable data contamination for LLMs",
    508       "detail": "The Tufano et al. dataset was published before GPT-3.5's September 2021 knowledge cutoff; the paper acknowledges but cannot rule out test set contamination in GPT model evaluations, potentially inflating LLM results."
    509     },
    510     {
    511       "flag": "Two-person human evaluation with no blinding described",
    512       "detail": "RQ3 uses only 2 developers as evaluators with no blinding procedure described, making it unclear whether raters knew which model produced which output during scoring."
    513     },
    514     {
    515       "flag": "No ablation study",
    516       "detail": "The paper does not include an ablation study isolating contributions of pre-training, fine-tuning, or code review inclusion, making it impossible to attribute which factors drive improvements."
    517     },
    518     {
    519       "flag": "Java-only generalizability",
    520       "detail": "All experiments use Java-only datasets; no testing on other programming languages substantially limits generalizability of claims about 'automated program repair' broadly."
    521     }
    522   ],
    523   "cited_papers": [
    524     {
    525       "title": "Review4Repair: Code review aided automatic program repairing",
    526       "relevance": "Primary dataset source and direct prior work establishing code-review-guided APR baseline; directly compared against throughout."
    527     },
    528     {
    529       "title": "Towards automating code review activities (Tufano et al., ICSE 2021)",
    530       "relevance": "Second primary dataset source and baseline; established the code review + APR paradigm this work extends."
    531     },
    532     {
    533       "title": "Unified pre-training for program understanding and generation (PLBART)",
    534       "relevance": "One of two fine-tuned models evaluated; key pre-trained model combining NL and PL understanding."
    535     },
    536     {
    537       "title": "CodeT5: Identifier-aware unified pre-trained encoder-decoder models for code understanding and generation",
    538       "relevance": "Best-performing fine-tuned model in most experiments; identifier-aware pre-training is cited as key advantage."
    539     },
    540     {
    541       "title": "Evaluating large language models trained on code (Codex)",
    542       "relevance": "Base model underlying Code-DaVinci-Edit-001; establishes LLM code generation capability benchmark."
    543     },
    544     {
    545       "title": "Language models are few-shot learners (GPT-3/Brown et al.)",
    546       "relevance": "Foundation model for GPT-3.5-Turbo; establishes few-shot prompting paradigm used in this paper."
    547     },
    548     {
    549       "title": "CoCoNut: Combining context-aware neural translation models using ensemble for program repair",
    550       "relevance": "Prior learning-based APR approach without code review; contextualizes baseline repair capabilities."
    551     },
    552     {
    553       "title": "Exploring the effectiveness of large language models in generating unit tests (Siddiq et al.)",
    554       "relevance": "Sister study by overlapping authors using GPT-3.5-Turbo for code tasks; zero-shot prompting methodology directly borrowed."
    555     },
    556     {
    557       "title": "Pre-train, prompt, and predict: A systematic survey of prompting methods in NLP",
    558       "relevance": "Foundational survey establishing prompt engineering framework applied throughout this work."
    559     }
    560   ],
    561   "engagement_factors": {
    562     "practical_relevance": {
    563       "score": 2,
    564       "justification": "Provides concrete fine-tuning and prompt engineering recipes for code repair with publicly available tools and datasets."
    565     },
    566     "surprise_contrarian": {
    567       "score": 1,
    568       "justification": "The finding that LLMs still fail to fulfill review intentions ~50% of the time is mildly surprising given the hype around LLMs for code tasks."
    569     },
    570     "fear_safety": {
    571       "score": 0,
    572       "justification": "No safety, security, or AI risk concerns raised."
    573     },
    574     "drama_conflict": {
    575       "score": 0,
    576       "justification": "No controversy or conflict with established claims."
    577     },
    578     "demo_ability": {
    579       "score": 1,
    580       "justification": "Zenodo replication package with scripts is available, but no live demo or pip-installable tool."
    581     },
    582     "brand_recognition": {
    583       "score": 1,
    584       "justification": "Uses GPT-3.5-Turbo (recognizable) but the authors and institutions are not widely known."
    585     }
    586   },
    587   "hn_data": {
    588     "threads": [
    589       {
    590         "hn_id": "37215331",
    591         "title": "The Simplest Walking Robot: A bipedal robot with 1 actuator and 2 rigid bodies",
    592         "points": 59,
    593         "comments": 29,
    594         "url": "https://news.ycombinator.com/item?id=37215331"
    595       },
    596       {
    597         "hn_id": "37518075",
    598         "title": "Agents: An Open-Source Framework for Autonomous Language Agents",
    599         "points": 7,
    600         "comments": 1,
    601         "url": "https://news.ycombinator.com/item?id=37518075"
    602       },
    603       {
    604         "hn_id": "46100377",
    605         "title": "RIP Twitter API: A eulogy to its vast research contributions",
    606         "points": 4,
    607         "comments": 0,
    608         "url": "https://news.ycombinator.com/item?id=46100377"
    609       },
    610       {
    611         "hn_id": "40117178",
    612         "title": "RIP Twitter API: A eulogy to its research contributions",
    613         "points": 4,
    614         "comments": 0,
    615         "url": "https://news.ycombinator.com/item?id=40117178"
    616       },
    617       {
    618         "hn_id": "37478569",
    619         "title": "Brain-Inspired Computational Intelligence via Predictive Coding",
    620         "points": 4,
    621         "comments": 0,
    622         "url": "https://news.ycombinator.com/item?id=37478569"
    623       },
    624       {
    625         "hn_id": "47717676",
    626         "title": "Your Agent Is Mine: Measuring Malicious Attacks on the LLM Supply Chain",
    627         "points": 3,
    628         "comments": 1,
    629         "url": "https://news.ycombinator.com/item?id=47717676"
    630       },
    631       {
    632         "hn_id": "37189091",
    633         "title": "Calypso: LLMs as Dungeon Masters' Assistants",
    634         "points": 3,
    635         "comments": 0,
    636         "url": "https://news.ycombinator.com/item?id=37189091"
    637       },
    638       {
    639         "hn_id": "35613390",
    640         "title": "Nearby Stars' Close Encounters with the Brightest Earth Transmissions",
    641         "points": 2,
    642         "comments": 1,
    643         "url": "https://news.ycombinator.com/item?id=35613390"
    644       },
    645       {
    646         "hn_id": "36690558",
    647         "title": "AVX Timing Side-Channel Attacks Against Address Space Layout Randomization",
    648         "points": 2,
    649         "comments": 0,
    650         "url": "https://news.ycombinator.com/item?id=36690558"
    651       },
    652       {
    653         "hn_id": "37721210",
    654         "title": "Large Language Models as Superpositions of Cultural Perspectives",
    655         "points": 2,
    656         "comments": 0,
    657         "url": "https://news.ycombinator.com/item?id=37721210"
    658       }
    659     ],
    660     "top_points": 59,
    661     "total_points": 90,
    662     "total_comments": 32
    663   }
    664 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs