scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30181B)
      1 {
      2   "paper": {
      3     "title": "Automatic Patch Correctness Assessment with Large Language Model",
      4     "authors": [
      5       "Xin Zhou",
      6       "Bowen Xu",
      7       "Kisub Kim",
      8       "DongGyun Han",
      9       "Hung Huu Nguyen",
     10       "Thanh Le-Cong",
     11       "Junda He",
     12       "Bach Le",
     13       "David Lo"
     14     ],
     15     "year": 2024,
     16     "venue": "arXiv.org",
     17     "arxiv_id": "2303.00202",
     18     "doi": "10.48550/arXiv.2303.00202"
     19   },
     20   "scan_version": 3,
     21   "active_modules": ["experimental_rigor", "data_leakage"],
     22   "methodology_tags": ["benchmark-eval"],
     23   "key_findings": "LLM4PatchCorrect uses Starcoder-7B with in-context learning to assess patch correctness in a cross-tool setting where no labeled patches from the target APR tool are available. It achieves 84.4% accuracy and 86.5% F1 on average across 22 APR tools, significantly outperforming prior static and dynamic APCA approaches. The contrastive learning-based retrieval of semantically similar patches is the most impactful component, contributing an 84.7% relative AUC improvement. Results generalize to the Bears benchmark with 92.1% average accuracy.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "Section 6.5 states 'We publicly share our implementation and dataset for future comparisons' and Section 5.1 references a 'replication package,' but no URL, repository link, or archive is provided anywhere in the paper."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The evaluation uses publicly available datasets from Wang et al. [29] and Tian et al. [23], merged by Lin et al. [33], containing 1,179 patches from the Defects4J benchmark [60]. The underlying data is publicly accessible through these prior works."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper mentions using a 2080-Ti GPU with 12GB memory, HuggingFace library, and int8 quantization, but provides no requirements.txt, Dockerfile, or detailed dependency listing with library versions sufficient to recreate the environment."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No step-by-step reproduction instructions are provided. The paper describes the methodology conceptually but does not include a README, scripts, or commands to replicate experiments."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Tables 2–4, 6, 8–10 report only point estimates for Accuracy, F1, and AUC. No confidence intervals, error bars, or ± notation appears anywhere in the results."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 5.1 states: 'we conduct the Wilcoxon signed-rank tests between LLM4PatchCorrect and all baselines to investigate whether the improvements are significant. The results show that LLM4PatchCorrect is statistically significantly better than all baselines (all p-values are less than 0.05).'"
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The paper reports both absolute values and relative improvement percentages throughout, e.g., '20.9% ((84.4−69.8)/69.8)' improvement against Tian et al. in accuracy (Section 5.1), providing sufficient context for effect size interpretation."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The dataset of 1,179 patches from 22 APR tools is used without any justification for adequacy or power analysis. Some individual APR tool test sets are very small (e.g., SOFix has only 11 patches, HDRepair 8 patches)."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. All results are single-point estimates with no indication of result stability."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Six baselines are compared: Patch-Sim [25], CodeBERT [40], Tian et al. [23], ODS [27], Quatrain [34], and Cache [33], covering both dynamic and static APCA approaches."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Cache (Lin et al., 2022) and Quatrain (Tian et al., 2022) represent the state-of-the-art at the time of writing. The baselines span from 2018 (Patch-Sim) to 2022 (Cache), which is appropriate."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 5.2 (RQ2) presents a systematic ablation study in Table 7, evaluating the contribution of bug information, test information, and retrieved patches, building from LLM-only to the full model."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Three evaluation metrics are used: Accuracy, F1-score, and AUC, reported in Tables 2, 3, and 4 respectively. Both averaged and weighted averaged results are provided."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "Evaluation is entirely automated using pre-existing correctness labels. No human evaluation of the system's predictions is performed."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Cross-tool validation uses a leave-one-out design (Section 4.2): patches from each APR tool serve as the test set while patches from all other tools form the training set. Hyperparameter tuning used a separate 5% random split of the labeled patch pool (Section 4.5)."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Tables 2–4 provide per-APR-tool breakdowns for all 22 tools, including the correct:wrong ratio for each. Tables 8–9 provide per-tool results on the Bears benchmark."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 6.1 provides a case study showing how the retrieval module corrects an initial misprediction. Section 5.1 discusses SOFix's low scores due to class imbalance (10:1 ratio) and ODS inference failures on certain patches."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Tables 2–4 show cases where LLM4PatchCorrect does not improve over baselines, e.g., CapGen (-2.4% accuracy), GenProg (-4.2%), kPAR (-7.2%), Jaid (-1.9%). These are reported transparently."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The abstract claims 84.4% accuracy and 86.5% F1, which match the average row in Tables 2 and 3. The claimed improvement ranges (10.2%–32.4% accuracy, 6.1%–24.1% F1, 10.1%–33.2% AUC) are supported by the tables."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The ablation study (Table 7) systematically adds components to isolate their individual contributions. The controlled single-variable manipulations in the ablation design support claims like 'the patch retrieval module leads to relative improvements of 11.4%, 3.9%, and 84.7%' (Section 5.2)."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The title, 'Automatic Patch Correctness Assessment with Large Language Model', is not bounded to Java, yet all experiments use only Java patches from the Defects4J and Bears benchmarks. The paper makes no explicit statement that results may not generalize to other languages or patch types."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Section 6.5 discusses specific threats: selection bias from choosing particular LLMs, dataset selection bias from using specific benchmarks, prompt design optimality ('we cannot ensure that our prompt is optimal'), and model under-training. These are specific to the study design."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper measures classification accuracy/F1/AUC on manually labeled patch correctness data, and claims to assess patch correctness. The measurement directly matches the claimed outcome — no proxy gap exists between the metric and the framing."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Specific model identifiers are provided: Starcoder-7B, Starcoder-1B, Starcoder-3B, CodeLlama-7B, CodeGen2-3.7B, BLOOM-1.7B, CodeBERT (Table 1, Table 10). These are versioned open-source models with fixed weights."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The prompt template is given in Section 3.1: '{test-patch} Q: It was wrong or correct? A: It was'. Guiding information templates are shown in Section 3.3 and Figure 2. Figure 7 shows a complete concatenated example input."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Contrastive learning: lr=5e-5, batch_size=64, epochs=3 (Section 3.2.3). CodeBERT fine-tuning: lr=1e-5, epochs=8 (Section 4.3). Retrieval: k=10, β=0.9 (Section 4.5). int8 quantization for inference (Section 3.4.2). Inference uses argmax probability comparison (deterministic)."
    163       },
    164       "scaffolding_described": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "No agentic scaffolding is used. The approach is a direct inference pipeline: retrieve similar patches → concatenate guiding information → single LLM inference pass."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 4.1 documents dataset merging from two sources, duplicate removal via string matching, manual semantic duplicate check (identifying 2 pairs of semantically equivalent patches), and BPE tokenization (Section 3.1)."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 6.5 'Threats to Validity' contains three subsections: Threats to External Validity, Threats to Internal Validity, and Threats to Construct Validity, with substantive discussion in each."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Specific threats include: 'our study may have a selection bias by considering only several large pre-trained models,' 'the manually crafted prompt...we cannot ensure that our prompt is optimal,' and 'the large pre-trained model we employ...may have been under-trained.' These are specific to this study's design."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "The paper does not explicitly state what the results do NOT show. It does not mention that results are limited to Java, to Defects4J-style bugs, or to the specific APR tool types tested. The future work section mentions other tasks but does not bound the current scope."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The evaluation uses publicly available patch datasets from Wang et al. [29] and Tian et al. [23], built on the public Defects4J [60] and Bears [65] benchmarks. Raw patches and labels are accessible through these prior works."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section 4.1 describes the dataset: 1,179 patches from 22 APR tools, merged from two large-scale datasets, with correctness labels 'carefully labeled and checked' by developers. Deduplication steps are documented."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": false,
    205         "answer": false,
    206         "justification": "No human participants. Data comes from standard public benchmarks (Defects4J, Bears) and previously published patch datasets."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "The pipeline is documented: two datasets merged (Section 4.1) → duplicate removal via string matching → manual semantic duplicate check (2 pairs found and removed) → cross-tool split for evaluation (Section 4.2) → tokenization via BPE (Section 3.1)."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding acknowledgment or grant information appears anywhere in the paper text provided."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations are clearly listed: Singapore Management University, North Carolina State University, Royal Holloway University of London, and University of Melbourne."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No funding source is disclosed, so independence of funder from outcome cannot be verified."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests statement or financial interests declaration appears in the paper."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "Starcoder-7B's training data cutoff is not stated. The paper mentions it is pre-trained on '1,000 billion tokens from 80+ programming languages' but does not specify the temporal range of the training corpus."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "While Section 4.2 removes identical patches between the cross-tool train/test splits, there is no discussion of whether Starcoder-7B's pre-training data includes the Defects4J benchmark code, patches, or bug descriptions."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "Defects4J has been publicly available since 2014 and its bugs/patches are widely discussed online. Starcoder was trained on public code repositories. The paper does not discuss whether Starcoder may have seen the benchmark data during pre-training."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study. The evaluation uses automated classification on pre-labeled datasets."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants. The study evaluates patch classification on existing benchmarks."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in the study."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in the study."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in the study."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in the study."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "Section 7 (Related Work) states 'LLM4PatchCorrect only costs 2.4 seconds for each patch,' compared to Invalidator's 5 hours + 7 minutes per patch. Hardware is specified as a 2080-Ti GPU with 12GB memory."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The paper mentions the 2080-Ti GPU and per-patch inference time (2.4s), but does not state total compute budget for the contrastive learning training, hyperparameter search, or full experimental evaluation."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No mention of multiple random seeds or seed sensitivity analysis. The contrastive learning training and any stochastic components are not analyzed for seed sensitivity."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The number of experimental runs is never stated. Results appear to be from single runs without any indication of repetition."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "Section 4.5 reports the hyperparameter search: β=[0.80, 0.82, 0.85, 0.87, 0.9, 0.92, 0.95] and k=[2, 4, 6, 8, 10, 12, 14, 15] (7×8=56 configurations), conducted on a randomly split 5% of the labeled patch pool."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": true,
    321         "justification": "Section 4.5 describes selection of k=10 and β=0.9 based on AUC performance on a separate 5% validation split of the labeled patch pool, with results visualized in Figure 8."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The paper performs Wilcoxon signed-rank tests across 22 APR tools for 6 baselines but does not mention any correction for multiple comparisons (e.g., Bonferroni, Holm)."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper's authors implement most baselines themselves (except Cache, for which they use the implementation released by Cache's original authors). There is no discussion of self-comparison bias or the risk that their reimplementations may underperform the originals."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "Table 10 compares different LLM sizes but does not report performance as a function of compute budget. The compute difference between LLM4PatchCorrect (Starcoder-7B inference + CodeBERT contrastive training) and baselines (e.g., CodeBERT fine-tuning) is not discussed."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "No discussion of whether Defects4J or Bears benchmarks are valid proxies for real-world APCA scenarios. The paper does not question whether these benchmarks' patch distributions represent realistic deployment settings."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "No agentic scaffolding is used. The approach is a direct retrieval-augmented inference pipeline."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Not discussed. Defects4J bugs and patches were available online before Starcoder's training. The contrastive learning model (CodeBERT) was trained on ManySStuBs4J, but temporal ordering relative to evaluation data is not addressed."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "The guiding information (bug descriptions, execution traces, test cases, coverage) is provided as features, but there is no discussion of whether any of these features leak correctness information beyond what would be available in a realistic deployment scenario."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": true,
    363         "justification": "Section 4.1 describes duplicate removal via string matching and manual semantic duplicate checking (identifying 2 pairs). Section 4.2 removes identical patches between train/test splits 'to avoid the data leaking issue.' Different APR tools can generate identical patches for the same bug."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": true,
    368         "justification": "Concrete deduplication methods are applied: string matching for identical patches and manual examination for semantic equivalence (Section 4.1). Section 4.2 explicitly removes training patches identical to test patches."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "LLM4PatchCorrect achieves 84.4% accuracy and 86.5% F1-score on average across 22 APR tools in cross-tool validation.",
    375       "evidence": "Tables 2 and 3 (Section 5.1) show per-tool and averaged results. Wilcoxon signed-rank tests confirm statistical significance (all p < 0.05).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "LLM4PatchCorrect outperforms all baseline APCA approaches including Cache (state-of-the-art) by 14.7% accuracy, 6.8% F1, and 30.7% AUC on average.",
    380       "evidence": "Tables 2–4 provide head-to-head comparisons. Improvements over Cache: 84.4 vs 73.6 accuracy, 86.5 vs 81.0 F1, 80.4 vs 61.5 AUC (Section 5.1).",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "The contrastive learning-based patch retrieval module is the most impactful component, contributing 84.7% relative AUC improvement.",
    385       "evidence": "Table 7 (Section 5.2) shows ablation results: AUC jumps from 41.8 (LLM only) to 77.2 (LLM + retrieved patches). Bug/test information adds smaller incremental gains.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "LLM4PatchCorrect generalizes to the Bears benchmark with 92.1% average accuracy, outperforming the best baseline by 27.0%.",
    390       "evidence": "Tables 8–9 (Section 6.2) show results on Bears across 4 APR tools. Best baseline Quatrain achieves 72.5% vs LLM4PatchCorrect's 92.1%.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "LLM4PatchCorrect can filter out 90.4% of overfitting patches while increasing the correct patch ratio from 21.7% to 63.7%.",
    395       "evidence": "Confusion matrix analysis in Section 5.1 and Figure 9(b): 584/646 overfitting patches correctly identified, remaining patches have 109/(109+62) = 63.7% correct ratio.",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "Larger LLM model sizes generally yield better performance within the LLM4PatchCorrect framework.",
    400       "evidence": "Table 10 (Section 6.3) compares 6 models from 1B to 7B parameters. Starcoder-7B leads in accuracy (84.4%) and F1 (86.5%), though Starcoder-1B is competitive in F1 (85.9%).",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "red_flags": [
    405     {
    406       "flag": "No variance or seed sensitivity reporting",
    407       "detail": "All results are single-point estimates with no standard deviations, confidence intervals, or multi-seed analysis. The contrastive learning component involves stochastic training, but stability is never assessed."
    408     },
    409     {
    410       "flag": "Benchmark contamination unaddressed",
    411       "detail": "Starcoder-7B was trained on public code repositories that likely include Defects4J patches and bug-related discussions. The paper never discusses whether the LLM may have memorized correctness patterns for these specific bugs."
    412     },
    413     {
    414       "flag": "Promised artifacts not provided",
    415       "detail": "Section 6.5 claims 'We publicly share our implementation and dataset' and Section 5.1 references a 'replication package,' but no URL, repository link, or archive identifier is provided in the paper."
    416     },
    417     {
    418       "flag": "Very small test sets for some APR tools",
    419       "detail": "Several APR tools have tiny test sets: SOFix (11 patches, 10:1 ratio), HDRepair (8 patches), Cardumen (9 patches, 0 correct). Results on these tools have high variance and limited statistical meaning."
    420     },
    421     {
    422       "flag": "Bears benchmark evaluation limited",
    423       "detail": "The Bears benchmark evaluation (Section 6.2) uses only 4 APR tools, and some have very few patches (Kali: 6, RSRepair: 2). The 100% accuracy on RSRepair with only 2 patches is not meaningful."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "Starcoder: may the source be with you!",
    429       "authors": ["R. Li", "L. B. Allal", "Y. Zi"],
    430       "year": 2023,
    431       "arxiv_id": "2305.06161",
    432       "relevance": "Core LLM backbone used in this work; key open-source code LLM for evaluating in-context learning on SE tasks."
    433     },
    434     {
    435       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    436       "authors": ["Z. Feng", "D. Guo", "D. Tang"],
    437       "year": 2020,
    438       "arxiv_id": "2002.08155",
    439       "relevance": "Used as baseline and as the base model for contrastive learning patch embeddings; foundational code representation model."
    440     },
    441     {
    442       "title": "Code Llama: Open Foundation Models for Code",
    443       "authors": ["B. Roziere", "J. Gehring", "F. Gloeckle"],
    444       "year": 2023,
    445       "arxiv_id": "2308.12950",
    446       "relevance": "Alternative open-source code LLM compared in the model size analysis experiment."
    447     },
    448     {
    449       "title": "Language Models are Few-Shot Learners",
    450       "authors": ["T. B. Brown", "B. Mann", "N. Ryder"],
    451       "year": 2020,
    452       "arxiv_id": "2005.14165",
    453       "relevance": "Introduced in-context learning paradigm that LLM4PatchCorrect builds upon."
    454     },
    455     {
    456       "title": "Context-aware code change embedding for better patch correctness assessment",
    457       "authors": ["B. Lin", "S. Wang", "M. Wen", "X. Mao"],
    458       "year": 2022,
    459       "relevance": "State-of-the-art APCA baseline (Cache) that this work outperforms; provides the evaluation dataset."
    460     },
    461     {
    462       "title": "Evaluating representation learning of code changes for predicting patch correctness in program repair",
    463       "authors": ["H. Tian", "K. Liu", "A. K. Kaboré"],
    464       "year": 2020,
    465       "relevance": "Prior APCA approach using code representation learning as baseline; provides part of the evaluation dataset."
    466     },
    467     {
    468       "title": "Invalidator: Automated patch correctness assessment via semantic and syntactic reasoning",
    469       "authors": ["T. Le-Cong", "D.-M. Luong", "X. B. D. Le"],
    470       "year": 2023,
    471       "arxiv_id": "2301.01113",
    472       "relevance": "Related APCA approach combining dynamic and static features; compared on inference time (5 hours vs 2.4 seconds)."
    473     },
    474     {
    475       "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning",
    476       "authors": ["C. S. Xia", "L. Zhang"],
    477       "year": 2022,
    478       "relevance": "AlphaRepair uses an infilling-style LLM for program repair without fine-tuning; a related zero-shot approach in the APR domain."
    479     },
    480     {
    481       "title": "Evaluating large language models trained on code",
    482       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    483       "year": 2021,
    484       "arxiv_id": "2107.03374",
    485       "relevance": "Codex/HumanEval paper, foundational work on LLMs for code that motivates using LLMs for SE tasks."
    486     },
    487     {
    488       "title": "Automated patch correctness assessment: How far are we?",
    489       "authors": ["S. Wang", "M. Wen", "B. Lin"],
    490       "year": 2020,
    491       "relevance": "Large-scale empirical study of APCA approaches; provides evaluation dataset and Patch-Sim comparison data."
    492     },
    493     {
    494       "title": "Automated classification of overfitting patches with statically extracted code features",
    495       "authors": ["H. Ye", "J. Gu", "M. Martinez"],
    496       "year": 2022,
    497       "relevance": "ODS baseline using AST-level static code features for patch correctness classification."
    498     },
    499     {
    500       "title": "SimCSE: Simple contrastive learning of sentence embeddings",
    501       "authors": ["T. Gao", "X. Yao", "D. Chen"],
    502       "year": 2021,
    503       "arxiv_id": "2104.08821",
    504       "relevance": "Contrastive learning method (dropout-based positive sampling) adapted for patch embedding in this work."
    505     }
    506   ],
    507   "engagement_factors": {
    508     "practical_relevance": {
    509       "score": 2,
    510       "justification": "Directly useful for APR practitioners who need to filter overfitting patches, but requires implementing the full pipeline with Starcoder-7B and contrastive learning."
    511     },
    512     "surprise_contrarian": {
    513       "score": 1,
    514       "justification": "Applying LLMs via in-context learning to patch assessment is a natural extension rather than a surprising finding."
    515     },
    516     "fear_safety": {
    517       "score": 0,
    518       "justification": "No AI safety or security concerns raised; the work is about improving software quality assurance."
    519     },
    520     "drama_conflict": {
    521       "score": 0,
    522       "justification": "No controversial claims or conflicts with existing work; straightforward performance comparison."
    523     },
    524     "demo_ability": {
    525       "score": 0,
    526       "justification": "No code repository URL, demo, or installable tool is provided despite claims of sharing artifacts."
    527     },
    528     "brand_recognition": {
    529       "score": 1,
    530       "justification": "Uses Starcoder (BigCode project) and CodeBERT — known models but not top-tier brand recognition like GPT-4 or Copilot."
    531     }
    532   }
    533 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs