scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30971B)
      1 {
      2   "paper": {
      3     "title": "Parameter-Efficient Fine-Tuning with Attributed Patch Semantic Graph for Automated Patch Correctness Assessment",
      4     "authors": [
      5       "Zhenyu Yang",
      6       "Jingwen Wu",
      7       "Zhen Yang",
      8       "Zhongxing Yu"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv.org",
     12     "arxiv_id": "2505.02629",
     13     "doi": "10.48550/arXiv.2505.02629"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "The paper proposes APSG (Attributed Patch Semantic Graph) and Graph-LoRA, a parameter-efficient fine-tuning method that incorporates graph structural information into LLMs for automated patch correctness assessment. Evaluated on five APCA datasets with three LLMs (StarCoder, CodeLlama, Llama3), the method improves accuracy by 3.1%–7.5% and F1 by 3.0%–7.1% over the prior state-of-the-art LLM4PatchCorrect. Ablation studies show all components (APSG graph structure, node attributes, GNN, attention-based fusion) contribute positively, with context nodes having the largest impact among APSG node types.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper states: 'Our replication package (including code, dataset, etc.) is available at https://github.com/SEdeepL/GraphLoRA.' A concrete GitHub URL is provided."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The datasets used (Wang, Merge, Balance, Lin) are from publicly available prior work based on Defects4J, Bugs.jar, and Bears benchmarks. The replication package also includes dataset materials. The newly constructed Multi-Benchmarks dataset is built from publicly available sources with the construction process documented."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "Section 5.5 mentions 'Pytorch framework' and 'Ubuntu 22.04.5 server equipped with two RTX A6000 GPUs' but provides no specific library versions, requirements.txt, or detailed environment specification sufficient to recreate the setup."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper provides a GitHub URL for the replication package but does not include step-by-step reproduction instructions within the paper itself. No 'Reproducing Results' section or specific commands to run experiments are given."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "All results in Tables 4–17 are reported as point estimates (e.g., '96.8%') without confidence intervals, error bars, or any uncertainty quantification, despite using 10-fold cross-validation."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper makes numerous claims that 'our method outperforms' baselines based solely on comparing raw percentage values. No statistical significance tests (t-test, Wilcoxon, bootstrap, etc.) are used anywhere in the paper."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The paper reports percentage improvements with baseline context throughout. For example, 'our method improves the accuracy and F1 score by 3.1% to 7.5% and 3.0% to 7.1% respectively' (Section 6.1), with full baseline values visible in results tables."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification is provided for dataset sizes or power analysis. The datasets range from 542 to 7,794 patches, but there is no discussion of whether these sizes are adequate for the claims being made."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper states '10-fold cross-validation and take the average of 10 rounds' (Section 4.5) but does not report standard deviation, interquartile range, or any spread measure across the 10 folds in any table."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Extensive baselines are included: 2 dynamic methods (PATCH-SIM, Opad), 3 traditional static methods (S3, ssFix, CapGen), 6 ML-based methods (ODS, BERT-LR, BATS, PANTHER, CACHE, APPT), and 1 LLM-based method (LLM4PatchCorrect), evaluated in 3 LLM variants. See Section 5.3."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Key baselines are recent: LLM4PatchCorrect (2024), APPT (2024), CACHE (2022), PANTHER (2023). The paper includes the most recent state-of-the-art methods in APCA."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Two comprehensive ablation studies are performed in Section 6.2: (1) model structure ablation progressively removing attention, Graph-LoRA, APSG attributes, APSG graph, and training (Tables 9–10); (2) APSG node ablation removing variable, control, and context nodes (Tables 11–12)."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Four evaluation metrics are used: accuracy, precision, recall, and F1 score (Section 5.4). All four are reported for every experiment."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "Evaluation is entirely automated using accuracy, precision, recall, and F1 score computed against ground-truth labels. No human evaluation of the system's outputs is performed."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper uses 10-fold cross-validation where each fold serves as a held-out test set (Section 4.5). Additionally, cross-project evaluation (Section 6.3) uses entire projects as held-out test sets."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are broken down per dataset (Tables 4–8) and per project in cross-project evaluation (Tables 13–17, with Chart, Closure, Lang, Math, and Time individually reported)."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 7.1 presents a detailed case study with four cases: true negative, false negative (Fig. 4b), false positive (Fig. 4c), and true positive, with analysis of why the model failed in each error case."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The cross-project evaluation (Section 6.3) explicitly reports performance degradation compared to cross-validation. The case study discusses false positives and false negatives. The ablation study reveals that removing context nodes causes significant performance drops."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims 'improves the accuracy and F1 score by 3.1% to 7.5% and 3.0% to 7.1% respectively' which is supported by the comparison with LLM4PatchCorrect across Tables 4–8. The claim of not relying on ground-truth patches is confirmed by the methodology."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Causal claims (e.g., 'APSG plays a significant role') are justified through controlled ablation studies (Section 6.2) where individual components are systematically removed while keeping others constant. This constitutes controlled single-variable manipulation."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title and abstract frame contributions as general 'Automated Patch Correctness Assessment' but all evaluation is on Java patches only. The threats to validity acknowledge 'Our implementation currently supports Java language only' (Section 7.3), but the abstract does not hedge this limitation."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Section 7.3 discusses specific alternative explanations: (1) LLMs may have seen benchmark data during pre-training, which is addressed with a base-LLM performance analysis and the observation that the same issue applies equally to the baseline LLM4PatchCorrect; (2) implementation errors, mitigated by independent artifact review."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper directly measures patch correctness classification (accuracy, precision, recall, F1) against ground-truth labels. The claims match the granularity of measurements — no proxy gap exists between what is measured and what is claimed."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper names 'StarCoder', 'CodeLlama', and 'Llama3' without specific version identifiers. The only size information is a passing mention of '7B size' in Section 4.5; exact model versions (e.g., CodeLlama-7b-Instruct, Llama-3-8B) are not specified."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The actual prompt text is provided in Section 4.2: 'You are a model responsible for assessing patch correctness. Assess whether the patch is correct.' The full input format with <P> tokens wrapping the patch is described in Equation 1 and Fig. 5."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section 5.5 reports: Adam optimizer, learning rate range (0 to 0.00005), maximum sequence length (1024), low-rank dimension (256), 10-fold cross-validation. Hardware is specified as two RTX A6000 GPUs."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. The approach is a direct fine-tuning pipeline: APSG construction → GNN encoding → Graph-LoRA fine-tuning → classification."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 3 details APSG construction from patches using the Spoon code analysis tool. Section 4.2 describes code tokenization with <P> tokens. Section 5.2 describes dataset construction including source merging and deduplication for Multi-Benchmarks (Table 3)."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 7.3 'Threats to Validity' provides substantive discussion of both external and internal validity threats in dedicated subsections."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 7.3 discusses study-specific threats: Java-only implementation, potential LLM data leakage on the specific datasets used, and implementation error mitigation through independent artifact review by multiple authors."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section 7.3 states: 'Our implementation currently supports Java language only, and further efforts are needed to apply our approach to other programming languages.' The paper also notes it focuses on scenarios where ground-truth patches are inaccessible."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The replication package at GitHub includes code and datasets. The underlying datasets (Defects4J, Bugs.jar, Bears) are publicly available. The Multi-Benchmarks dataset composition is documented in Table 3."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 5.2 describes all five datasets in detail: their sources, construction processes, sizes, composition (correct vs. overfitting patches), and the benchmarks they are based on."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. All data comes from standard software engineering benchmarks (Defects4J, Bugs.jar, Bears) and prior APCA datasets."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The Multi-Benchmarks dataset pipeline is documented in Section 5.2 with source merging and deduplication steps, with counts at each stage (Table 3 shows 8,752 total → 7,794 after deduplication). APSG construction pipeline is detailed in Section 3."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The Acknowledgments section lists four funding sources: NSFC Grant No. 62102233, Shandong Province Overseas Outstanding Youth Fund (Grant No. 2022HWYQ-043), Joint Key Funds of NSFC (Grant No. U24A20244), and Qilu Young Scholar Program of Shandong University."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "All authors are affiliated with Shandong University, China, as stated in the author information. They are not evaluating a product of their own company."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Funders are government agencies (NSFC, Shandong Province) and a university program with no commercial stake in the outcome of APCA research."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is included in the paper. Absence of a declaration is not the same as absence of conflict."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "The training data cutoff dates for StarCoder, CodeLlama, and Llama3 are not stated anywhere in the paper, despite using these pre-trained models on benchmarks that predate them."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "Section 7.3 (internal validity) discusses: 'the LLM during the pre-training process may possibly have encountered the content of the used datasets.' They provide evidence against significant overlap by showing poor base LLM performance in the ablation study."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": true,
    245         "justification": "Section 7.3 acknowledges the contamination concern and addresses it by showing that base LLMs without fine-tuning perform very poorly (17.6%–30.8% accuracy), suggesting no significant contamination benefit. They also note this issue equally affects the baseline LLM4PatchCorrect."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. The study evaluates automated methods on benchmark datasets."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants. The study uses publicly available software patches and benchmarks."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No inference cost, latency, or time per patch is reported. The paper does not quantify the practical cost of running the method."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Hardware is mentioned (two RTX A6000 GPUs, Ubuntu 22.04.5) and Graph-LoRA adds 0.6% parameters to a 7B model, but total GPU hours, training time, and total compute budget are not stated."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results from 10-fold CV are averaged but per-fold or per-seed variance is not reported."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "Section 4.5 states: 'we perform 10-fold cross-validation and take the average of 10 rounds of each training and testing process as the final performance.'"
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "Final hyperparameters are reported (Section 5.5) but no search budget, search method, or number of configurations tried is described."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Hyperparameter values (learning rate, rank dimension, sequence length) are stated without justification for how they were selected. No validation-based selection procedure is described."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The paper compares against 12+ baselines across 5 datasets (dozens of comparisons) without any correction for multiple comparisons."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors reproduce several baselines (BERT-LR, BATS, PANTHER, CACHE, APPT, LLM4PatchCorrect variants) without acknowledging the self-comparison bias documented by Lucic et al. (2018). They note reusing some results from prior works but do not discuss bias for the methods they reproduced themselves."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper mentions Graph-LoRA adds 0.6% parameters but does not compare training or inference compute budgets across methods. LLM4PatchCorrect vs. Graph-LoRA compute costs are not discussed."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "No discussion of whether the APCA datasets (binary correct/overfitting labels) actually measure real-world patch quality. The validity of using manually labeled binary labels as ground truth is not questioned."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding is involved. The approach is a direct LLM fine-tuning pipeline without agentic scaffolding."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "The benchmarks (Defects4J from 2014, patches from 2020–2023) were publicly available before the LLMs' training. The paper raises this as a concern in Section 7.3 but does not perform temporal analysis of when benchmark data appeared vs. model training periods."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the evaluation setup leaks answer information through the input features (e.g., whether context code or APSG attributes contain signals not available in realistic deployment scenarios)."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "In the 10-fold CV setting, patches from the same project or bug can appear in both train and test folds. This non-independence is not discussed, though cross-project evaluation (RQ3) partially addresses it."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No concrete leakage detection method is used. The only evidence against contamination is that base LLMs perform poorly without fine-tuning (Section 6.2 ablation), which is indirect and does not rule out partial memorization."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "Graph-LoRA improves accuracy by 3.1%–7.5% and F1 score by 3.0%–7.1% over the state-of-the-art LLM4PatchCorrect method across five APCA datasets.",
    370       "evidence": "Tables 4–8 (Section 6.1) show consistent improvements across Wang, Merge, Balance, Lin, and Multi-Benchmarks datasets using three LLMs (StarCoder, CodeLlama, Llama3).",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Both APSG graph structure and Graph-LoRA attention mechanism contribute significantly to model performance.",
    375       "evidence": "Ablation study (Tables 9–10, Section 6.2) shows progressive degradation when removing attention fusion (0.7%–1.5% accuracy drop), Graph-LoRA-Weak (1.5%–2.7% drop), APSG attributes (0.7%–1.1% drop), and APSG graph (1.4%–2.7% drop).",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Context nodes have the most obvious impact among APSG node types.",
    380       "evidence": "Tables 11–12 (Section 6.2) show that removing context nodes causes 2.0%–3.5% accuracy drop, larger than removing variable nodes (1.1%–1.7%) or control nodes (0.2%–0.6%).",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "The method achieves state-of-the-art performance in cross-project prediction scenarios.",
    385       "evidence": "Tables 13–17 (Section 6.3) show Graph-LoRA outperforms APPT by 8.2%–16.0% and LLM4PatchCorrect by 2.2%–6.0% in accuracy across all five datasets in leave-one-project-out evaluation.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Ground-truth patches improve model performance, especially when data is scarce.",
    390       "evidence": "Table 18 (Section 7.2) shows 1.5% accuracy improvement on the Balance dataset (542 patches) and 0.7% on the Lin dataset (1,183 patches) when ground-truth patches are provided as prompts.",
    391       "supported": "weak"
    392     },
    393     {
    394       "claim": "The three LLMs used likely have no data leakage issue on the studied datasets.",
    395       "evidence": "Section 6.2 ablation shows base LLMs without fine-tuning achieve very poor performance (17.6%–30.8% accuracy), suggesting they have not memorized APCA labels.",
    396       "supported": "weak"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "No statistical significance testing",
    402       "detail": "All claims of superiority (e.g., 'outperforms all static methods') are based on comparing raw percentage values across 12+ baselines and 5 datasets without any statistical significance tests. With 10-fold CV, paired tests would be straightforward but are not performed."
    403     },
    404     {
    405       "flag": "No variance or uncertainty reported",
    406       "detail": "Despite using 10-fold cross-validation, no standard deviation, confidence interval, or spread measure is reported for any result across any table. This makes it impossible to assess whether observed differences are within noise."
    407     },
    408     {
    409       "flag": "Exceptionally high results on Wang dataset",
    410       "detail": "Graph-LoRA-Llama3 achieves 98.3% F1 on the Wang dataset (Table 4), which is exceptionally high for a binary classification task with 902 patches. Without variance reporting, it's unclear if this is a stable result."
    411     },
    412     {
    413       "flag": "Self-comparison bias in baseline reproduction",
    414       "detail": "The authors reproduced several baselines (BERT-LR, BATS, PANTHER, CACHE, APPT, LLM4PatchCorrect variants) on datasets not covered by prior works. There is no discussion of whether their implementations faithfully reproduce baseline performance, nor acknowledgment of documented self-comparison bias."
    415     },
    416     {
    417       "flag": "Model versions unspecified",
    418       "detail": "The paper uses 'StarCoder', 'CodeLlama', and 'Llama3' without specifying exact versions or sizes (beyond a passing reference to '7B size'). This hinders reproduction and makes results dependent on which specific model checkpoint was used."
    419     }
    420   ],
    421   "cited_papers": [
    422     {
    423       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    424       "authors": ["Edward J Hu", "Yelong Shen", "Phillip Wallis", "Zeyuan Allen-Zhu", "Yuanzhi Li", "Shean Wang", "Lu Wang", "Weizhu Chen"],
    425       "year": 2021,
    426       "arxiv_id": "2106.09685",
    427       "relevance": "Foundational PEFT method for LLMs, which Graph-LoRA builds upon for fine-tuning in code intelligence tasks."
    428     },
    429     {
    430       "title": "Leveraging Large Language Model for Automatic Patch Correctness Assessment",
    431       "authors": ["Xin Zhou", "Bowen Xu", "Kisub Kim", "DongGyun Han", "Hung Huu Nguyen", "Thanh Le-Cong", "Junda He", "Bach Le", "David Lo"],
    432       "year": 2024,
    433       "relevance": "State-of-the-art LLM-based APCA method (LLM4PatchCorrect) that this paper directly improves upon."
    434     },
    435     {
    436       "title": "APPT: Boosting Automated Patch Correctness Prediction via Fine-Tuning Pre-Trained Models",
    437       "authors": ["Quanjun Zhang", "Chunrong Fang", "Weisong Sun", "Yan Liu", "Tieke He", "Xiaodong Hao", "Zhenyu Chen"],
    438       "year": 2024,
    439       "relevance": "Pre-trained model fine-tuning for patch correctness assessment, a key ML-based baseline."
    440     },
    441     {
    442       "title": "Context-Aware Code Change Embedding for Better Patch Correctness Assessment",
    443       "authors": ["Bo Lin", "Shangwen Wang", "Ming Wen", "Xiaoguang Mao"],
    444       "year": 2022,
    445       "relevance": "CACHE method that uses contextual and structural code change information for APCA, demonstrating the importance of context in patch assessment."
    446     },
    447     {
    448       "title": "Automated Program Repair in the Era of Large Pre-trained Language Models",
    449       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    450       "year": 2023,
    451       "relevance": "Introduces entropy-based patch correctness assessment in the LLM era, one of the attributes used in APSG."
    452     },
    453     {
    454       "title": "Sequencer: Sequence-to-Sequence Learning for End-to-End Program Repair",
    455       "authors": ["Zimin Chen", "Steve Kommrusch", "Michele Tufano", "Louis-Noël Pouchet", "Denys Poshyvanyk", "Martin Monperrus"],
    456       "year": 2019,
    457       "relevance": "Neural sequence-to-sequence approach to automated program repair, representing the learning-based APR paradigm."
    458     },
    459     {
    460       "title": "Code Llama: Open Foundation Models for Code",
    461       "authors": ["Baptiste Roziere", "Jonas Gehring", "Fabian Gloeckle"],
    462       "year": 2023,
    463       "arxiv_id": "2308.12950",
    464       "relevance": "Open-source code LLM used as one of three backbone models in the evaluation."
    465     },
    466     {
    467       "title": "StarCoder: May the Source Be with You!",
    468       "authors": ["Raymond Li", "Loubna Ben Allal", "Yangtian Zi"],
    469       "year": 2023,
    470       "arxiv_id": "2305.06161",
    471       "relevance": "Open-source code LLM used as one of three backbone models in the evaluation."
    472     },
    473     {
    474       "title": "GraphCodeBERT: Pre-training Code Representations with Data Flow",
    475       "authors": ["Daya Guo", "Shuo Ren", "Shuai Lu"],
    476       "year": 2020,
    477       "arxiv_id": "2009.08366",
    478       "relevance": "Pioneering work on incorporating data flow into code pre-training, foundational to APSG's use of data flow edges."
    479     },
    480     {
    481       "title": "Automated Classification of Overfitting Patches with Statically Extracted Code Features",
    482       "authors": ["He Ye", "Jian Gu", "Matias Martinez", "Thomas Durieux", "Martin Monperrus"],
    483       "year": 2021,
    484       "relevance": "ODS method extracting 202 code features from ASTs for patch classification, an important ML-based APCA baseline."
    485     },
    486     {
    487       "title": "Patch Correctness Assessment: A Survey",
    488       "authors": ["Zhiwei Fei", "Jidong Ge", "Chuanyi Li", "Tianqi Wang", "Yuning Li", "Haodong Zhang", "LiGuo Huang", "Bin Luo"],
    489       "year": 2024,
    490       "relevance": "Comprehensive survey of APCA methods providing taxonomy and categorization of the field this paper contributes to."
    491     },
    492     {
    493       "title": "InValidator: Automated Patch Correctness Assessment via Semantic and Syntactic Reasoning",
    494       "authors": ["Thanh Le-Cong", "Duc-Minh Luong", "Xuan Bach D Le", "David Lo", "Nhat-Hoa Tran", "Bui Quang-Huy", "Quyet-Thang Huynh"],
    495       "year": 2023,
    496       "relevance": "APCA method combining semantic and syntactic reasoning, representing an alternative approach to patch correctness assessment."
    497     }
    498   ],
    499   "engagement_factors": {
    500     "practical_relevance": {
    501       "score": 1,
    502       "justification": "Addresses a real APR pipeline problem but requires significant setup (GNN + LLM fine-tuning + Spoon-based APSG extraction) and only works for Java."
    503     },
    504     "surprise_contrarian": {
    505       "score": 0,
    506       "justification": "Confirms the expected finding that combining structured graph representations with LLMs improves code understanding tasks."
    507     },
    508     "fear_safety": {
    509       "score": 0,
    510       "justification": "No safety or security concerns raised; focused on improving automated patch correctness assessment."
    511     },
    512     "drama_conflict": {
    513       "score": 0,
    514       "justification": "No controversy; straightforward method comparison paper in the APCA subfield."
    515     },
    516     "demo_ability": {
    517       "score": 1,
    518       "justification": "GitHub replication package is available but requires significant infrastructure (dual GPUs, Java analysis toolchain) to run."
    519     },
    520     "brand_recognition": {
    521       "score": 0,
    522       "justification": "From Shandong University, not a well-known AI lab in the broader public consciousness."
    523     }
    524   }
    525 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs