scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31338B)
      1 {
      2   "paper": {
      3     "title": "Automatic Patch Correctness Assessment with Large Language Model",
      4     "authors": [
      5       "Xin Zhou",
      6       "Bowen Xu",
      7       "Kisub Kim",
      8       "DongGyun Han",
      9       "Hung Huu Nguyen",
     10       "Thanh Le-Cong",
     11       "Junda He",
     12       "Bach Le",
     13       "David Lo"
     14     ],
     15     "year": 2024,
     16     "venue": "IEEE Transactions on Software Engineering",
     17     "arxiv_id": "2303.00202",
     18     "doi": "10.1109/TSE.2024.3452252"
     19   },
     20   "scan_version": 3,
     21   "active_modules": ["experimental_rigor", "data_leakage"],
     22   "methodology_tags": ["benchmark-eval"],
     23   "key_findings": "LLM4PatchCorrect uses in-context learning with Starcoder-7B to assess patch correctness without needing labeled patches from the target APR tool, achieving 84.4% accuracy and 86.5% F1-score across 22 APR tools in cross-tool validation. The contrastive learning-based retrieval module for selecting semantically similar patches is the most impactful component, contributing 84.7% relative improvement in AUC. The approach significantly outperforms prior APCA methods (Cache, Quatrain, CodeBERT, ODS) on both Defects4J and Bears benchmarks, with all improvements statistically significant (Wilcoxon, p<0.05).",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper states 'We publicly share our implementation and dataset for future comparisons by the research community' in Section 6.5, but no repository URL or archive link is provided anywhere in the paper text."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The evaluation uses publicly available datasets: the merged dataset from Wang et al. [29] and Tian et al. [23] containing 1,179 patches from Defects4J [60], plus patches from the Bears benchmark [65]. These are standard public benchmarks."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper mentions using a 2080-Ti GPU (12GB), HuggingFace library, and 8-bit quantization, but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The method is described algorithmically but there are no runnable commands or reproduction guide."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "All results in Tables 2-9 report point estimates only (e.g., '84.4% accuracy'). No confidence intervals, error bars, or ± notation appears anywhere."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 5.1 states: 'we conduct the Wilcoxon signed-rank tests between LLM4PatchCorrect and all baselines to investigate whether the improvements are significant. The results show that LLM4PatchCorrect is statistically significantly better than all baselines (all p-values are less than 0.05).'"
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Relative improvements are reported with baseline context throughout, e.g., '20.9% (84.4−69.8/69.8)' improvement over Tian et al.'s work in accuracy. Both absolute values and relative improvement ratios are provided."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The dataset of 1,179 patches from 22 APR tools is used because it was available from prior work [23], [29]. No power analysis or justification for why this sample size is adequate for the claims made."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. All tables show point estimates that appear to come from single runs (the number of runs is never stated). The contrastive learning training involves randomness but no multi-run analysis is presented."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Six baselines are compared: Patch-Sim [25], CodeBERT [40], Tian et al. [23], ODS [27], Quatrain [34], and Cache [33]. These span both dynamic and static APCA approaches."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Cache (TOSEM 2022) and Quatrain (ASE 2022) represent the state-of-the-art at time of writing. CodeBERT is also a strong recent baseline. The baselines are competitive and appropriate."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Table 7 presents a systematic ablation study that adds bug information, test information, and retrieved patches to the LLM-only configuration, individually and in combination, showing the contribution of each to overall performance."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Three evaluation metrics are used: Accuracy (Table 2), F1-score (Table 3), and AUC (Table 4). Both average and weighted average variants are reported."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No human evaluation of the system's predictions is performed. The patch labels were created by developers in prior work, but the system's outputs are evaluated only by automated comparison against ground truth labels."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Cross-tool validation ensures strict separation: patches from the target APR tool form the test set, all other APR tools' patches form the training set. Hyper-parameter tuning uses a separate 5% split from the training data (Section 4.5)."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Tables 2-4 provide per-APR-tool breakdowns for all 22 APR tools, showing how performance varies across different target tools. Tables 8-9 show per-tool results for the Bears benchmark."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 6.1 provides a case study (Figure 10) showing how the model can make incorrect predictions without retrieved patches. The SOFix case with 1:10 class imbalance is discussed as a challenging scenario."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Table 2 shows several cases where LLM4PatchCorrect performs worse than baselines: CapGen (-2.4%), GenProg (-4.2%), Jaid (-1.9%), kPAR (-7.2%), TBar (-2.9%). These negative results are openly reported."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Abstract claims of '84.4% accuracy and 86.5% F1-score on average' match Table 2 and Table 3 averages. Claims of 10.2%-32.4% improvements over state-of-the-art are supported by the comparative results."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The ablation study (Table 7) makes causal claims about component contributions using controlled single-variable manipulation: each component is added individually and in combination, providing adequate evidence for causal attribution."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The title 'Automatic Patch Correctness Assessment with Large Language Model' implies general applicability, but all experiments use only Java patches from Defects4J and Bears benchmarks. The paper does not explicitly bound claims to Java or these specific benchmarks."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "Section 6.5 discusses threats to validity (selection bias, prompt design, model under-training) but these are generic methodological limitations, not substantive alternative explanations for why the approach succeeds. For example, they never consider whether Starcoder's pre-training on Defects4J-related code could explain the performance gains."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper measures patch correctness classification (correct vs. overfitting) and frames results as patch correctness assessment. The measurement matches the claim — no proxy gap exists."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Specific model names with sizes are provided: Starcoder-7B [38], CodeBERT (0.13B) [40], CodeLlama-7B [46], CodeGen2-3.7B [67], Starcoder-3B, Starcoder-1B, BLOOM-1.7B. Table 1 catalogs all models with exact sizes."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The prompt template is provided in Section 3.1: '{test-patch} Q: It was wrong or correct? A: It was'. Figure 7 shows a complete example of the concatenated input including all guiding information components. Textual descriptions for each component are shown in Figure 2."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Key hyperparameters are reported: k=10, β=0.9 (Section 4.5), contrastive learning training (lr=5e-5, batch size=64, epochs=3), CodeBERT fine-tuning (lr=1e-5, epochs=8), 8-bit quantization. The LLM is used for probability computation (argmax), not sampling."
    163       },
    164       "scaffolding_described": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "No agentic scaffolding is used. The approach is a single-pass inference pipeline: retrieve similar patches, concatenate with guiding information, query the LLM for next-token probabilities."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 4.1 documents duplicate removal (string matching + manual semantic equivalence check, finding 2 pairs). The tokenization process (BPE) is described in Section 3.1. The patch embedding and retrieval pipeline is detailed in Section 3.2."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 6.5 'Threats to Validity' provides substantive discussion organized into external validity, internal validity, and construct validity subsections."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section 6.5 discusses study-specific threats: selection bias from considering only certain LLMs, dataset selection bias from using particular benchmarks, the manually crafted prompt may not be optimal, and the model may be under-trained."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "The paper does not explicitly state what the results do NOT show. For example, it does not state that results are limited to Java patches, single-statement fixes, or the specific APR tools tested. The threats section discusses limitations but not explicit scope boundaries."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The underlying patch datasets are publicly available through Wang et al. [29], Tian et al. [23], the Defects4J benchmark [60], and the Bears benchmark [65]. The patches and their correctness labels can be independently obtained."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section 4.1 describes the dataset: 1,179 patches from Defects4J merged from two existing large-scale datasets [23], [29], generated by 22 different APR tools, with labels 'carefully labeled and checked' by developers. Deduplication process is also described."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": false,
    205         "answer": false,
    206         "justification": "No human participants. The data consists of patches from standard software engineering benchmarks (Defects4J, Bears)."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "The pipeline is documented: datasets merged from [23] and [29], deduplication via string matching and manual review (Section 4.1), cross-tool validation setup (Section 4.2), hyper-parameter tuning on 5% held-out split (Section 4.5), and identical-patch removal between train/test to avoid data leakage."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding sources, grants, or acknowledgments section appears in the paper text. Academic researchers at multiple institutions are almost certainly funded but no disclosure is made."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations are clearly listed: Singapore Management University, North Carolina State University, Royal Holloway University of London, and University of Melbourne. All are academic institutions with no apparent conflict with the evaluated tools."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "Since no funding is disclosed, it is impossible to assess whether funders are independent of the outcome. The authors evaluate open-source models not produced by their institutions, suggesting likely independence, but this cannot be verified."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests or financial interests statement appears in the paper."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No mention of Starcoder-7B's training data cutoff date. The paper cites the Starcoder paper [38] and notes it was trained on '1,000B tokens from 80+ programming languages' but does not state when the training data was collected."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "The paper discusses removing identical patches between the cross-tool train/test splits (Section 4.2), but never addresses whether Starcoder's pre-training data might contain Defects4J patches or their correctness labels from online sources."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "Defects4J was published in 2014 and its patches are widely discussed online. Starcoder was trained after 2014, so contamination is possible. The paper does not discuss this risk at all."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study. The evaluation is entirely automated using pre-labeled patch datasets."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants. The study evaluates automated patch correctness assessment using existing benchmarks."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants. All evaluation is on pre-labeled patch correctness datasets."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants. Data comes from standard software engineering benchmarks."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants or experimental conditions involving people."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants or evaluators involved in the experimental evaluation."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "Section 7 reports: 'LLM4PatchCorrect only costs 2.4 seconds for each patch' compared to Invalidator's five hours for dynamic features. The hardware requirement (2080-Ti GPU, 12GB) is also stated."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "Per-patch inference time (2.4s) and GPU model (2080-Ti) are stated, but total compute budget for all experiments is not quantified. Training time for the contrastive learning model and total experiment duration are not reported."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No mention of multiple random seeds. The contrastive learning training involves randomness (dropout, batch sampling) but no seed sensitivity analysis is reported."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The number of experimental runs is never stated. Results appear to be from single runs without explicit confirmation."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "Section 4.5 describes hyper-parameter tuning with specific ranges: β=[0.80, 0.82, 0.85, 0.87, 0.9, 0.92, 0.95] and k=[2, 4, 6, 8, 10, 12, 14, 15], totaling 56 configurations. Figure 8 shows the results."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": true,
    321         "justification": "Section 4.5 states hyper-parameter tuning was performed 'on a randomly split 5% of the labeled patch pools,' separate from the test data. The best configuration (k=10, β=0.9) was selected based on AUC on this held-out validation set."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Wilcoxon signed-rank tests are conducted between LLM4PatchCorrect and each of the six baselines, but no correction for multiple comparisons (Bonferroni, Holm, etc.) is mentioned."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper re-uses Cache's released implementation, which is good practice. However, for other baselines the paper does not discuss author-evaluation bias or the systematic tendency for authors' re-implementations of baselines to underperform."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "LLM4PatchCorrect uses Starcoder-7B (7B parameters) while the CodeBERT baseline uses only 0.13B parameters — a ~54x size difference. This compute disparity is not discussed or controlled for in comparisons."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "The paper does not discuss whether the Defects4J/Bears patch correctness benchmarks actually measure real-world patch assessment capability, or whether the binary correct/overfitting labels adequately capture patch quality."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "No agentic scaffolding is involved. The approach is a direct inference pipeline without tool use or multi-step agent workflows."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Not discussed. Defects4J patches and their correctness labels have been available online since well before Starcoder's training, creating temporal leakage risk. The paper does not address whether the model might have encountered these patches during pre-training."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "The paper removes identical patches between train and test sets in cross-tool validation but does not discuss whether the guiding information (bug descriptions, execution traces, test cases, coverage) could leak correctness information to the model."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "In cross-tool validation, training and test patches often target the same bugs from different APR tools. Patches for the same bug share structural similarities (same buggy code, same tests). This non-independence is not discussed."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": true,
    368         "justification": "Section 4.1 describes concrete deduplication: string matching for identical patches and manual examination for semantic equivalents, identifying 2 pairs of semantically equivalent patches which were removed. Section 4.2 also removes identical patches between train/test."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "LLM4PatchCorrect achieves 84.4% accuracy and 86.5% F1-score on average across 22 APR tools in cross-tool validation without requiring labeled patches from the target APR tool.",
    375       "evidence": "Tables 2-3 show per-tool and average results. Average accuracy is 84.4% and average F1-score is 86.5% across 22 APR tools in leave-one-out cross-tool validation.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "LLM4PatchCorrect significantly outperforms all state-of-the-art APCA approaches, with improvements of 10.2% to 32.4% in accuracy, 6.1% to 24.1% in F1, and 10.1% to 33.2% in AUC over different baselines.",
    380       "evidence": "Tables 2-4 show comparisons against 6 baselines. Wilcoxon signed-rank tests confirm all improvements are statistically significant (p<0.05). Section 5.1 reports the improvement percentages.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "The contrastive learning-based patch retrieval module is the most impactful component, contributing 84.7% relative improvement in AUC over using the LLM alone.",
    385       "evidence": "Table 7 ablation study shows retrieval module improves accuracy by 11.4%, F1 by 3.9%, and AUC by 84.7% relative to the LLM alone.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "LLM4PatchCorrect generalizes beyond Defects4J to the Bears benchmark, outperforming the best baseline by 27.0% in accuracy and 25.8% in F1.",
    390       "evidence": "Tables 8-9 show results on Bears benchmark across 4 APR tools (Arja, GenProg, Kali, RSRepair). Average accuracy 92.1% vs Quatrain's 72.5%.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Larger LLM model sizes generally lead to better performance in the LLM4PatchCorrect framework.",
    395       "evidence": "Table 10 compares 6 LLMs from 1B to 7B parameters. 7B models (Starcoder-7B, CodeLlama-7B) generally outperform smaller variants, though Starcoder-1B shows competitive F1 (85.9%).",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "LLM4PatchCorrect can filter out 90.4% of overfitting patches while increasing the correct patch ratio from 21.7% to 63.7%.",
    400       "evidence": "Figure 9(b) confusion matrix shows 584/646 overfitting patches correctly identified, with 109 correct patches among remaining 171 predicted-correct patches.",
    401       "supported": "strong"
    402     }
    403   ],
    404   "red_flags": [
    405     {
    406       "flag": "No error bars or variance across runs",
    407       "detail": "All experimental results are reported as single point estimates without confidence intervals, standard deviations, or any uncertainty quantification. The contrastive learning training involves randomness, yet no multi-seed analysis is presented."
    408     },
    409     {
    410       "flag": "Pre-training contamination not addressed",
    411       "detail": "Starcoder-7B was pre-trained on massive code corpora. Defects4J patches and their correctness labels are widely available online and likely in the pre-training data. The paper never discusses whether the model might have memorized patch correctness information, which could inflate performance beyond what the in-context learning framework provides."
    412     },
    413     {
    414       "flag": "Unfair compute comparison with baselines",
    415       "detail": "LLM4PatchCorrect uses Starcoder-7B (7B parameters) plus a CodeBERT-based retrieval model, while the CodeBERT baseline uses only 0.13B parameters. This ~54x model size difference is not discussed or controlled for. The improvement might partly reflect the advantage of larger models rather than the proposed approach design."
    416     },
    417     {
    418       "flag": "Shared bugs between cross-tool train and test sets",
    419       "detail": "In cross-tool validation, patches in the training set and test set often target the same Defects4J bugs. A model could learn bug-specific patterns from training patches and apply them to test patches for the same bug, inflating performance beyond genuine cross-tool transfer."
    420     },
    421     {
    422       "flag": "No released code despite claiming public availability",
    423       "detail": "Section 6.5 states 'We publicly share our implementation and dataset for future comparisons' but no URL, repository link, or archive is provided in the paper."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "Starcoder: may the source be with you!",
    429       "authors": ["R. Li", "L. B. Allal", "Y. Zi"],
    430       "year": 2023,
    431       "arxiv_id": "2305.06161",
    432       "relevance": "The primary LLM used in LLM4PatchCorrect; a major open-source code LLM trained on 1000B tokens from 80+ programming languages."
    433     },
    434     {
    435       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    436       "authors": ["Z. Feng", "D. Guo", "D. Tang"],
    437       "year": 2020,
    438       "arxiv_id": "2002.08155",
    439       "relevance": "Used as the backbone for the contrastive learning patch embedding model and as a baseline for fine-tuning comparison."
    440     },
    441     {
    442       "title": "Evaluating large language models trained on code",
    443       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    444       "year": 2021,
    445       "arxiv_id": "2107.03374",
    446       "relevance": "The Codex paper — foundational work on LLMs for code that established evaluation paradigms used in subsequent code LLM research."
    447     },
    448     {
    449       "title": "Code llama: Open foundation models for code",
    450       "authors": ["B. Roziere", "J. Gehring", "F. Gloeckle"],
    451       "year": 2023,
    452       "arxiv_id": "2308.12950",
    453       "relevance": "Alternative code LLM evaluated in the model comparison study (Table 10), showing comparable performance to Starcoder-7B."
    454     },
    455     {
    456       "title": "Language models are few-shot learners",
    457       "authors": ["T. B. Brown", "B. Mann", "N. Ryder"],
    458       "year": 2020,
    459       "arxiv_id": "2005.14165",
    460       "relevance": "GPT-3 paper that introduced in-context learning, the core paradigm used by LLM4PatchCorrect for patch correctness assessment."
    461     },
    462     {
    463       "title": "LoRA: Low-rank adaptation of large language models",
    464       "authors": ["E. J. Hu", "Y. Shen", "P. Wallis"],
    465       "year": 2022,
    466       "arxiv_id": "2106.09685",
    467       "relevance": "Parameter-efficient fine-tuning method cited as context for why in-context learning (no fine-tuning) was chosen over full fine-tuning."
    468     },
    469     {
    470       "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning",
    471       "authors": ["C. S. Xia", "L. Zhang"],
    472       "year": 2022,
    473       "relevance": "AlphaRepair — a related approach using pre-trained models for program repair without fine-tuning, contrasted with the authors' focus on patch correctness assessment."
    474     },
    475     {
    476       "title": "CodeT5: Identifier-aware unified pre-trained encoder-decoder models for code understanding and generation",
    477       "authors": ["Y. Wang", "W. Wang", "S. R. Joty"],
    478       "year": 2021,
    479       "arxiv_id": "2109.00859",
    480       "relevance": "A pre-trained encoder-decoder model for code, part of the landscape of code LLMs discussed in the background."
    481     },
    482     {
    483       "title": "Context-aware code change embedding for better patch correctness assessment",
    484       "authors": ["B. Lin", "S. Wang", "M. Wen", "X. Mao"],
    485       "year": 2022,
    486       "relevance": "Cache — the prior state-of-the-art in automatic patch correctness assessment, used as the primary baseline."
    487     },
    488     {
    489       "title": "SimCSE: Simple contrastive learning of sentence embeddings",
    490       "authors": ["T. Gao", "X. Yao", "D. Chen"],
    491       "year": 2021,
    492       "arxiv_id": "2104.08821",
    493       "relevance": "The contrastive learning framework adapted for patch embedding in LLM4PatchCorrect's retrieval module."
    494     },
    495     {
    496       "title": "Invalidator: Automated patch correctness assessment via semantic and syntactic reasoning",
    497       "authors": ["T. Le-Cong", "D.-M. Luong", "X. B. D. Le"],
    498       "year": 2023,
    499       "arxiv_id": "2301.01113",
    500       "relevance": "A hybrid dynamic+static APCA approach combining program invariants and code embeddings, compared against for efficiency (5 hours vs 2.4 seconds)."
    501     }
    502   ],
    503   "engagement_factors": {
    504     "practical_relevance": {
    505       "score": 2,
    506       "justification": "Practitioners using APR tools could apply this to filter overfitting patches, reducing manual review effort."
    507     },
    508     "surprise_contrarian": {
    509       "score": 1,
    510       "justification": "Using LLMs for patch assessment is a natural extension of LLM-for-code trends; the cross-tool transfer finding is mildly surprising but not contrarian."
    511     },
    512     "fear_safety": {
    513       "score": 0,
    514       "justification": "No AI safety or security concerns raised; the paper is about improving software quality tooling."
    515     },
    516     "drama_conflict": {
    517       "score": 0,
    518       "justification": "No controversy or conflict; a straightforward technical contribution with incremental improvements over baselines."
    519     },
    520     "demo_ability": {
    521       "score": 0,
    522       "justification": "No released code, demo, or installable tool despite claiming public availability."
    523     },
    524     "brand_recognition": {
    525       "score": 1,
    526       "justification": "Uses StarCoder (BigCode project) which has moderate recognition; authors from Singapore Management University have some SE research visibility."
    527     }
    528   }
    529 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs