ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (31809B)


      1 {
      2   "paper": {
      3     "title": "Invalidator: Automated Patch Correctness Assessment via Semantic and Syntactic Reasoning",
      4     "authors": [
      5       "Thanh Le-Cong",
      6       "Duc-Minh Luong",
      7       "Xuan Bach D. Le",
      8       "David Lo",
      9       "Nhat-Hoa Tran",
     10       "Bui Quang-Huy",
     11       "Quyet-Thang Huynh"
     12     ],
     13     "year": 2023,
     14     "venue": "IEEE Transactions on Software Engineering",
     15     "arxiv_id": "2301.01113",
     16     "doi": "10.1109/TSE.2023.3255177"
     17   },
     18   "scan_version": 3,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval"],
     21   "key_findings": "INVALIDATOR combines semantic reasoning (program invariants via Daikon) and syntactic reasoning (CodeBERT embeddings + Logistic Regression) to detect overfitting patches in automated program repair, achieving 81% accuracy and 0.87 F1-score on 139 patches from Defects4J. It outperforms seven baselines by 14-19% on accuracy/F1, correctly classifying 79% of overfitting patches with 97% precision. An ablation study shows both components contribute: the semantic classifier detects 51% of overfitting patches threshold-free, while the syntactic classifier provides broader coverage. The three best techniques (INVALIDATOR, ODS, RGT) are complementary, together covering 107 of 109 overfitting patches.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Section 9 states: 'INVALIDATOR is publicly available at https://github.com/thanhlecongg/Invalidator' and 'All materials including implementation, datasets, and experimental results are also published via https://doi.org/10.5281/zenodo.7699142.'"
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The dataset is from publicly available prior work (Defects4J, Xiong et al., Wang et al.), and the authors release all materials including datasets via the Zenodo archive (doi:10.5281/zenodo.7699142)."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper mentions 'Python programming language' and 'HuggingFace's Transformers framework' (Section 5.1.3) but provides no version numbers, requirements.txt, Dockerfile, or detailed dependency specifications."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "The paper provides a GitHub repository URL and Zenodo archive with implementation, datasets, and experimental results. The methodology is described in detail sufficient for replication, and the public repository is expected to contain runnable code."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "All results in Tables 3-7 are reported as point estimates (e.g., Accuracy 0.81, F1-score 0.87) with no confidence intervals, error bars, or uncertainty quantification."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper claims INVALIDATOR 'outperforms' and 'substantially outperforms' baselines based solely on comparing raw metric values (e.g., 0.81 vs. 0.68 accuracy). No statistical significance tests (t-test, bootstrap, etc.) are reported anywhere."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "The paper reports both absolute scores and relative improvements with baseline context, e.g., '14% (0.81 vs. 0.68)' for Accuracy and '19% (0.87 vs. 0.76)' for F1-score (Section 5.3.1), providing sufficient context for effect size interpretation."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The evaluation set contains only 139 patches (109 overfitting, 30 correct). No power analysis or justification for why this sample size is sufficient for the claims made. The class imbalance is also not formally addressed."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No variance, standard deviation, or spread measures are reported. The syntactic classifier involves ML training (Logistic Regression) that could vary across random seeds, but only single-run results are presented."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Seven baselines are compared: RGT, ODS, BERT+LR, PATCHSIM, DIFFTGEN, ANTI-PATTERNS, and GT-INVARIANT (Section 5.2, RQ1). Results for all baselines are presented in Table 3."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Baselines include ODS (2021), RGT (2021), BERT+LR (2020), and PatchZero (2023 in related work). These were state-of-the-art at time of writing. Older baselines (ANTI-PATTERNS 2016, DIFFTGEN 2017) are also included for completeness."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "RQ4 (Section 5.3.4) conducts a thorough ablation: removing syntactic classifier (Table 6), removing semantic classifier (Table 6), comparing invariant granularities (Table 7), and evaluating individual overfitting rules (Figure 9)."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Five evaluation metrics are used: Recall, Precision, Accuracy, F1-score, and AUC (Section 5.1.2). Results are reported across all metrics in Tables 3-5."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "Evaluation is entirely automated against pre-labeled ground truth patches. No human evaluation of INVALIDATOR's outputs is performed. The ground truth labels were created by prior work, not as part of this study's evaluation."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The evaluation uses Xiong et al.'s 139 patches as a held-out test set, while training/validation uses Wang et al.'s data + Defects4J patches. Duplicates were removed: 'we removed a patch if it is syntactically equivalent to a patch in the evaluation set' (Section 5.1.1). Threshold tuned on validation set, not evaluation set."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Results are broken down by technique comparison (Table 3), by classifier component (Table 6), by invariant granularity (Table 7), and by overfitting rule (Figure 9). A Venn diagram shows complementarity of top techniques (Figure 5)."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper discusses 3 false positives (correct patches misclassified as overfitting) and analyzes case studies of unique detections (Section 5.3.1, Math-58 example in Figure 6). Time efficiency limitations of invariant inference are also discussed (Section 6.1)."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper reports that INVALIDATOR slightly underperforms BERT+LR and PATCHSIM on Precision (Section 5.3.1), that the semantic classifier alone has limited recall (51%), and that invariant inference is 'time-consuming' with a 5-hour limit per patch (Section 6.1)."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Abstract claims of 79% overfitting detection, 23% improvement over best baseline recall, and 14%/19% improvements in Accuracy/F1 are all directly supported by Table 3 results (86/109 TP, 0.81 vs 0.68 accuracy, 0.87 vs 0.76 F1)."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Causal claims are made via ablation studies (RQ4): 'removing INVALIDATORsyn leads to a decrease of 26% and 23%' and 'without INVALIDATORsem, INVALIDATOR's performance also drops by 11% and 8%.' The ablation design uses controlled single-variable manipulation (removing one component at a time), which is adequate."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The title 'Automated Patch Correctness Assessment' and abstract claim broad applicability, but results are only on Java programs from 4 Defects4J projects (Chart, Time, Lang, Math). While the threats-to-validity section notes the Defects4J limitation, the title and abstract do not bound claims to Java or this specific benchmark."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Section 6.3 discusses specific alternative explanations: ground truth labels may have 'subjective bias' from human annotators (construct validity), the 21 APR techniques may not represent all approaches (external validity), and baseline threshold tuning on evaluation sets may inflate their precision (Section 5.3.1)."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper directly measures patch correctness classification against human-labeled ground truth. The measurement (TP/FP/TN/FN against labeled patches) matches the claim (automated patch correctness assessment). No proxy gap exists."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "CodeBERT is specified by name with reference to Feng et al. (2020) and the official GitHub repository. CodeBERT has a single public release, making the name sufficiently specific. Daikon is also identified as the invariant inference tool with documentation reference."
    151       },
    152       "prompts_provided": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "The paper does not use prompting. CodeBERT is used as a feature extractor (embedding generation), not as a prompted generative model. Daikon is a dynamic analysis tool, not prompt-based."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "The classification threshold T=0.975 is reported (Section 5.1.3), with extensive sensitivity analysis across the full range (0, 1) in RQ3. The embedding dimension k=768 and the 5-hour invariant inference time limit are stated. The approach uses CodeBERT for feature extraction (no fine-tuning) and standard Logistic Regression."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No agentic scaffolding is used. INVALIDATOR is a two-phase classification pipeline (semantic invariant checking followed by syntactic ML classification), not an agentic system."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 5.1.1 documents the full pipeline: 220 patches from Xiong et al. + 902 from Wang et al. → filtered to 4 Defects4J projects (139 + 666) → added 223 developer patches → removed duplicates via syntactic equivalence → 746 train/val + 139 eval → 90/10 train/val split (671/75)."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 6.3 'Threats to validity' contains three substantive subsections: external validity, internal validity, and construct validity, each discussing specific concerns."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Specific threats include: '885 patches generated from 21 popular APR techniques. This may not represent all APR techniques,' 'patches in our dataset are only generated for the Defects4J dataset,' and 'correctness of the patches may be subject to subjective bias because they were manually labeled by human annotators' (Section 6.3)."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "The paper explicitly states what was not tested: only Defects4J (not Bears or Bugs.jar), only Java programs, only 4 projects. Section 6.2 acknowledges 'the reliance on ground truth patches limits our applications on pure APR problem settings.' QuixBugs was excluded with explanation."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "All materials including datasets and experimental results are published via Zenodo (doi:10.5281/zenodo.7699142). The underlying patch datasets from Xiong et al. and Wang et al. are also publicly available."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 5.1.1 describes data collection: 220 patches from Xiong et al. [28] and 902 from Wang et al. [50], filtered to 4 Defects4J projects, supplemented with 223 developer-written patches from Defects4J. Inclusion/exclusion criteria and deduplication are described."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants. The data sources are standard published benchmark datasets (Defects4J patches from prior work)."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Section 5.1.1 traces the full pipeline with counts at each stage: 220+902 initial → 139+666 after project filtering → +223 developer patches = 1028 → remove duplicates = 746 for train/val, 139 for eval → 671 train + 75 validation. Table 1 and Table 2 summarize the splits."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding source, acknowledgments section, or grant information is mentioned anywhere in the paper."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are clearly listed: University of Melbourne, Hanoi University of Science and Technology, and Singapore Management University. Email addresses are provided. The authors are not affiliated with any APR tool vendor being evaluated."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "Cannot assess funder independence because no funding is disclosed. The authors are academic researchers not evaluating their own commercial product, but without funding disclosure this cannot be confirmed."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests statement or financial disclosure is present in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "CodeBERT is used as a feature extractor but its training data cutoff date is not stated. CodeBERT was pretrained on GitHub code which could include Defects4J project source code."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": true,
    243         "justification": "The paper explicitly addresses train/test overlap for their own classifier: 'we removed a patch if it is syntactically equivalent to a patch in the evaluation set' (Section 5.1.1). However, potential overlap between CodeBERT's pretraining data and Defects4J code is not discussed."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "No discussion of whether CodeBERT's pretraining corpus (GitHub code) contains Defects4J source code, which could influence the learned code representations used for feature extraction."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants. This is a benchmark evaluation of automated patch correctness assessment techniques."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants. The study evaluates machine-generated patches against existing labeled datasets."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": true,
    292         "justification": "Section 6.1 reports: 'assessing the correctness of 139 patches in our evaluation dataset... INVALIDATOR took 15.5 hours (i.e., about 7 minutes for each patch).' A 5-hour time limit for invariant inference per patch is also stated."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "No hardware specifications (GPU/CPU type), total GPU hours, or training time for the Logistic Regression model are reported. Only inference wall-clock time is mentioned."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No mention of random seeds or sensitivity analysis across seeds. The Logistic Regression training involves randomness but only single-run results are reported."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The number of experimental runs is never stated. Results appear to be from a single run with no indication of repetition."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The threshold T=0.975 was tuned on the validation set, but no details on the search procedure (how many values tried, search method) are provided. Sensitivity analysis in RQ3 explores the full range but this is post-hoc analysis, not the search budget."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": true,
    319         "justification": "The classification threshold T=0.975 was selected because it 'yielded the highest precision on the validation dataset' (Section 5.1.3). Importantly, tuning was done on an independent validation set rather than the evaluation set, avoiding overfitting. RQ3 provides extensive threshold sensitivity analysis."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The paper compares against 7 baselines and makes multiple performance claims, but no statistical tests are performed at all, let alone corrections for multiple comparisons."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The authors implemented INVALIDATOR and compare it against baselines (some re-implemented, some from prior work results). The bias of evaluating their own system is never acknowledged. For DIFFTGEN and GT-INVARIANT, they ran external implementations, but for their own system no independent evaluation is performed."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "INVALIDATOR requires up to 5 hours of invariant inference per patch, which is substantially more compute than most baselines. This compute difference is not discussed in terms of performance-per-compute comparisons with baselines."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "The paper does not discuss whether the Defects4J patch correctness labels actually measure real-world patch assessment ability. The construct validity of using manually labeled patches from specific Java projects as a proxy for general patch correctness assessment is not questioned."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": false,
    343         "answer": false,
    344         "justification": "No scaffolding is involved. INVALIDATOR is a classification pipeline, not an agentic system with scaffolding."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether CodeBERT's pretraining data (collected before 2020) includes Defects4J source code or patches created before that date. The temporal relationship between training data and evaluation data is not analyzed."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether the feature extraction pipeline (CodeBERT embeddings of buggy, patched, and correct code) could leak information about patch correctness through unintended channels."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": true,
    361         "justification": "Section 5.1.1 explicitly addresses non-independence: 'there may be duplication between Wang et al.'s dataset, Defects4J's patches, and Xiong et al.'s dataset. To avoid data leakage, we removed the duplicated patches from the training and validation set.'"
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": true,
    366         "justification": "A concrete decontamination method is applied: 'we removed a patch if it is syntactically equivalent to a patch in the evaluation set' (Section 5.1.1), reducing the combined training/validation pool from 889 to 746 patches."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "INVALIDATOR correctly classified 79% of overfitting patches (86/109) with 97% precision on the evaluation dataset.",
    373       "evidence": "Table 3 shows 86 TP, 23 FN, 3 FP, 27 TN, yielding Recall=0.79, Precision=0.97, Accuracy=0.81, F1=0.87 on 139 patches (Section 5.3.1).",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "INVALIDATOR outperforms the best baselines (ODS and RGT) by 14% in Accuracy and 19% in F1-score.",
    378       "evidence": "Table 3: INVALIDATOR achieves 0.81 Accuracy and 0.87 F1 vs. ODS/RGT at 0.68 Accuracy and 0.76 F1 (Section 5.3.1).",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "The semantic-based classifier alone detects 51% of overfitting patches with 97% precision.",
    383       "evidence": "Table 6 ablation: INVALIDATORsem alone achieves 56 TP out of 109 overfitting patches with 2 FP, giving Recall=0.51, Precision=0.97 (Section 5.3.4, RQ4.1).",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Syntactic reasoning boosts Accuracy by 35% and F1 by 30% over the semantic classifier alone.",
    388       "evidence": "Table 6: INVALIDATOR (0.81 Accuracy, 0.87 F1) vs. semantic-only (0.60 Accuracy, 0.67 F1). Section 5.3.4, RQ4.1.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "INVALIDATOR is the most effective and stable technique among threshold-dependent APAC approaches.",
    393       "evidence": "Figure 8 shows INVALIDATOR maintains F1 above 0.8 across thresholds 0.1-0.9, while ODS and BERT+LR degrade significantly above 0.4 (Section 5.3.3, RQ3.2).",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "Using invariants from all executed methods improves performance over buggy-methods-only invariants by 28% Accuracy and 31% F1.",
    398       "evidence": "Table 7: executed methods yield 0.60 Accuracy and 0.67 F1 vs. buggy methods at 0.47 Accuracy and 0.51 F1 (Section 5.3.4, RQ4.2).",
    399       "supported": "strong"
    400     },
    401     {
    402       "claim": "INVALIDATOR, ODS, and RGT are complementary — together covering 107/109 overfitting patches.",
    403       "evidence": "Figure 5 Venn diagram shows their union covers 107/109 overfitting patches, with each detecting unique patches (Section 5.3.1).",
    404       "supported": "strong"
    405     }
    406   ],
    407   "red_flags": [
    408     {
    409       "flag": "No statistical significance tests",
    410       "detail": "All performance comparisons are based on raw metric differences without any statistical testing (no p-values, confidence intervals, or bootstrap tests). Claims of 'outperforming' baselines rest entirely on comparing point estimates from a single run on 139 patches."
    411     },
    412     {
    413       "flag": "Small evaluation set",
    414       "detail": "The evaluation set contains only 139 patches (109 overfitting, 30 correct). With such a small sample, especially only 30 correct patches, the precision and accuracy estimates have high uncertainty that is never quantified."
    415     },
    416     {
    417       "flag": "Requires ground truth patches",
    418       "detail": "INVALIDATOR requires developer-written correct patches as input, which significantly limits practical applicability. This is acknowledged in Section 6.2 but the abstract and title do not make this fundamental limitation prominent."
    419     },
    420     {
    421       "flag": "No random seed analysis",
    422       "detail": "The ML-based syntactic classifier (Logistic Regression on CodeBERT features) involves training randomness, but results are from a single run with no variance across seeds reported."
    423     },
    424     {
    425       "flag": "Potential baseline implementation advantage",
    426       "detail": "Some baseline results are collected from prior work while INVALIDATOR is implemented and evaluated by its own authors. The bias of self-evaluation (Lucic et al. 2018) is not acknowledged."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "Genprog: A generic method for automatic software repair",
    432       "authors": ["C. Le Goues", "T. Nguyen", "S. Forrest", "W. Weimer"],
    433       "year": 2011,
    434       "relevance": "Foundational automated program repair technique using genetic programming, shown to produce 98% overfitting patches."
    435     },
    436     {
    437       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    438       "authors": ["Z. Feng", "D. Guo", "D. Tang", "N. Duan", "X. Feng", "M. Gong", "L. Shou", "B. Qin", "T. Liu", "D. Jiang", "M. Zhou"],
    439       "year": 2020,
    440       "relevance": "Pre-trained code representation model used as the feature extractor in INVALIDATOR's syntactic classifier."
    441     },
    442     {
    443       "title": "The daikon system for dynamic detection of likely invariants",
    444       "authors": ["M. D. Ernst", "J. H. Perkins", "P. J. Guo", "S. McCamant", "C. Pacheco", "M. S. Tschantz", "C. Xiao"],
    445       "year": 2007,
    446       "relevance": "Dynamic invariant inference tool central to INVALIDATOR's semantic-based patch classification approach."
    447     },
    448     {
    449       "title": "Sequencer: Sequence-to-sequence learning for end-to-end program repair",
    450       "authors": ["Z. Chen", "S. J. Kommrusch", "M. Tufano", "L.-N. Pouchet", "D. Poshyvanyk", "M. Monperrus"],
    451       "year": 2019,
    452       "relevance": "Deep learning approach to automated program repair using sequence-to-sequence models."
    453     },
    454     {
    455       "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning",
    456       "authors": ["C. S. Xia", "L. Zhang"],
    457       "year": 2022,
    458       "relevance": "Zero-shot LLM-based approach to automated program repair, relevant to understanding LLM capabilities in code generation."
    459     },
    460     {
    461       "title": "Evaluating representation learning of code changes for predicting patch correctness in program repair",
    462       "authors": ["H. Tian", "K. Liu", "A. K. Kaboré", "A. Koyuncu", "L. Li", "J. Klein", "T. F. Bissyandé"],
    463       "year": 2020,
    464       "relevance": "BERT+LR baseline technique using pre-trained language model representations for patch correctness prediction."
    465     },
    466     {
    467       "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs",
    468       "authors": ["R. Just", "D. Jalali", "M. D. Ernst"],
    469       "year": 2014,
    470       "relevance": "Standard benchmark dataset for automated program repair evaluation, used as the basis for all experiments."
    471     },
    472     {
    473       "title": "SapFix: Automated end-to-end repair at scale",
    474       "authors": ["A. Marginean", "J. Bader", "S. Chandra", "M. Harman", "Y. Jia", "K. Mao", "A. Mols", "A. Scott"],
    475       "year": 2019,
    476       "relevance": "Facebook's industrial-scale automated bug-fixing system, demonstrating practical APR deployment."
    477     },
    478     {
    479       "title": "PatchZero: Zero-shot automatic patch correctness assessment",
    480       "authors": ["X. Zhou", "B. Xu", "K. Kim", "D. Han", "T. Le-Cong", "J. He", "B. Le", "D. Lo"],
    481       "year": 2023,
    482       "arxiv_id": "2303.00202",
    483       "relevance": "Zero-shot approach to patch correctness assessment using LLMs, extending the APCA paradigm without labeled data."
    484     },
    485     {
    486       "title": "Neural program repair with execution-based backpropagation",
    487       "authors": ["H. Ye", "M. Martinez", "M. Monperrus"],
    488       "year": 2022,
    489       "relevance": "RewardRepair technique integrating execution feedback into neural program repair training, relevant to learning-based APR."
    490     },
    491     {
    492       "title": "Automated classification of overfitting patches with statically extracted code features",
    493       "authors": ["H. Ye", "J. Gu", "M. Martinez", "T. Durieux", "M. Monperrus"],
    494       "year": 2021,
    495       "relevance": "ODS baseline using 4199 hand-crafted code features for overfitting patch classification."
    496     },
    497     {
    498       "title": "Automated patch assessment for program repair at scale",
    499       "authors": ["H. Ye", "M. Martinez", "M. Monperrus"],
    500       "year": 2021,
    501       "relevance": "RGT baseline using random testing with ground truth for automated patch correctness assessment."
    502     }
    503   ],
    504   "engagement_factors": {
    505     "practical_relevance": {
    506       "score": 2,
    507       "justification": "Provides a usable tool for APR researchers to filter overfitting patches, but requires ground-truth patches, which limits practical deployment."
    508     },
    509     "surprise_contrarian": {
    510       "score": 1,
    511       "justification": "Combining semantic and syntactic reasoning is a novel approach, but the finding that invariants detect overfitting is not surprising given prior work by Yang and Yang (2020)."
    512     },
    513     "fear_safety": {
    514       "score": 0,
    515       "justification": "No AI safety or security concerns; focuses on software engineering quality assurance."
    516     },
    517     "drama_conflict": {
    518       "score": 0,
    519       "justification": "No controversy or dramatic claims; straightforward technical contribution."
    520     },
    521     "demo_ability": {
    522       "score": 2,
    523       "justification": "Code and data publicly released on GitHub and Zenodo, reproducible by researchers with Java/Defects4J infrastructure."
    524     },
    525     "brand_recognition": {
    526       "score": 1,
    527       "justification": "Published in IEEE TSE (top SE journal) with David Lo (well-known SE researcher) as co-author, but not from a major AI lab."
    528     }
    529   }
    530 }

Impressum · Datenschutz