scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (23978B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Invalidator: Automated Patch Correctness Assessment via Semantic and Syntactic Reasoning",
      6     "authors": [
      7       "Thanh Le-Cong",
      8       "Duc-Minh Luong",
      9       "Xuan Bach D. Le",
     10       "David Lo",
     11       "Nhat-Hoa Tran",
     12       "Bui Quang-Huy",
     13       "Quyet-Thang Huynh"
     14     ],
     15     "year": 2023,
     16     "venue": "IEEE Transactions on Software Engineering",
     17     "arxiv_id": "2301.01113",
     18     "doi": "10.1109/TSE.2023.3255177"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "All numerical claims in the abstract (79% recall, 23% improvement over best baseline, 14% and 19% gains in Accuracy and F1-score) are directly supported by Table 3's results.",
     26         "source": "haiku"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The claim that combining semantic and syntactic reasoning improves performance is supported by the ablation study in RQ4.1 (Table 6), which isolates each component's contribution.",
     32         "source": "haiku"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The conclusion states INVALIDATOR 'outperforms state-of-the-art baselines' without clearly bounding the claim to the 4-project Java/Defects4J subset used; the threats section acknowledges this limitation, but the conclusion's wording is not moderated accordingly.",
     38         "source": "haiku"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No alternative explanations are discussed for INVALIDATOR's performance advantage, such as favorable characteristics of the evaluation dataset or potential overfitting to the Defects4J benchmark.",
     44         "source": "haiku"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper measures binary patch classification (overfitting vs correct) and claims exactly this—patch correctness assessment accuracy—without conflating it with broader software quality metrics.",
     50         "source": "haiku"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Section 6.3 'Threats to validity' covers external, internal, and construct validity in distinct subsections.",
     58         "source": "haiku"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "External validity identifies specific constraints: 885 patches from 21 APR techniques only, Defects4J only, and notes QuixBugs (~35 LOC) is too simple while industrial benchmark labeling is too expensive—concrete reasons rather than boilerplate.",
     64         "source": "haiku"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Section 6.2 explicitly states 'the reliance on ground truth patches limits our applications on pure APR problem settings,' clearly bounding where INVALIDATOR applies.",
     70         "source": "haiku"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No funding source is mentioned anywhere in the paper.",
     78         "source": "haiku"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Author affiliations are clearly listed: University of Melbourne, Hanoi University of Science and Technology, and Singapore Management University.",
     84         "source": "haiku"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "No funding is disclosed, so funder independence cannot be assessed.",
     90         "source": "haiku"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests or financial interests statement is present in the paper.",
     96         "source": "haiku"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 2 provides formal definitions of 'test overfitting,' 'program invariants,' and 'automated patch correctness assessment'; Definitions 1 and 2 formally define correct and error specifications with logical notation.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Four explicit contributions are listed: INVALIDATOR tool, two overfitting rules, syntactic reasoning augmentation, and empirical evaluation on 885 patches.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 7 situates INVALIDATOR against 7 baselines with conceptual differentiation (invariant-based vs test-generation vs syntactic-only), explaining why each prior approach is insufficient.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": true,
    126           "justification": "Section 9 explicitly states INVALIDATOR is publicly available at https://github.com/thanhlecongg/Invalidator, with all materials archived on Zenodo under a DOI.",
    127           "source": "haiku"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "Datasets from Xiong et al. [28] and Wang et al. [50] are referenced public datasets; all materials including datasets are published via zenodo (https://doi.org/10.5281/zenodo.7699142).",
    133           "source": "haiku"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "The paper mentions Python and HuggingFace Transformers but provides no requirements.txt, Dockerfile, or specific version numbers for dependencies.",
    139           "source": "haiku"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "The paper points to a GitHub repository and zenodo archive but provides no step-by-step reproduction instructions within the paper itself.",
    145           "source": "haiku"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "All results are point estimates; no confidence intervals or error bars are reported anywhere in the paper.",
    153           "source": "haiku"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "No statistical significance tests are reported for any comparisons against baselines; performance differences are presented as raw numbers only.",
    159           "source": "haiku"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Percentage improvements over baselines are consistently reported (e.g., '14% and 19% for Accuracy and F1-score'), providing effect size context with baselines as denominators.",
    165           "source": "haiku"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "The dataset size (885 total, 139 evaluation) is inherited from prior work without justification or power analysis.",
    171           "source": "haiku"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "No variance, standard deviation, or results across multiple runs are reported; all results are single-run point estimates.",
    177           "source": "haiku"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Seven baselines included: RGT, ODS, BERT+LR, PATCHSIM, DIFFTGEN, ANTI-PATTERNS, and GT-INVARIANT.",
    185           "source": "haiku"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Most recent baselines are ODS and RGT (2021), within two years of this 2023 paper; these represent the acknowledged state-of-the-art in APAC.",
    191           "source": "haiku"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "RQ4 provides thorough ablation: RQ4.1 removes each classifier (Table 6), RQ4.2 compares invariant granularities (Table 7), RQ4.3 evaluates individual overfitting rules (Figure 9).",
    197           "source": "haiku"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Five metrics used: Recall, Precision, Accuracy, F1-score, and AUC.",
    203           "source": "haiku"
    204         },
    205         "human_evaluation": {
    206           "applies": false,
    207           "answer": false,
    208           "justification": "The evaluation uses pre-existing human-labeled patch correctness labels from prior work; no new human evaluation of INVALIDATOR's outputs is conducted.",
    209           "source": "haiku"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "The 139 patches from Xiong et al. [28] are held out as the evaluation set, kept separate from the 671-patch training set and 75-patch validation set (Table 2).",
    215           "source": "haiku"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": false,
    220           "justification": "Results are reported at the aggregate level only; no breakdown by APR technique, bug category, or project (Chart/Time/Lang/Math) is provided in the main evaluation.",
    221           "source": "haiku"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": false,
    226           "justification": "The paper analyzes patches that INVALIDATOR uniquely detects correctly but does not analyze the 23 false negatives (overfitting patches INVALIDATOR missed).",
    227           "source": "haiku"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": true,
    232           "justification": "The paper reports INVALIDATOR underperforms BERT+LR and PATCHSIM on Precision (0.97 vs 1.00) and explains why; ablation shows significant performance drops when components are removed.",
    233           "source": "haiku"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": false,
    240           "justification": "CodeBERT is referenced by citation [52] and GitHub link but no specific checkpoint or model version is identified; HuggingFace Transformers version is also unspecified.",
    241           "source": "haiku"
    242         },
    243         "prompts_provided": {
    244           "applies": false,
    245           "answer": false,
    246           "justification": "CodeBERT is used as a fixed feature extractor, not a prompted language model; no prompts are applicable to this architecture.",
    247           "source": "haiku"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": true,
    252           "justification": "The classification threshold (0.975, tuned on validation set) and Daikon 5-hour time limit are reported; CodeBERT embedding dimension (768) is specified; CodeBERT is used as fixed extractor requiring no training hyperparameters.",
    253           "source": "haiku"
    254         },
    255         "scaffolding_described": {
    256           "applies": false,
    257           "answer": false,
    258           "justification": "INVALIDATOR is a deterministic pipeline tool, not an agentic system with scaffolding.",
    259           "source": "haiku"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "Algorithm 2 documents test selection; Section 5.1.1 describes duplicate removal (syntactically equivalent patches), dataset filtering to 4 Defects4J projects, and handling of class imbalance.",
    265           "source": "haiku"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": true,
    272           "justification": "All materials including datasets are published via zenodo (https://doi.org/10.5281/zenodo.7699142) and GitHub repository.",
    273           "source": "haiku"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "Section 5.1.1 details collection from Xiong et al. [28] and Wang et al. [50], filtering to 4 projects, deduplication procedure, and supplementation with developer-written patches.",
    279           "source": "haiku"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "No human participant recruitment—this is a software benchmark study using existing labeled datasets.",
    285           "source": "haiku"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": true,
    290           "justification": "Algorithms 1 and 2 document invariant inference and test selection; Figure 2 shows the full INVALIDATOR workflow; Tables 1 and 2 detail dataset construction and splits.",
    291           "source": "haiku"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "CodeBERT's training data cutoff is never stated, leaving open whether Defects4J benchmark code appeared in CodeBERT's pre-training corpus.",
    299           "source": "haiku"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": true,
    303           "answer": true,
    304           "justification": "Section 5.1.1 explicitly describes removing syntactically equivalent patches between training/validation and evaluation sets to avoid data leakage.",
    305           "source": "haiku"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "Defects4J is a public dataset predating CodeBERT's training; whether Defects4J project code appeared in CodeBERT's GitHub-sourced pre-training data is never discussed.",
    311           "source": "haiku"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants.",
    319           "source": "haiku"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants.",
    325           "source": "haiku"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants.",
    331           "source": "haiku"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants.",
    337           "source": "haiku"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants.",
    343           "source": "haiku"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants.",
    349           "source": "haiku"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participants.",
    355           "source": "haiku"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": true,
    362           "justification": "Section 6.1 reports approximately 7 minutes per patch and 15.5 hours total for 139 evaluation patches, with the bottleneck identified as Daikon invariant inference.",
    363           "source": "haiku"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "Wall-clock time is reported (15.5 hours) but hardware specifications (CPU/GPU model, RAM) are not stated, making cost comparison impossible.",
    369           "source": "haiku"
    370         }
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "INVALIDATOR correctly classifies 79% of overfitting patches with 97% precision on the Defects4J evaluation set",
    377       "evidence": "Table 3: 86 TP out of 109 overfitting patches (Recall=0.79), only 3 FP out of 30 correct patches (Precision=0.97)",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "INVALIDATOR outperforms the best baselines by 14% in F1-score and 19% in Accuracy",
    382       "evidence": "Table 3: INVALIDATOR F1=0.87, Accuracy=0.81 vs ODS/RGT F1=0.76, Accuracy=0.68",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Semantic (invariant-based) reasoning alone detects 51% of overfitting patches with 97% precision",
    387       "evidence": "Table 6: ablation removing syntactic classifier yields Recall=0.51, Precision=0.97",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Adding syntactic reasoning boosts Accuracy by 35% and F1-score by 30% over semantic-only baseline",
    392       "evidence": "Table 6: w/o syntactic: Accuracy=0.60, F1=0.67; full INVALIDATOR: Accuracy=0.81, F1=0.87",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "INVALIDATOR, ODS, and RGT used together cover 107/109 overfitting patches in a complementary fashion",
    397       "evidence": "Figure 5 Venn diagram; each technique uniquely detects 10, 7, and 5 patches respectively that the others miss",
    398       "supported": "strong"
    399     },
    400     {
    401       "claim": "Using invariants from all executed methods (not just buggy methods) improves semantic classifier Accuracy by 28%",
    402       "evidence": "Table 7: executed methods Accuracy=0.60 vs buggy methods=0.47 (28% relative improvement)",
    403       "supported": "strong"
    404     },
    405     {
    406       "claim": "CodeBERT features outperform ODS and BERT features for syntactic classification (AUC 0.89 vs 0.81 and 0.83)",
    407       "evidence": "Table 5: CodeBERTgt AUC=0.89, ODSgt=0.81, BERTgt=0.83 with ground truth knowledge",
    408       "supported": "strong"
    409     }
    410   ],
    411   "methodology_tags": [
    412     "benchmark-eval"
    413   ],
    414   "key_findings": "INVALIDATOR achieves 0.81 Accuracy and 0.87 F1-score on 139 APR-generated patches from Defects4J, outperforming all 7 baselines on both metrics. The two-stage approach—Daikon invariant-based semantic reasoning followed by CodeBERT+LR syntactic classification—is validated by ablation showing both components are essential: semantic-only yields 51% recall, syntactic-only yields 68% recall, while the combination achieves 79% recall. INVALIDATOR, ODS, and RGT are complementary and together cover 107/109 overfitting patches, suggesting ensemble deployment. Processing takes ~7 minutes per patch, dominated by invariant inference.",
    415   "red_flags": [
    416     {
    417       "flag": "No statistical significance tests",
    418       "detail": "All comparisons against 7 baselines use raw point estimates only; no p-values, confidence intervals, or significance tests reported despite multiple comparative claims."
    419     },
    420     {
    421       "flag": "Single benchmark, 4-project Java subset",
    422       "detail": "Evaluation uses 139 patches from only Chart, Time, Lang, and Math projects in Defects4J; conclusions do not adequately reflect this narrow scope."
    423     },
    424     {
    425       "flag": "CodeBERT contamination unaddressed",
    426       "detail": "CodeBERT was pre-trained on GitHub code; Defects4J projects are public open-source Java codebases that likely appeared in CodeBERT's training data. This potential contamination of the syntactic classifier is never discussed."
    427     },
    428     {
    429       "flag": "No variance across runs",
    430       "detail": "No multiple runs or error bars reported; logistic regression training on the 671-patch dataset may have variance from random initialization that is not characterized."
    431     },
    432     {
    433       "flag": "Failure cases not analyzed",
    434       "detail": "The 23 false negatives (overfitting patches INVALIDATOR missed) and 3 false positives are not analyzed to identify systematic failure modes."
    435     }
    436   ],
    437   "cited_papers": [
    438     {
    439       "title": "Identifying patch correctness in test-based program repair (PATCHSIM)",
    440       "relevance": "Primary baseline APAC technique and source of the 139-patch evaluation dataset"
    441     },
    442     {
    443       "title": "Automated classification of overfitting patches with statically extracted code features (ODS)",
    444       "relevance": "State-of-the-art baseline for syntactic-based APAC using 4,199 hand-crafted features"
    445     },
    446     {
    447       "title": "Evaluating representation learning of code changes for predicting patch correctness (BERT+LR)",
    448       "relevance": "Direct predecessor using BERT embeddings + logistic regression; provides 666-patch training dataset"
    449     },
    450     {
    451       "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs",
    452       "relevance": "Primary benchmark dataset used for all evaluation"
    453     },
    454     {
    455       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    456       "relevance": "Core model used for syntactic feature extraction in INVALIDATOR's second-stage classifier"
    457     },
    458     {
    459       "title": "The Daikon system for dynamic detection of likely invariants",
    460       "relevance": "Core tool for program invariant inference powering INVALIDATOR's semantic classifier"
    461     },
    462     {
    463       "title": "Automated patch assessment for program repair at scale (RGT)",
    464       "relevance": "Key semantic baseline; complementarity analysis with INVALIDATOR is a main finding"
    465     },
    466     {
    467       "title": "On reliability of patch correctness assessment",
    468       "relevance": "Establishes that manual annotation is more effective but more expensive than automated APAC; motivates this work"
    469     },
    470     {
    471       "title": "Automated patch correctness assessment: How far are we? (Wang et al. 2020)",
    472       "relevance": "Provides large labeled dataset (666 patches) used for training and establishes prior state of the art"
    473     }
    474   ],
    475   "engagement_factors": {
    476     "practical_relevance": {
    477       "score": 3,
    478       "justification": "Directly addresses a critical bottleneck in APR deployment: identifying overfitting patches is essential before practitioners can trust any APR tool in production."
    479     },
    480     "surprise_contrarian": {
    481       "score": 1,
    482       "justification": "Combining semantic and syntactic approaches is an expected improvement direction; the results confirm the hypothesis rather than challenge assumptions."
    483     },
    484     "fear_safety": {
    485       "score": 0,
    486       "justification": "No AI safety or risk concerns; focused narrowly on software engineering patch assessment methodology."
    487     },
    488     "drama_conflict": {
    489       "score": 1,
    490       "justification": "Addresses the known 'test overfitting crisis' in APR (98% of GenProg patches are overfitting per Qi et al.), a longstanding pain point with real consequences for APR credibility."
    491     },
    492     "demo_ability": {
    493       "score": 2,
    494       "justification": "Tool is publicly available on GitHub with datasets; practitioners can apply it to their own Defects4J-compatible APR outputs."
    495     },
    496     "brand_recognition": {
    497       "score": 1,
    498       "justification": "David Lo (Singapore Management University) is a well-known SE researcher; no major industrial lab affiliation."
    499     }
    500   },
    501   "hn_data": {
    502     "threads": [],
    503     "top_points": 0,
    504     "total_points": 0,
    505     "total_comments": 0
    506   }
    507 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs