scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20812B)
      1 {
      2   "paper": {
      3     "title": "ComPass: Contrastive Learning for Automated Patch Correctness Assessment in Program Repair",
      4     "authors": ["Quanjun Zhang", "Ye Shang", "Haichuan Hu", "Chunrong Fang", "Zhenyu Chen", "Liang Xiao"],
      5     "year": 2026,
      6     "venue": "Empirical Software Engineering",
      7     "arxiv_id": "2602.07561"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "An anonymous repository is provided: https://anonymous.4open.science/r/ComPass-7EF1/. The Data Availability Statement confirms code and dataset are available there."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The dataset of 2,274 labeled patches from Defects4J is stated as available in the repository. Defects4J itself is a public benchmark."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Section 4.5 specifies: bert-base-uncased with 110M parameters from Hugging Face, PyTorch, Adam optimizer, two Tesla V100-SXM2 GPUs, Ubuntu 20.04. Specific library versions are not listed, but the environment is described in reasonable detail."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are described in the paper. The repository link is provided but the paper does not describe how to run the experiments."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results are reported as point estimates (e.g., 88.35% accuracy) without confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims ComPass 'significantly outperforms' baselines but provides no statistical significance tests (no p-values, no t-tests, no bootstrap tests). Comparisons rest solely on differences between the reported percentage values."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Percentage improvements with baseline context are consistently reported, e.g., '6.33% higher than the second highest value obtained from APPT (i.e., 82.02%)'. Absolute values for both systems are given throughout."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The dataset size of 2,274 patches is not justified. No power analysis or discussion of whether this is sufficient for the claims made."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "5-fold cross-validation is used but no variance or standard deviation across folds is reported. Only aggregate metrics are presented."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Seven baselines are included: PATCH-SIM, ODS, BERT (LR/DT/SVM), CACHE, APPT, and ChatGPT-3.5 (Section 4.3, Table 3)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "APPT (2024) is the most recent state-of-the-art baseline. ChatGPT-3.5 (gpt-3.5-turbo-0125) is also included as an LLM baseline. The baselines represent the current state of APCA research."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section 5.2.1 (Table 5) presents an ablation study removing contrastive learning, data augmentation, fine-tuning, and pre-training components individually. Section 5.2.2 (Table 6) ablates vector integration strategies."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Four metrics are used: accuracy, precision, recall, and F1-score (Section 4.4)."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is a binary classification task on patches with known ground-truth labels. Human evaluation of outputs is not relevant; the ground truth labels serve as the evaluation standard."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "5-fold cross-validation is used (Section 4.5), and a separate cross-project evaluation is conducted (Section 5.3) where entire projects are held out."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 8 provides per-project breakdowns (Chart, Closure, Lang, Math, Time) in the cross-project evaluation setting."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Figure 3 and accompanying text analyze a specific failure case where ComPass incorrectly classifies an overfitting patch as correct, discussing why the semantic redundancy was missed."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The ablation study shows performance drops when components are removed. The cross-project evaluation shows performance degradation compared to within-project (e.g., 88.35% → 81.01% for Chart). The paper also notes that removing cosine similarity actually increased precision by 1.47%."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims 88.35% accuracy, 87.50% precision, 88.69% recall, 88.09% F1-score — all confirmed in Table 3. Claims about component contributions and generalizability are supported by Tables 5 and 7."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims (e.g., 'contrastive learning increases accuracy by 4.11%') are supported by controlled ablation studies where single components are removed (Table 5). This is adequate single-variable manipulation."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper tests only on Java patches from Defects4J but the title and framing suggest general applicability to 'Program Repair' without language qualification. The threats section acknowledges Defects4J limitation but does not bound the title/abstract claims."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The threats to validity section discusses methodological limitations (BERT choice, Defects4J benchmark, baseline selection) but does not discuss alternative explanations for why ComPass outperforms baselines (e.g., could the improvement come from more training data via augmentation rather than contrastive learning specifically?)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Section 4.5 specifies 'bert-base-uncased' with 110M parameters from Hugging Face. The LLM baseline specifies 'gpt-3.5-turbo-0125'. These are sufficiently specific version identifiers."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Figure 2 provides the full prompt template used for the LLM-based baseline, including the exact text and expected output format."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 4.5 reports: batch size 16, max input length 512, max epochs 50, learning rate 5e-5, hidden dimension 768, 12 encoder layers, 12 attention heads, Adam optimizer."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. ComPass is a standard pre-train then fine-tune pipeline."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4.2 describes: patches collected from prior studies, duplicates removed by comparing code after removing whitespace/comments, unlabeled corpus constructed from earliest commits of Defects4J projects, deduplication against patch benchmark. Augmentation process described in Section 3.2 with 18 transformation rules (Table 1)."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 6 'Threats to Validity' provides a dedicated, substantive discussion with four specific threats."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The threats are specific to this study: (1) reliance on BERT specifically, (2) single benchmark Defects4J, (3) specific baseline selection, (4) evaluation setting focused on overfitting detection. Each includes concrete mitigation steps taken."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "While threats are discussed, the paper does not explicitly state what it does NOT show. The threats note generalization uncertainty but do not explicitly bound claims — e.g., no statement that results are limited to Java, Defects4J, or the specific APR tools tested."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The Data Availability Statement states code and dataset are available at the anonymous repository. The patches are derived from the public Defects4J benchmark."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 4.2 describes collecting plausible patches from prior studies (Liu et al., Xiong et al., Ali et al., Tian et al., Lin et al.), filtering duplicates, and collecting 485,106 unlabeled code snippets from Defects4J earliest commits."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data comes from standard benchmarks (Defects4J) and prior published patch sets."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline is documented: collect patches from prior work → remove duplicates → label inheritance from established benchmarks → code transformation for augmentation (1,234,020 augmented from 485,106 unlabeled; 10,983 from 2,274 labeled). Table 2 provides detailed per-tool statistics."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Declarations section states: 'This work is supported partially by the National Natural Science Foundation of China (61932012, 62141215, 62372228), Natural Science Foundation of Jiangsu Province (BK20251458) and Fundamental Research Funds for the Central Universities (AE89991/463).'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All author affiliations are listed: Nanjing University of Science and Technology and Nanjing University. No evaluated product is affiliated with these institutions."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Funding is from Chinese national science foundations and university funds, which have no financial stake in the outcome of patch correctness assessment research."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The Declarations section explicitly states: 'The authors declared that they have no conflict of interest.'"
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "ComPass trains its own model (BERT fine-tuned with contrastive learning) rather than evaluating a pre-trained model's knowledge on a benchmark. The LLM baseline (GPT-3.5) is secondary and not the main contribution being evaluated."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Same as above — the paper trains and evaluates its own model with explicit train/test splits via cross-validation, not evaluating pre-trained model knowledge."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Not applicable for the same reason. The paper does address data leakage concerns for its own pre-training: 'we deduplicate all collected code snippets and the above patch benchmark' (Section 4.2)."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants. The paper states 'Ethical approval: Not Applicable.'"
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost, latency, or per-patch processing time is reported, despite the paper positioning ComPass as a practical filtering component in APR pipelines."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "Hardware is mentioned (two Tesla V100-SXM2 GPUs) but total training time, GPU hours, or computational budget are not reported."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "ComPass achieves 88.35% accuracy, 87.50% precision, 88.69% recall, and 88.09% F1-score on 2,274 Defects4J patches.",
    286       "evidence": "Table 3 (Section 5.1) reports these exact numbers under 5-fold cross-validation.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "ComPass significantly outperforms the state-of-the-art APPT by 6.33% accuracy.",
    291       "evidence": "Table 3 shows APPT at 82.02% accuracy vs ComPass at 88.35%. However, no statistical significance test is provided.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Contrastive learning increases accuracy by 4.11% and precision by 7.80%.",
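    296       "evidence": "Table 5 (Section 5.2.1) ablation study: removing contrastive learning drops accuracy from 88.35% to 84.72% and precision from 87.50% to 80.67%, i.e., absolute drops of 3.63 and 6.83 percentage points; the quoted 4.11% and 7.80% approximately match these drops expressed relative to the full model's scores.",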
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "ComPass is generalizable to existing learning-based APCA approaches and advanced PLMs.",
    301       "evidence": "Table 7 (Section 5.2.3) shows improvements of 4.92%-15.67% accuracy when integrating ComPass with Tian et al. variants, APPT, and CodeBERT.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "ComPass achieves optimal performance in cross-project prediction scenarios.",
    306       "evidence": "Table 8 (Section 5.3) shows ComPass outperforms BERT SVM across all five Defects4J projects in cross-project setting, but only one baseline is compared.",
    307       "supported": "weak"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "ComPass introduces contrastive learning and data augmentation to automated patch correctness assessment, achieving 88.35% accuracy on 2,274 Defects4J patches, outperforming the prior state-of-the-art APPT by 6.33% accuracy. Ablation studies show all components (contrastive pre-training, data augmentation, fine-tuning) contribute positively. The framework generalizes across different encoder PLMs (BERT, CodeBERT) and existing APCA approaches, with 4.92%-15.67% accuracy improvements when integrated.",
    312   "red_flags": [
    313     {
    314       "flag": "No statistical significance testing",
    315       "detail": "The paper repeatedly claims ComPass 'significantly outperforms' baselines but provides no statistical tests. All comparisons rely on point estimates from 5-fold cross-validation, with no variance across folds reported."
    316     },
    317     {
    318       "flag": "No variance across folds reported",
    319       "detail": "Despite using 5-fold cross-validation, no standard deviation or variance across folds is reported for any experiment, making it impossible to assess result stability."
    320     },
    321     {
    322       "flag": "Cross-project evaluation uses only one baseline",
    323       "detail": "The cross-project evaluation (RQ3, Table 8) compares only against BERT SVM, omitting stronger baselines like APPT, weakening the claim of optimal cross-project performance."
    324     },
    325     {
    326       "flag": "Single benchmark, single language",
    327       "detail": "All experiments use only Java patches from Defects4J. Claims about general program repair applicability are not bounded to this setting."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "APPT: A pre-trained model-based approach for automated patch correctness assessment",
    333       "authors": ["Quanjun Zhang"],
    334       "year": 2024,
    335       "relevance": "Direct predecessor and primary baseline; PLM-based automated patch correctness assessment."
    336     },
    337     {
    338       "title": "Large language models for software engineering: Survey and open problems",
    339       "authors": ["Angela Fan", "Beliz Gokkaya", "Mark Harman"],
    340       "year": 2023,
    341       "relevance": "Survey of LLMs for software engineering tasks including program repair."
    342     },
    343     {
    344       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    345       "authors": ["Zhangyin Feng", "Daya Guo"],
    346       "year": 2020,
    347       "relevance": "Foundation PLM for code understanding, used as alternative encoder in ComPass experiments."
    348     },
    349     {
    350       "title": "ContraBERT: Enhancing code pre-trained models via contrastive learning",
    351       "authors": ["Shangqing Liu", "Bozhi Wu"],
    352       "year": 2023,
    353       "relevance": "Contrastive learning applied to code PLMs for robustness improvement."
    354     },
    355     {
    356       "title": "Context-aware code change embedding for better patch correctness assessment",
    357       "authors": ["Bo Lin", "Shangwen Wang"],
    358       "year": 2022,
    359       "relevance": "CACHE: context-aware APCA technique used as baseline."
    360     },
    361     {
    362       "title": "Identifying test-suite-overfitted patches through test case generation",
    363       "authors": ["Yingfei Xiong"],
    364       "year": 2018,
    365       "relevance": "PATCH-SIM: dynamic-based APCA baseline and foundational work on patch overfitting."
    366     },
    367     {
    368       "title": "Evaluating representation learning of code changes for predicting patch correctness in program repair",
    369       "authors": ["Haoye Tian"],
    370       "year": 2020,
    371       "relevance": "Foundational APCA work using BERT representation learning, multiple baselines built on this."
    372     },
    373     {
    374       "title": "A survey on automated program repair techniques",
    375       "authors": ["Quanjun Zhang"],
    376       "year": 2023,
    377       "relevance": "Comprehensive APR survey covering the landscape of program repair techniques."
    378     },
    379     {
    380       "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs",
    381       "authors": ["Ren\u00e9 Just", "Darioush Jalali", "Michael D. Ernst"],
    382       "year": 2014,
    383       "relevance": "Standard benchmark for evaluating automated program repair and patch correctness assessment."
    384     },
    385     {
    386       "title": "INVALIDATOR: Automated patch correctness assessment via semantic and syntactic reasoning",
    387       "authors": ["Thanh Le-Cong"],
    388       "year": 2023,
    389       "relevance": "Recent APCA technique using program invariants and pre-trained models."
    390     }
    391   ]
    392 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs