scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20297B)
      1 {
      2   "paper": {
      3     "title": "ContrastRepair: Enhancing Conversation-Based Automated Program Repair via Contrastive Test Case Pairs",
      4     "authors": ["Jiaolong Kong", "Mingfei Cheng", "Xiaofei Xie", "Shangqing Liu", "Xiaoning Du", "Qi Guo"],
      5     "year": 2024,
      6     "arxiv_id": "2403.01971"
      7   },
      8   "checklist": {
      9     "artifacts": {
     10       "code_released": {
     11         "applies": true,
     12         "answer": false,
     13         "justification": "The paper mentions patches are open-sourced for public evaluation (Section 5.2), but no repository URL for the ContrastRepair tool itself is provided in the paper."
     14       },
     15       "data_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The paper uses publicly available benchmark datasets: Defects4j, QuixBugs, and HumanEval-Java, all of which are standard public benchmarks."
     19       },
     20       "environment_specified": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided. The paper mentions using Python lib-javaobj and Java lib-Javassist but does not provide version details or a reproducible environment specification."
     24       },
     25       "reproduction_instructions": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No step-by-step reproduction instructions, README with commands, or scripts for replicating experiments are provided in the paper."
     29       }
     30     },
     31     "statistical_methodology": {
     32       "confidence_intervals_or_error_bars": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No confidence intervals or error bars are reported. Results are presented as point estimates (e.g., 103 bugs fixed)."
     36       },
     37       "significance_tests": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper claims ContrastRepair 'significantly outperforms' baselines but provides no statistical significance tests (no p-values, t-tests, etc.)."
     41       },
     42       "effect_sizes_reported": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper reports percentage improvements with baseline context, e.g., '15.32% more than the best baseline' and specific counts like '103 out of 255 bugs' vs '90' for CHATREPAIR."
     46       },
     47       "sample_size_justified": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No justification is given for the sample sizes used. The Defects4j 2.0 subset of 82 bugs is noted as due to 'budget constraint' but no power analysis or formal justification is provided."
     51       },
     52       "variance_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "In the ablation study (RQ4, Table 8), the experiments are repeated three times and average results are reported. No standard deviation or other spread measure is given, only averages; nevertheless, the paper does report results across multiple runs to mitigate randomness."
     56       }
     57     },
     58     "evaluation_design": {
     59       "baselines_included": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "The paper compares against seven baselines: SelfAPR, AlphaRepair, RewardRepair, Recoder, CURE, TBar, and CHATREPAIR, plus a BaseChatGPT baseline (Section 4.1.4)."
     63       },
     64       "baselines_contemporary": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Baselines include CHATREPAIR (2023), AlphaRepair (2022), SelfAPR (2022), and other recent methods. These represent the state of the art at the time of submission."
     68       },
     69       "ablation_study": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "RQ4 (Section 4.5) presents an ablation study with w/o Pair, w/o Similarity, and w/o Context variants. Table 8 and Table 9 show results."
     73       },
     74       "multiple_metrics": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper uses #Correct (correct fixes), #Plausible (plausible fixes), and #Query (API call efficiency) as metrics (Section 4.1.5)."
     78       },
     79       "human_evaluation": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Three SE researchers manually verified whether plausible patches were semantically correct, discussed disagreements, and reached consensus (Section 5.2, threats to validity)."
     83       },
     84       "held_out_test_set": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "HumanEval-Java is used as an unseen dataset (RQ2) to address data leakage concerns, as it postdates GPT-3.5's training data cutoff."
     88       },
     89       "per_category_breakdown": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Table 1 provides per-project breakdowns (Chart, Closure, Lang, Math, Mockito, Time). Table 3 shows per-scenario breakdowns (SL, SH, SF)."
     93       },
     94       "failure_cases_discussed": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The paper discusses cases where type-aware mutation cannot generate passing tests (functions with no primitive-type parameters), and discusses limitations of the approach for logical bugs (Section 3.2.2)."
     98       },
     99       "negative_results_reported": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper reports that the 12x10 configuration was least effective (Table 7), and that misleading test pairs can hurt repair (motivation example, Fig 1b). The w/o Context ablation shows decreased performance."
    103       }
    104     },
    105     "claims_and_evidence": {
    106       "abstract_claims_supported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The abstract claims '143 out of all 337 bug cases' on Defects4j 1.2 and 2.0 combined (103+40=143 from Table 1), and the best baseline fixes 124 (90+34=124). These match the results tables."
    110       },
    111       "causal_claims_justified": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper makes causal claims about contrastive pairs improving repair ('ContrastRepair improves...') and supports these with ablation studies (w/o Pair, w/o Similarity, w/o Context) that use controlled single-variable manipulation."
    115       },
    116       "generalization_bounded": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The title and abstract claim 'new state-of-the-art in program repair' broadly, but results are only on Java benchmarks (Defects4j, QuixBugs-Java, HumanEval-Java) and one Python dataset (QuixBugs-Python). The approach is presented as general but tested in a narrow setting."
    120       },
    121       "alternative_explanations_discussed": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The threats to validity section discusses dataset selection and data leakage but does not discuss alternative explanations for why contrastive pairs help. For instance, the improvement could be due to simply providing more information in the prompt rather than the contrastive nature specifically."
    125       }
    126     },
    127     "setup_transparency": {
    128       "model_versions_specified": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "Section 4.1.1 states 'gpt-3.5-turbo-0301' — a specific versioned model identifier."
    132       },
    133       "prompts_provided": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "Figure 3 shows a prompt template with placeholders (e.g., '{Error Message in Traceback}') but does not provide the actual fill values or complete prompts used. The template alone is insufficient to reconstruct exact prompts sent."
    137       },
    138       "hyperparameters_reported": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Section 4.1.1 reports temperature=1, n=3, m=40, similarity threshold θ=0.5, 1000 mutated test cases, 30-second timeout per test case evaluation."
    142       },
    143       "scaffolding_described": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The conversational repair loop (Algorithm 1), restart/continue strategies, pair selection, prompt construction, and patch augmentation are all described in detail in Sections 3.2-3.3."
    147       },
    148       "data_preprocessing_documented": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section 4.1.3 describes the instrumentation process to capture parameter values, the mutation process, and how test cases are collected. Section 4.1.2 describes dataset selection and fault localization setup."
    152       }
    153     },
    154     "limitations_and_scope": {
    155       "limitations_section_present": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 5.2 'Threats to Validity' provides substantive discussion of multiple threats including dataset selection, data leakage, manual verification, and efficiency measurement."
    159       },
    160       "threats_to_validity_specific": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Specific threats include: re-implementation of CHATREPAIR may differ from original; hyperparameter settings differ from original CHATREPAIR paper (200 vs 40 restarts); manual patch verification requires consensus among three reviewers; some baseline results taken from other papers."
    164       },
    165       "scope_boundaries_stated": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to specific languages, bug types, or project sizes. The limitation about logical bugs lacking test oracles is mentioned but framed as future work rather than a scope boundary."
    169       }
    170     },
    171     "data_integrity": {
    172       "raw_data_available": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "Generated patches are said to be open-sourced, but no link is provided in the paper. Raw experimental logs, API call records, and generated test cases are not available."
    176       },
    177       "data_collection_described": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 4.1.2-4.1.3 describes how bugs are collected from Defects4j, QuixBugs, and HumanEval-Java, including version splits and localization setup."
    181       },
    182       "recruitment_methods_described": {
    183         "applies": false,
    184         "answer": false,
    185         "justification": "No human participants; data sources are standard benchmarks."
    186       },
    187       "data_pipeline_documented": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The pipeline from test collection through instrumentation, mutation, pair selection, prompting, and patch validation is documented in Sections 3 and 4.1."
    191       }
    192     },
    193     "conflicts_of_interest": {
    194       "funding_disclosed": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "No funding acknowledgments or grant information is mentioned in the paper."
    198       },
    199       "affiliations_disclosed": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Author affiliations are listed: Singapore Management University, Nanyang Technological University, Monash University, and Tianjin University."
    203       },
    204       "funder_independent_of_outcome": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding information is disclosed, so independence cannot be assessed."
    208       },
    209       "financial_interests_declared": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No competing interests or financial interests statement is present in the paper."
    213       }
    214     },
    215     "contamination": {
    216       "training_cutoff_stated": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "The paper acknowledges data leakage risk and uses HumanEval-Java as a post-training dataset, but does not state the exact training cutoff date for gpt-3.5-turbo-0301."
    220       },
    221       "train_test_overlap_discussed": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "Section 4.3 (RQ2) and Section 5.2 explicitly discuss data leakage concerns with Defects4j and QuixBugs potentially being in ChatGPT's training data, and evaluate on HumanEval-Java to mitigate this."
    225       },
    226       "benchmark_contamination_addressed": {
    227         "applies": true,
    228         "answer": true,
    229         "justification": "The paper acknowledges Defects4j and QuixBugs may be contaminated and uses HumanEval-Java (published January 2023, after GPT-3.5 training) as an uncontaminated benchmark (Section 4.3)."
    230       }
    231     },
    232     "human_studies": {
    233       "pre_registered": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "No human participants study; only manual patch verification by researchers."
    237       },
    238       "irb_or_ethics_approval": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants study."
    242       },
    243       "demographics_reported": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants study."
    247       },
    248       "inclusion_exclusion_criteria": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants study."
    252       },
    253       "randomization_described": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants study."
    257       },
    258       "blinding_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants study."
    262       },
    263       "attrition_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants study."
    267       }
    268     },
    269     "cost_and_practicality": {
    270       "inference_cost_reported": {
    271         "applies": true,
    272         "answer": true,
    273         "justification": "The paper reports average number of API queries per bug (#Query) across all experiments (Tables 4, 5, 6, 7, 8), and notes mutation time is 15-25 minutes per bug."
    274       },
    275       "compute_budget_stated": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "No total API cost in dollars, total GPU hours, or aggregate compute budget is stated. Only per-bug query counts are reported."
    279       }
    280     }
    281   },
    282   "claims": [
    283     {
    284       "claim": "ContrastRepair correctly repairs 143 out of 337 bugs on Defects4j 1.2 and 2.0 combined, surpassing the best baseline CHATREPAIR which fixes 124.",
    285       "evidence": "Table 1 shows 103 fixes on D4J1.2 and 40 on D4J2.0 (total 143). CHATREPAIR gets 90+34=124.",
    286       "supported": "strong"
    287     },
    288     {
    289       "claim": "ContrastRepair reduces API calls by ~20.91% compared to CHATREPAIR.",
    290       "evidence": "Table 4 shows #Query reductions of 28.83% (SL), 30.20% (SH), and 5.97% (SF) on D4J1.2. Table 5 shows 29.34% reduction on HumanEval-Java.",
    291       "supported": "strong"
    292     },
    293     {
    294       "claim": "ContrastRepair fixes all 40 bugs in QuixBugs-Java and QuixBugs-Python.",
    295       "evidence": "Table 2 shows 40/40 for both Python and Java.",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "Contrastive test pairs provide more useful information than failing tests alone.",
    300       "evidence": "Table 6 ablation: 2 pairs → 37 correct fixes vs 2 failed cases → 33 fixes. Table 8: w/o Pair → 30.67 vs ContrastRepair → 37.00 correct fixes.",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "Similarity-based pair selection outperforms random selection and semantic-based approaches.",
    305       "evidence": "Table 8: w/o Similarity → 33.33 vs ContrastRepair → 37.00. Table 10: Damerau-Levenshtein outperforms BM25, CodeBERT, UniXcoder.",
    306       "supported": "moderate"
    307     }
    308   ],
    309   "methodology_tags": ["benchmark-eval"],
    310   "key_findings": "ContrastRepair enhances conversation-based automated program repair by providing LLMs with contrastive test case pairs (a failing test paired with a similar passing test). Evaluated on Defects4j, QuixBugs, and HumanEval-Java using gpt-3.5-turbo-0301, it fixes 143/337 Defects4j bugs versus 124 for CHATREPAIR, while requiring ~20% fewer API calls. Ablation studies confirm that both the contrastive pair mechanism and similarity-based selection contribute meaningfully to performance.",
    311   "red_flags": [
    312     {
    313       "flag": "No statistical significance tests",
    314       "detail": "The paper claims 'significant' improvements throughout but never performs any statistical test. All comparisons are based on raw count differences without assessing whether differences could arise by chance."
    315     },
    316     {
    317       "flag": "Variance only partially reported",
    318       "detail": "Ablation experiments (RQ4) are repeated 3 times and averages reported, but no standard deviations are given. Main experiments (RQ1, RQ2) appear to be single runs despite the stochastic nature of ChatGPT sampling at temperature=1."
    319     },
    320     {
    321       "flag": "Re-implemented baseline",
    322       "detail": "CHATREPAIR was re-implemented since it was not open-sourced. The authors acknowledge a gap between their reproduction and the original paper's results (different hyperparameter settings: 40 vs 200 restarts), which may disadvantage the baseline."
    323     },
    324     {
    325       "flag": "Perfect fault localization assumption",
    326       "detail": "All experiments use perfect (ground truth) fault localization, which is unrealistic. The paper argues this assumption is more practical at the function level, but the setup still supplies oracle-level information."
    327     }
    328   ],
    329   "cited_papers": [
    330     {
    331       "title": "Keep the Conversation Going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT",
    332       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    333       "year": 2023,
    334       "arxiv_id": "2304.00385",
    335       "relevance": "Primary baseline; the first fully automated conversation-driven APR approach using ChatGPT."
    336     },
    337     {
    338       "title": "Less Training, More Repairing Please: Revisiting Automated Program Repair via Zero-Shot Learning",
    339       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    340       "year": 2022,
    341       "doi": "10.1145/3540250.3549101",
    342       "relevance": "AlphaRepair, the first cloze-style APR approach using pre-trained code models without fine-tuning."
    343     },
    344     {
    345       "title": "Impact of code language models on automated program repair",
    346       "authors": ["Nan Jiang", "Kevin Liu", "Thibaud Lutellier", "Lin Tan"],
    347       "year": 2023,
    348       "arxiv_id": "2302.05020",
    349       "relevance": "Studies the impact of LLMs on APR and introduces HumanEval-Java benchmark used in this paper."
    350     },
    351     {
    352       "title": "Conversational automated program repair",
    353       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    354       "year": 2023,
    355       "arxiv_id": "2301.13246",
    356       "relevance": "Early work on conversational APR with LLMs."
    357     },
    358     {
    359       "title": "An analysis of the automatic bug fixing performance of chatgpt",
    360       "authors": ["Dominik Sobania", "Martin Briesch", "Carol Hanna", "Justyna Petke"],
    361       "year": 2023,
    362       "arxiv_id": "2301.08653",
    363       "relevance": "Evaluates ChatGPT's bug-fixing capabilities on QuixBugs."
    364     },
    365     {
    366       "title": "Evaluating large language models trained on code",
    367       "authors": ["Mark Chen"],
    368       "year": 2021,
    369       "arxiv_id": "2107.03374",
    370       "relevance": "Introduces Codex and HumanEval benchmark for LLM code generation evaluation."
    371     },
    372     {
    373       "title": "Automated repair of programs from large language models",
    374       "authors": ["Zhiyu Fan", "Xiang Gao", "Martin Mirchev", "Abhik Roychoudhury", "Shin Hwei Tan"],
    375       "year": 2023,
    376       "relevance": "Studies LLM-based automated program repair at ICSE 2023."
    377     },
    378     {
    379       "title": "Is your code generated by chatgpt really correct? rigorous evaluation of large language models for code generation",
    380       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    381       "year": 2023,
    382       "arxiv_id": "2305.01210",
    383       "relevance": "Rigorous evaluation of LLM code generation correctness."
    384     },
    385     {
    386       "title": "A Survey of Learning-based Automated Program Repair",
    387       "authors": ["Quanjun Zhang", "Chunrong Fang", "Yuxiang Ma", "Weisong Sun", "Zhenyu Chen"],
    388       "year": 2023,
    389       "arxiv_id": "2301.03270",
    390       "relevance": "Comprehensive survey of learning-based APR methods."
    391     }
    392   ]
    393 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs