scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24210B)
      1 {
      2   "paper": {
      3     "title": "Practical Program Repair in the Era of Large Pre-trained Language Models",
      4     "authors": [
      5       "Chunqiu Steven Xia",
      6       "Yuxiang Wei",
      7       "Lingming Zhang"
      8     ],
      9     "year": 2022,
     10     "venue": "arXiv",
     11     "arxiv_id": "2210.14179"
     12   },
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The paper states in Section VI that they 'released the correct patches and code used to perform the experiments for public evaluation [74]' with a Figshare link provided."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper uses publicly available benchmarks (Defects4J 1.2, Defects4J 2.0, QuixBugs-Java, QuixBugs-Python, ManyBugs) and released their experimental data via Figshare [74]."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "Section IV-B mentions 'PyTorch versions of each PLM' and the hardware used ('32-Core workstation with Ryzen Threadripper PRO 3975WX CPU, 256 GB RAM and NVIDIA RTX A6000 GPU, running Ubuntu 20.04.4 LTS') but does not provide a requirements.txt, Dockerfile, or detailed dependency list with library versions."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "While the paper describes the approach and mentions releasing code via Figshare, it does not provide step-by-step reproduction instructions in the paper itself. The Figshare link is a placeholder URL (https://figshare.com/s/temp)."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper reports only point estimates (number of correct/plausible patches) across all tables (Tables III-X). No confidence intervals or error bars are provided for any results."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper makes multiple comparative claims ('substantially outperform all existing APR techniques') based on comparing raw counts of bugs fixed without any statistical significance tests."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper provides baseline counts alongside its own results (e.g., Codex fixes 99 vs AlphaRepair 67 on Defects4J 1.2, Tables VII-VIII), and reports proportions of bugs fixed such as '40% (62/154)', providing sufficient context to assess magnitude."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No independent justification is provided for why 200 samples per bug was chosen as the default, nor for why 2000 was selected for the improvement experiments. The choice of 200 is attributed only to the 'default setting used in previous work [26], [30]'."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The paper reports single-run results. No standard deviation, variance, or spread measures across multiple runs are reported. With nucleus sampling (temperature=0.8), different runs could yield different results, but this is not investigated."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper compares against 20 different APR tools including 8 learning-based tools (AlphaRepair, RewardRepair, Recoder, DeepDebug, CURE, CoCoNuT, DLFix, SequenceR) and 12 traditional tools (TBar, PraPR, etc.), as detailed in Section IV-D and Tables VII-VIII."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The baselines include recent state-of-the-art tools from 2021-2022 such as AlphaRepair (2022), RewardRepair (2022), and Recoder (2021). These represent the current best at the time of writing."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper includes ablation-like analysis: comparing generative vs infilling models (Tables III-V), comparing prefix-only vs prefix+suffix (Table V, Codex single-line vs Codex suffix), and the effect of increasing sample size and adding templates (RQ4, Table X)."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper uses multiple evaluation metrics: number of correct patches, number of plausible patches, patch generation speed (patches/min in Table VI), syntactic/semantic error rates (Figure 4), and mean/sum entropy for ranking (Table IX, Figure 7)."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Section IV-E states they 'manually inspect each plausible patch for semantic equivalency' to determine correct patches, which is standard practice in APR research."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The models are applied in a zero-shot setting (no finetuning on APR data), and the evaluation uses standard benchmarks (Defects4J, QuixBugs, ManyBugs) with developer-written test suites for validation."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down by dataset (Defects4J 1.2, 2.0, QuixBugs-Java, QuixBugs-Python, ManyBugs), by model, by repair setting (complete function, code infilling, single line), and by model size in Tables III-VIII."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "The paper discusses unique successful fixes (Figure 6) but does not analyze failure cases or discuss specific examples where PLMs fail to generate correct patches."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports that generative models produce high syntactic error rates for single line generation (Figure 4, Section V-A3), and that smaller models like GPT-Neo 125M often fix very few bugs. The speed-performance tradeoff is also honestly reported."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract's claim that PLMs 'substantially outperform all existing APR techniques on all our datasets' is supported by Tables VII-VIII, which show Codex fixing 99 bugs on Defects4J 1.2 vs the best baseline's 67. All other abstract claims (scaling effect, suffix importance, patch ranking, improvements via templates) are supported in Sections V-A through V-D."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Causal claims are present (e.g., 'suffix code is important in generating more fixes', 'increasing sample size boosts performance'). These are supported by controlled comparisons: same model with/without suffix (Codex single-line vs Codex suffix in Table V), and same model with 200 vs 2000 samples (Table X). The ablation design is adequate."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Section VI External validity explicitly states: 'However, our findings may still not generalize to other datasets or languages.' The study covers 3 languages and 5 datasets but acknowledges limitations."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section VI discusses data leakage as an alternative explanation for the strong results, analyzing what percentage of correct patches match developer patches and whether those appeared in training data. They found 66% of fixes differ from developer patches, and only 15% of exact-match fixes were in training data."
    130       }
    131     },
    132     "setup_transparency": {
    133       "model_versions_specified": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Table I specifies exact model names and parameter counts (GPT-Neo 125M/1.3B/2.7B, GPT-J 6.7B, GPT-NeoX 20B, Codex 12B, CodeT5 220M, INCODER 1.3B/6.7B). Section IV-B specifies 'code-davinci-002 engine' for Codex. Parameter sizes serve as version identifiers for the open-source models."
    137       },
    138       "prompts_provided": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Figures 1-3 provide the actual prompt templates used for all three repair settings (complete function generation, correct code infilling, single line generation). The Fibonacci example prompt is shown in full in Figure 1, and infilling token placement is shown in Figures 2-3."
    142       },
    143       "hyperparameters_reported": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Section IV-B states: 'Our default setting for generation uses nucleus sampling with top p = 0.95, temperature = 0.8 and 200 samples per bug.' These are the key generation hyperparameters."
    147       },
    148       "scaffolding_described": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "No agentic scaffolding is used. The PLMs are applied directly with single-pass generation — no retry logic, tool use, feedback loops, or multi-step workflows."
    152       },
    153       "data_preprocessing_documented": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section IV-C describes filtering benchmarks to find bugs fitting the repair settings, with Table II providing exact counts at each filter level (#Bugs → #SF → #SH → #SL). Section IV-C3 also notes that only 91 of 185 ManyBugs bugs were reproducible and used."
    157       }
    158     },
    159     "limitations_and_scope": {
    160       "limitations_section_present": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Section VI 'Threats to Validity' provides a dedicated section discussing both internal and external threats."
    164       },
    165       "threats_to_validity_specific": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section VI discusses specific threats: (1) manual validation of patches may have errors, addressed by releasing patches; (2) data leakage with specific statistics (66% of fixes differ from developer patches, only 15% found in training data); (3) evaluation limited to 5 datasets and 3 languages."
    169       },
    170       "scope_boundaries_stated": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "The paper explicitly states that 'our findings may still not generalize to other datasets or languages' and notes the focus is on 'bugs where the fix is within a single function' (Section IV-C), bounding the scope of claims."
    174       }
    175     },
    176     "data_integrity": {
    177       "raw_data_available": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "The Figshare link [74] (https://figshare.com/s/temp) appears to be a placeholder URL. While they claim to release 'correct patches and code,' the actual availability cannot be verified from the paper. The underlying benchmarks (Defects4J, QuixBugs, ManyBugs) are publicly available."
    181       },
    182       "data_collection_described": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section IV-C describes each dataset in detail, including where bugs are collected from, how many bugs exist, and the filtering criteria applied. The model selection process is described in Section III-A."
    186       },
    187       "recruitment_methods_described": {
    188         "applies": false,
    189         "answer": false,
    190         "justification": "No human participants are involved. The study uses existing code benchmarks and pre-trained models."
    191       },
    192       "data_pipeline_documented": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "The pipeline from bug selection (Table II with filtering counts) through patch generation (Section III-B with 3 settings) to validation (Section III-C with entropy ranking and test suite validation) is documented. Section IV-C3 explains filtering ManyBugs from 185 to 91 reproducible bugs."
    196       }
    197     },
    198     "conflicts_of_interest": {
    199       "funding_disclosed": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "No funding acknowledgment or grant information is mentioned anywhere in the paper."
    203       },
    204       "affiliations_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "All three authors are affiliated with the University of Illinois at Urbana-Champaign, disclosed at the top of the paper. No authors work at companies whose products are being evaluated."
    208       },
    209       "funder_independent_of_outcome": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding information is disclosed, so independence cannot be assessed. The paper evaluates Codex (OpenAI) among other models but the authors are university researchers, not OpenAI employees."
    213       },
    214       "financial_interests_declared": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No competing interests or financial interests statement is provided in the paper."
    218       }
    219     },
    220     "contamination": {
    221       "training_cutoff_stated": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "While the paper identifies training datasets for some models (The Pile for GPT-Neo/J/NeoX, CodeSearchNet/BigQuery for CodeT5), no training data cutoff dates are stated. For Codex and INCODER, training data is listed as 'N.R.' (not released)."
    225       },
    226       "train_test_overlap_discussed": {
    227         "applies": true,
    228         "answer": true,
    229         "justification": "Section VI provides substantial analysis of train/test overlap: 'only 15% (20/128) of those patches are also found in the original training data.' They checked whether fixed functions appeared in training datasets of models where training data was accessible."
    230       },
    231       "benchmark_contamination_addressed": {
    232         "applies": true,
    233         "answer": true,
    234         "justification": "Section VI extensively discusses data leakage. They analyze whether developer patches appear in training data, report that 66% of correct fixes differ from developer patches, and note QuixBugs 'is not part of the training data as it has low number of stars on GitHub and contains synthetic bugs.'"
    235       }
    236     },
    237     "human_studies": {
    238       "pre_registered": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants are involved in the study. It is a benchmark evaluation of pre-trained language models."
    242       },
    243       "irb_or_ethics_approval": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants are involved in the study."
    247       },
    248       "demographics_reported": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants are involved in the study."
    252       },
    253       "inclusion_exclusion_criteria": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants are involved in the study."
    257       },
    258       "randomization_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants are involved in the study."
    262       },
    263       "blinding_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants are involved in the study."
    267       },
    268       "attrition_reported": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants are involved in the study."
    272       }
    273     },
    274     "cost_and_practicality": {
    275       "inference_cost_reported": {
    276         "applies": true,
    277         "answer": true,
    278         "justification": "Table VI reports patch generation speed (patches/min) for each model across repair settings. Section V-B1 notes 'generating 200 patches for each of the 3 settings (i.e., at most 600 patches in total) costs no more than 2.5 hours for each model.' Figure 4 reports compilation rates."
    279       },
    280       "compute_budget_stated": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Section IV-B specifies the hardware: '32-Core workstation with Ryzen Threadripper PRO 3975WX CPU, 256 GB RAM and NVIDIA RTX A6000 GPU, running Ubuntu 20.04.4 LTS.' Time per bug is bounded at 2.5 hours and per-model speed is reported in Table VI."
    284       }
    285     }
    286   },
    287   "claims": [
    288     {
    289       "claim": "Directly applying state-of-the-art PLMs can substantially outperform all existing APR techniques on all studied datasets.",
    290       "evidence": "Table VII shows Codex fixes 99 bugs on Defects4J 1.2 vs AlphaRepair's 67 (best baseline). Combined PLMs fix 109 bugs. Table VIII shows similar outperformance on Defects4J 2.0 (52 vs 35), QuixBugs-Java (38 vs 28), and QuixBugs-Python (40 vs 27).",
    291       "supported": "strong"
    292     },
    293     {
    294       "claim": "The scaling effect exists for APR where larger models tend to achieve better performance.",
    295       "evidence": "Tables III-V consistently show increasing correct patches as model size increases within the GPT family (125M → 1.3B → 2.7B → 6.7B → 20B) and INCODER family (1.3B → 6.7B) across all datasets.",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "Suffix code after the buggy line is important in generating more fixes and patches with higher compilation rate.",
    300       "evidence": "Table V shows Codex suffix outperforms Codex single-line across all datasets (e.g., 39 vs 32 correct on Defects4J 1.2). Figure 4 shows infilling models produce lower syntactic/semantic error rates.",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "PLMs consider correct patches to be more natural than incorrect patches based on entropy.",
    305       "evidence": "Table IX shows mean entropy values for correct patches are consistently lower than those for non-plausible patches across all models and settings (e.g., Codex function gen on Defects4J 1.2: 0.04 correct vs 0.08 non-plausible).",
    306       "supported": "strong"
    307     },
    308     {
    309       "claim": "PLM-based APR can be further substantially boosted by increasing sample size and incorporating fix templates.",
    310       "evidence": "Table X shows INCODER 6.7B improves from 37 to 64 correct fixes on Defects4J 1.2 when going from 200 to 2000 samples, and further to 78 with templates, surpassing all baselines.",
    311       "supported": "strong"
    312     }
    313   ],
    314   "methodology_tags": [
    315     "benchmark-eval"
    316   ],
    317   "key_findings": "Directly applying 9 pre-trained language models (ranging from 125M to 20B parameters) for automated program repair substantially outperforms 20 existing APR tools across 5 benchmarks in 3 languages. The scaling effect holds for APR, with larger models consistently fixing more bugs. Infilling models that use both prefix and suffix context outperform generative models that use prefix only, and entropy-based patch ranking enables faster identification of correct patches. Combining PLMs with simple repair templates and increased sampling further boosts performance, with INCODER 6.7B achieving 78 correct fixes on Defects4J 1.2 (vs 74 for the best baseline).",
    318   "red_flags": [
    319     {
    320       "flag": "No statistical significance tests",
    321       "detail": "All comparative claims are based on raw counts of bugs fixed without any statistical tests. Given the stochastic nature of nucleus sampling, the results could vary across runs, but no variance or significance testing is reported."
    322     },
    323     {
    324       "flag": "Placeholder data release URL",
    325       "detail": "The Figshare link [74] points to 'https://figshare.com/s/temp' which appears to be a placeholder, undermining the claim of public artifact release."
    326     },
    327     {
    328       "flag": "Potential data leakage not fully resolved",
    329       "detail": "While the paper acknowledges and partially investigates data leakage (Section VI), training data for Codex and INCODER is listed as 'N.R.' (not released), so overlap cannot be checked for the best-performing models. The 15% confirmed overlap rate among checkable models is non-trivial."
    330     },
    331     {
    332       "flag": "Missing funding disclosure",
    333       "detail": "No funding source or competing interests statement is provided, which is unusual for a paper from a major university research group."
    334     }
    335   ],
    336   "cited_papers": [
    337     {
    338       "title": "Evaluating large language models trained on code",
    339       "authors": ["Mark Chen", "Jerry Tworek"],
    340       "year": 2021,
    341       "arxiv_id": "2107.03374",
    342       "relevance": "Introduces Codex, a key model evaluated for code generation and program repair capabilities."
    343     },
    344     {
    345       "title": "Less training, more repairing please: Revisiting automated program repair via zero-shot learning",
    346       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    347       "year": 2022,
    348       "arxiv_id": "2207.08281",
    349       "relevance": "Introduces AlphaRepair, the zero-shot PLM-based APR baseline using CodeBERT."
    350     },
    351     {
    352       "title": "Incoder: A generative model for code infilling and synthesis",
    353       "authors": ["Daniel Fried", "Armen Aghajanyan"],
    354       "year": 2022,
    355       "arxiv_id": "2204.05999",
    356       "relevance": "Presents INCODER, a code infilling model evaluated for program repair tasks."
    357     },
    358     {
    359       "title": "Patch generation with language models: Feasibility and scaling behavior",
    360       "authors": ["Sophia D. Kolak", "Ruben Martins", "Claire Le Goues", "Vincent J. Hellendoorn"],
    361       "year": 2022,
    362       "relevance": "Prior work on PLMs for APR showing scaling behavior, evaluated on a smaller scale."
    363     },
    364     {
    365       "title": "Can OpenAI's Codex fix bugs?: An evaluation on QuixBugs",
    366       "authors": ["Julian A. Prenner", "Hlib Babii", "Romain Robbes"],
    367       "year": 2022,
    368       "relevance": "Prior small-scale evaluation of Codex for bug fixing on the QuixBugs benchmark."
    369     },
    370     {
    371       "title": "Competition-level code generation with AlphaCode",
    372       "authors": ["Yujia Li", "David Choi"],
    373       "year": 2022,
    374       "arxiv_id": "2203.07814",
    375       "relevance": "Demonstrates LLM capability in generating competition-level code, relevant to code generation evaluation."
    376     },
    377     {
    378       "title": "Cure: Code-aware neural machine translation for automatic program repair",
    379       "authors": ["Nan Jiang", "Thibaud Lutellier", "Lin Tan"],
    380       "year": 2021,
    381       "relevance": "State-of-the-art learning-based APR baseline using code-aware NMT techniques."
    382     },
    383     {
    384       "title": "A syntax-guided edit decoder for neural program repair",
    385       "authors": ["Qihao Zhu", "Zeyu Sun"],
    386       "year": 2021,
    387       "relevance": "Introduces Recoder, a learning-based APR tool used as a key baseline comparison."
    388     },
    389     {
    390       "title": "Neural program repair with execution-based backpropagation",
    391       "authors": ["He Ye", "Matias Martinez", "Martin Monperrus"],
    392       "year": 2022,
    393       "relevance": "Introduces RewardRepair, an execution-guided APR technique used as baseline."
    394     },
    395     {
    396       "title": "CodeT5: Identifier-aware unified pre-trained encoder-decoder models for code understanding and generation",
    397       "authors": ["Yue Wang", "Weishi Wang"],
    398       "year": 2021,
    399       "relevance": "Presents CodeT5, an encoder-decoder model for code tasks evaluated for program repair."
    400     },
    401     {
    402       "title": "Scaling laws for neural language models",
    403       "authors": ["Jared Kaplan", "Sam McCandlish"],
    404       "year": 2020,
    405       "arxiv_id": "2001.08361",
    406       "relevance": "Foundational work on scaling laws that motivates the study of model size effects on APR."
    407     },
    408     {
    409       "title": "Language models are few-shot learners",
    410       "authors": ["Tom B. Brown", "Benjamin Mann"],
    411       "year": 2020,
    412       "arxiv_id": "2005.14165",
    413       "relevance": "Introduces GPT-3 and few-shot learning paradigm used in the study's approach."
    414     }
    415   ]
    416 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs