scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24214B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Automated Repair of Programs from Large Language Models",
      6     "authors": [
      7       "Zhiyu Fan",
      8       "Xiang Gao",
      9       "Martin Mirchev",
     10       "Abhik Roychoudhury",
     11       "Shin Hwei Tan"
     12     ],
     13     "year": 2022,
     14     "venue": "arXiv",
     15     "arxiv_id": "2205.10583",
     16     "doi": "10.48550/arXiv.2205.10583"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All abstract claims are substantiated: defect overlap with human code is shown by Table II manual analysis of 335 solutions, Codex-e parity/superiority over TBar/Recoder is shown by Tables III and V.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper makes comparative claims ('Codex-estm produces the best results') but relies on raw counts with no statistical tests; differences between tools (e.g., 16 vs 9 vs 11 correct patches) are never tested for significance.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Section VII explicitly acknowledges results may not generalize beyond Java, the studied configurations, or beyond Codex; hedged language ('may have potential') is used throughout the abstract.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "The paper discusses why Codex-e outperforms (larger training data, flexible fault localization) and why TBar vs Recoder differ (search space vs learned patterns), providing multiple mechanistic explanations for observed outcomes.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly distinguishes 'plausible patches' (pass public tests) from 'correct patches' (pass LeetCode private held-out tests), making the measurement hierarchy clear throughout.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section VII 'Threats to Validity' exists and covers both external and internal threats with specific discussion.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats named: Java-only, single LLM evaluated, Codex-e algorithm undocumented (black-box), annotator disagreements in defect labeling (14 initial disagreements), and automated script bugs.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Scope explicitly bounded to easy/medium LeetCode Java tasks, Codex model, contests after Jun 2021; seven tasks requiring customized data structures explicitly excluded.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment appears anywhere in the paper.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All five author affiliations are listed on the title page (NUS, Beihang University, SUSTech).",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding disclosed, so independence cannot be assessed.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement appears in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "APR is defined (Section I), 'plausible' vs 'correct' patches are explicitly defined (Section IV), Codex-e modes (Codex-ebug, Codex-eline, Codex-estm) are precisely defined in Section V.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The contributions section lists three explicit contributions: systematic study of APR on LLM code, first evaluation of Codex edit mode as APR tool, and the LMDefects dataset.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section VIII explicitly positions this work relative to Nguyen et al. (Copilot evaluation on 33 LeetCode tasks vs this paper's 113), and situates Codex-e evaluation as a first-of-its-kind study in the APR and code LLM literature.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "Section VII states 'we will make our scripts available upon acceptance' — a promise of future release, not actual release at time of publication.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Section VII states 'We also release our dataset and classification result for public verification' — LMDefects is claimed to be released with the paper.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Only the operating system and hardware are specified (Ubuntu 16.04, 64GB RAM, Intel Xeon, NVIDIA Titan V GPU); no software dependency list, requirements file, or Dockerfile is provided.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The workflow is described at a high level (Figure 1) but step-by-step reproduction instructions are absent; scripts are only promised upon acceptance.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results are reported as absolute patch counts with no confidence intervals or error bars anywhere in the paper.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied to comparative claims (e.g., Recoder fixes 8 tasks vs TBar's 6) despite multiple tool comparisons.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "Results are given as raw counts (correct patches, correctly fixed tasks); no effect sizes, Cohen's d, or normalized improvement metrics are reported.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The 113-task LMDefects dataset size is not justified; no power analysis or sample size rationale is provided.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Codex generates 50 candidates per task but variance in patch generation outcomes across runs is never reported; only point estimates of fix counts appear.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "TBar (pattern-based) and Recoder (learning-based) are both used as baselines against which Codex-e is evaluated.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "TBar (ISSTA 2019) and Recoder (FSE 2021) are the best-performing open-source Java APR tools on Defects4J at the time of the study.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Three Codex-e instruction variants (Codex-ebug, Codex-eline, Codex-estm) constitute an ablation of guidance level and specificity, with results in Table V.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Both 'plausible patches' (pass public tests) and 'correct patches' (pass private tests) are used, plus per-defect-category breakdown.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "Two authors independently constructed and cross-validated ground truth patches for all 335 incorrect solutions; 14 initial disagreements resolved by discussion.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "LeetCode's private test suite serves as a held-out test set, with patched solutions submitted to the LeetCode judge for final validation.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Tables IV and V break down correctly fixed solutions by all defect sub-categories (S-O, S-C, S-V, M-S, M-U, M-L, etc.) for each tool.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Figure 5 shows a specific multi-hunk bug that all tools fail to fix, with explanation of why statistical fault localization breaks down on program-dependent bugs.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The study's main finding is that existing APR tools fix very few bugs: TBar fixes 6/67 tasks, Recoder 8/67, with multi-hunk failures totaling 0/62 solutions for both tools.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Exact model IDs specified: code-davinci-002 (Codex) and code-davinci-edit-001 (Codex-e), both stated to be trained on data up to Jun 2021.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Figure 2 shows a complete example prompt (function signature + Javadoc problem description); Codex-e instruction templates are specified verbatim ('Fix bug in the program', 'Fix line N', 'Fix s1').",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Temperature 0.8, max tokens 2048, stop sequences ('public', 'class', '//', 'System.out.print'), 50 candidates generated per task with top-5 selection — all reported.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Figure 1 shows the complete workflow; fault localization integration with Codex-e is described in detail, including the 10 most suspicious statements × 5 edits = 50 attempts per solution.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Data collection procedure documented: LeetCode contests from 4 July 2021 to 6 April 2022, easy/medium only, exclusion of hard problems and 7 tasks requiring custom data structures, public tests manually converted to JUnit.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "LMDefects dataset and defect classification results are stated to be released for public verification (Section VII).",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Collection procedure fully described: crawled 40 weekly + 20 biweekly LeetCode contests, 4 Jul 2021–6 Apr 2022, resulting in 60 easy + 53 medium tasks.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants; data is collected from a public competitive programming platform.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Full pipeline documented: Codex generation → public test validation → APR tool application → LeetCode private test submission, with all parameters specified.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": true,
    296           "justification": "Both Codex and Codex-e training data cutoff explicitly stated as June 2021.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": true,
    302           "justification": "The study specifically designs around contamination by only using LeetCode contests released after June 2021 to ensure no overlap with Codex's training data.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": true,
    308           "justification": "The authors report confirming with the Codex-e developers that Codex and Codex-e share the same training dataset; LMDefects tasks all postdate the Jun 2021 cutoff, explicitly preventing contamination.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in the study.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in the study.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in the study.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in the study.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in the study.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in the study.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in the study.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No Codex API costs or inference latency are reported despite using a paid API to generate 50 candidates × 113 tasks = 5,650 Codex queries plus Codex-e queries.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Only the per-repair timeout (15 minutes) is stated; total computational budget for the full experiment is not reported.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Auto-generated code from Codex shares common defect categories with human-written code, with similar mutation operators and multi-hunk fix patterns overlapping with Codeflaws.",
    375       "evidence": "Manual analysis of 335 incorrect solutions classified by two annotators using the Codeflaws defect taxonomy; defect categories (S-O, S-V, M-U, etc.) directly overlap.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "57% of Codex bugs are algorithm-related (misaligned algorithm) and 11% are syntax errors, making them largely inaccessible to existing pattern-based APR.",
    380       "evidence": "Table II shows 191/335 solutions classified as 'Misaligned Algorithm' and 37/335 as syntax errors.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Existing APR tools (TBar, Recoder) are very limited at fixing Codex-generated bugs: TBar fixes 6/67 tasks, Recoder fixes 8/67 tasks.",
    385       "evidence": "Table III reports correct patches per tool; neither tool fixes any multi-hunk bugs (Table IV shows 0 correct patches for M-S/M-U/M-L categories).",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Codex edit mode with statement-level fault localization (Codex-estm) outperforms both TBar and Recoder, producing 16 correct patches vs 9 (TBar) and 11 (Recoder).",
    390       "evidence": "Table V shows Codex-estm fixes 14 single-hunk and 2 multi-hunk solutions; Figure 7 Venn diagram shows TBar's patches are a subset of Codex-estm ∪ Recoder.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Codex-e without any location guidance (Codex-ebug) performs nearly as well as with statement-level guidance (Codex-estm): 15 vs 16 correct patches.",
    395       "evidence": "Table V comparison of Codex-ebug (8+3 easy/medium single-hunk, 2+2 multi-hunk) vs Codex-estm (10+4 single-hunk, 2+0 multi-hunk).",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "Combining TBar and Codex-e patch ingredients covers required patches for 9 solutions versus 4-5 for individual tools; adding multiple Codex candidates (TBar+Codex) extends coverage to 12.",
    400       "evidence": "Table VI shows patch ingredient coverage across S-HO, M-S, and M-U defect categories for TBar, Codex-e, TBar+Codex-e, and TBar+Codex combinations.",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "case-study",
    407     "observational"
    408   ],
    409   "key_findings": "Codex-generated Java programs fail on LeetCode contests primarily due to algorithm misalignment (57%) and syntax errors (11%), with defect patterns closely overlapping human programmer mistakes. Existing pattern-based (TBar) and learning-based (Recoder) APR tools fix only 6-8 of 67 unsolved tasks because they cannot handle multi-hunk bugs or diverse patch ingredients. Codex edit mode with statement-level guidance (Codex-estm) modestly outperforms both APR tools (16 vs 9-11 correct patches) and uniquely produces flexible multi-location fixes; surprisingly, giving no location guidance (Codex-ebug) achieves nearly the same count (15 fixes) while fixing more multi-hunk bugs. Combining TBar's pattern space with Codex-e or multiple Codex candidates provides patch ingredients for more complex bugs than either approach alone.",
    410   "red_flags": [
    411     {
    412       "flag": "No statistical testing",
    413       "detail": "All tool comparisons are raw patch counts (e.g., 16 vs 11 vs 9 correct patches) with no significance tests, confidence intervals, or effect sizes; differences could easily be within noise on a 67-task dataset."
    414     },
    415     {
    416       "flag": "Very small dataset",
    417       "detail": "113 total tasks (67 unsolved) is extremely small for drawing conclusions about comparative tool effectiveness; many cells in Tables IV-V contain single-digit counts."
    418     },
    419     {
    420       "flag": "Code not released at publication",
    421       "detail": "Scripts promised 'available upon acceptance' rather than released with the paper, making independent verification impossible at time of publication."
    422     },
    423     {
    424       "flag": "Single-LLM, single-language scope",
    425       "detail": "Study is limited to Codex on Java LeetCode problems; all conclusions about 'auto-generated code' behavior are bounded to this narrow configuration despite broad framing."
    426     },
    427     {
    428       "flag": "Codex-e is a black box",
    429       "detail": "The underlying algorithm of Codex edit mode is undocumented; the paper cannot explain mechanism of improvement beyond speculation about training data size and flexible fault localization."
    430     }
    431   ],
    432   "cited_papers": [
    433     {
    434       "title": "Evaluating Large Language Models Trained on Code (Codex/HumanEval)",
    435       "relevance": "Primary subject of study; provides pass@k metrics and APPS baseline results used for comparison"
    436     },
    437     {
    438       "title": "Competition-Level Code Generation with AlphaCode",
    439       "relevance": "Contemporary LLM code generation system; provides comparative pass rates on competition tasks"
    440     },
    441     {
    442       "title": "Measuring Coding Challenge Competence with APPS",
    443       "relevance": "Benchmark dataset for code generation evaluation; provides context for LeetCode-based LMDefects"
    444     },
    445     {
    446       "title": "TBar: Revisiting Template-based Automated Program Repair",
    447       "relevance": "Primary APR baseline tool evaluated in the study; pattern-based Java repair"
    448     },
    449     {
    450       "title": "A Syntax-Guided Edit Decoder for Neural Program Repair (Recoder)",
    451       "relevance": "Primary learning-based APR baseline tool; syntax-guided decoder approach"
    452     },
    453     {
    454       "title": "Codeflaws: A Programming Competition Benchmark for Evaluating Automated Program Repair Tools",
    455       "relevance": "Defect taxonomy used to classify Codex bugs; shows overlap between human and LLM programming errors"
    456     },
    457     {
    458       "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies",
    459       "relevance": "Standard Java APR benchmark used to select TBar and Recoder as representative tools"
    460     },
    461     {
    462       "title": "An Empirical Evaluation of GitHub Copilot's Code Suggestions",
    463       "relevance": "Most directly related prior work; evaluates Copilot on 33 LeetCode tasks vs this paper's 113"
    464     }
    465   ],
    466   "engagement_factors": {
    467     "practical_relevance": {
    468       "score": 2,
    469       "justification": "Directly relevant to developers using Codex/GitHub Copilot for code generation, offering concrete strategies (fault-localization-guided Codex-e) to improve output quality."
    470     },
    471     "surprise_contrarian": {
    472       "score": 2,
    473       "justification": "Counterintuitive finding that Codex-e with no location guidance (Codex-ebug) nearly matches statement-level guidance, and that Codex-e outperforms dedicated APR tools built specifically for this task."
    474     },
    475     "fear_safety": {
    476       "score": 0,
    477       "justification": "No AI safety or risk concerns raised; purely a software engineering effectiveness study."
    478     },
    479     "drama_conflict": {
    480       "score": 0,
    481       "justification": "Standard academic tool comparison with no controversy or conflict angle."
    482     },
    483     "demo_ability": {
    484       "score": 1,
    485       "justification": "The LMDefects dataset is released, enabling replication, but the scripts are not yet available; Codex API access is required and is no longer publicly available."
    486     },
    487     "brand_recognition": {
    488       "score": 2,
    489       "justification": "Codex/GitHub Copilot is a high-recognition product; Abhik Roychoudhury is a well-known APR researcher."
    490     }
    491   },
    492   "hn_data": {
    493     "threads": [],
    494     "top_points": 0,
    495     "total_points": 0,
    496     "total_comments": 0
    497   }
    498 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs