scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30464B)
      1 {
      2   "paper": {
      3     "title": "Automated Repair of Programs from Large Language Models",
      4     "authors": [
      5       "Zhiyu Fan",
      6       "Xiang Gao",
      7       "Martin Mirchev",
      8       "Abhik Roychoudhury",
      9       "Shin Hwei Tan"
     10     ],
     11     "year": 2022,
     12     "venue": "arXiv preprint",
     13     "arxiv_id": "2205.10583",
     14     "doi": "10.48550/arXiv.2205.10583"
     15   },
     16   "scan_version": 3,
     17   "active_modules": [
     18     "experimental_rigor",
     19     "data_leakage"
     20   ],
     21   "methodology_tags": [
     22     "benchmark-eval",
     23     "qualitative"
     24   ],
     25   "key_findings": "Automatically generated code from Codex shares common defect patterns with human-written code, with 57% of bugs being algorithm-related and 11% syntax errors. Existing APR tools TBar and Recoder can fix only 9 and 11 incorrect solutions respectively, mostly limited to single-hunk fixes. Codex edit mode (Codex-estm) outperforms both by fixing 16 solutions and can generate patches at flexible locations beyond the given suspicious statement. Combining patch spaces of APR tools and Codex/Codex-e produces required ingredients for more complex multi-hunk fixes.",
     26   "checklist": {
     27     "artifacts": {
     28       "code_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "Section VII states 'we will make our scripts available upon acceptance' — a promise of future release, not an actual release. No repository URL or archive link is provided in the paper."
     32       },
     33       "data_released": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper proposes the LMDefects dataset with 113 Java programming tasks but provides no download URL or archive link. Section VII states 'We also release our dataset and classification result for public verification' but no concrete link is given."
     37       },
     38       "environment_specified": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Section II mentions hardware ('Ubuntu-16.04 server, with 64GB RAM and Intel(R) Xeon(R) CPU E5-2660 @ 2.00GHz, and NVIDIA Titan V GPU') but provides no software dependency specifications, requirements.txt, or library versions."
     42       },
     43       "reproduction_instructions": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No step-by-step reproduction instructions are provided. The methodology is described at a high level but without concrete commands or scripts to replicate the experiments."
     47       }
     48     },
     49     "statistical_methodology": {
     50       "confidence_intervals_or_error_bars": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "All results are reported as point estimates (raw counts of patches and fixed tasks). No confidence intervals or error bars are provided in any table or figure."
     54       },
     55       "significance_tests": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The paper compares TBar, Recoder, and Codex-e approaches based solely on raw counts (e.g., 9 vs 11 vs 16 correct patches) without any statistical significance tests."
     59       },
     60       "effect_sizes_reported": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "Results are reported as raw counts of patches and fixed tasks. No formal effect sizes (Cohen's d, odds ratios, etc.) are computed. While baseline context exists through comparison, the small numbers make formal effect sizes important."
     64       },
     65       "sample_size_justified": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The dataset comprises 113 tasks and 335 incorrect solutions, but no justification is given for why this sample size is adequate. No power analysis is discussed."
     69       },
     70       "variance_reported": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "Results are from single experimental runs. No variance, standard deviation, or spread measures are reported across multiple runs or seeds."
     74       }
     75     },
     76     "evaluation_design": {
     77       "baselines_included": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "The paper compares three repair approaches: TBar (pattern-based), Recoder (learning-based), and three variants of Codex-e (edit mode). Results are compared across all tools in Tables III, IV, V, and Figure 7."
     81       },
     82       "baselines_contemporary": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "TBar (2019) and Recoder (2021) are described as 'the most recent representative' of different APR approaches and 'have reported the best results by generating the highest number of correct patches on the Defects4J benchmark.' Codex-e was newly released in March 2022."
     86       },
     87       "ablation_study": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "The three Codex-e strategies (Codex-ebug, Codex-eline, Codex-estm) systematically vary the level of fault localization guidance, effectively ablating the contribution of location information. Table V and Section V analyze the impact."
     91       },
     92       "multiple_metrics": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "The paper reports plausible patches (pass public tests), correct patches (pass public + private tests), and correctly fixed tasks. These measure different aspects of repair effectiveness."
     96       },
     97       "human_evaluation": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Two annotators manually classified all 335 incorrect solutions into defect categories (Section III), constructed ground truth patches, and cross-validated results. They also manually analyzed Codex-e patches for flexible fault localization."
    101       },
    102       "held_out_test_set": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Public LeetCode tests guide APR tools, while private LeetCode tests serve as the held-out validation set. 'The patched solutions are then validated using (1) the public tests, and (2) the held-out (private) tests in the LeetCode platform.'"
    106       },
    107       "per_category_breakdown": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Tables II, IV, and V provide detailed breakdowns by defect category (14 sub-categories), difficulty level (easy/medium), and per-tool results for each category."
    111       },
    112       "failure_cases_discussed": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section IV discusses specific failure cases: Figure 5 shows a multi-hunk bug neither tool could fix due to lack of dependency awareness. Section III analyzes negative symptoms in auto-generated code (wrong algorithms, similar code blocks, irrelevant helpers)."
    116       },
    117       "negative_results_reported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper reports that existing APR tools are 'still quite limited' — TBar and Recoder fail on all multi-hunk bugs (0/62). They explicitly discuss three challenges: limited search space, inability to generate multi-edit patches, and lack of program dependency awareness."
    121       }
    122     },
    123     "claims_and_evidence": {
    124       "abstract_claims_supported": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Abstract claims are supported: (1) common mistakes finding is backed by Section III's defect classification; (2) Codex-e being 'similar to or better than' TBar/Recoder is supported by Tables III-V (16 vs 9 and 11 correct patches); (3) combining search spaces is supported by Table VI."
    128       },
    129       "causal_claims_justified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper makes causal claims like 'We think that Codex-estm outperforms Recoder because: (1) Codex-estm can produce complex patches at flexible locations; and (2) Codex-estm is trained on a much larger dataset.' These are speculative causal attributions without controlled experiments isolating these factors."
    133       },
    134       "generalization_bounded": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The title 'Automated Repair of Programs from Large Language Models' is broader than the evidence, which covers only one model (Codex) on Java LeetCode problems. While the threats section notes 'our experiments may not generalize beyond the studied configurations and other programming languages beyond Java,' the title and many implications sections frame findings broadly."
    138       },
    139       "alternative_explanations_discussed": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper does not consider alternative explanations for why Codex-e outperforms APR tools — e.g., differences in compute budget (50 attempts for Codex-e vs 15-minute timeout for APR tools), or whether the improvement is simply due to having more attempts rather than better repair capability."
    143       },
    144       "proxy_outcome_distinction": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The paper measures 'correct patches' (passing all public and private LeetCode tests) and 'plausible patches' (passing only public tests). These are directly what is claimed — no proxy gap between measurement and claims about repair effectiveness."
    148       }
    149     },
    150     "setup_transparency": {
    151       "model_versions_specified": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Specific model versions are stated: 'code-davinci-002' for Codex and 'code-davinci-edit-001' for Codex-e (Section II). Training data cutoff (June 2021) is also provided."
    155       },
    156       "prompts_provided": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Figure 2 shows the actual prompt format (function signature + problem description). Codex-e instructions are given verbatim: 'Fix bug in the program', 'Fix line N', 'Fix s1'. Stop sequences are listed: 'public', 'class', '//', 'System.out.print'."
    160       },
    161       "hyperparameters_reported": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section II reports: temperature 0.8, max token length 2048, 50 candidate solutions, best-of selection for top 5, stop sequences. Codex-e: 10 suspicious statements × 5 edits = 50 attempts, temperature 0.8. APR timeout: 15 minutes."
    165       },
    166       "scaffolding_described": {
    167         "applies": false,
    168         "answer": false,
    169         "justification": "No agentic scaffolding is used. The workflow is a simple pipeline: Codex generates solutions → test against public tests → APR tools fix failures → validate against private tests."
    170       },
    171       "data_preprocessing_documented": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section II documents: contests from July 2021 to April 2022, 40 weekly + 20 biweekly contests, easy and medium difficulty only, 7 tasks with customized data structures excluded, post-June-2021 temporal filter, manual conversion of public tests to JUnit format, and bracket mismatch pre-fixing."
    175       }
    176     },
    177     "limitations_and_scope": {
    178       "limitations_section_present": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section VII 'Threats to Validity' provides a dedicated discussion of both external and internal validity threats."
    182       },
    183       "threats_to_validity_specific": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Specific threats include: annotator bias (14 initial disagreements resolved through discussion), Java-only evaluation, single language model (Codex), specific configurations that may not generalize, reliance on the Codex-e developer's confirmation that Codex and Codex-e share the same training data, and potential bugs in automated scripts."
    187       },
    188       "scope_boundaries_stated": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The paper states: 'our experiments may not generalize beyond the studied configurations and other programming languages beyond Java' and 'our study only evaluates on the Codex language model and the Codex edit mode. The reported findings may not generalize beyond the studied model.'"
    192       }
    193     },
    194     "data_integrity": {
    195       "raw_data_available": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No download link or archive is provided for the LMDefects dataset. The paper promises release but provides no actual access to the raw data (incorrect solutions, ground truth patches, classification labels)."
    199       },
    200       "data_collection_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Section II details: LeetCode contests from July 4, 2021 to April 6, 2022, 40 weekly + 20 biweekly contests, easy and medium problems, temporal filter for contamination avoidance, resulting in 60 easy + 53 medium = 113 tasks."
    204       },
    205       "recruitment_methods_described": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No human participants. The data source is LeetCode contest problems, which is a publicly available problem set — not a recruited sample."
    209       },
    210       "data_pipeline_documented": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The pipeline is documented: crawl 60 contests → 113 tasks (60 easy, 53 medium) → generate 50 solutions each → select top 5 → 46 tasks solved, 67 unsolved → 335 incorrect solutions from unsolved tasks → manual defect classification by two annotators."
    214       }
    215     },
    216     "conflicts_of_interest": {
    217       "funding_disclosed": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No acknowledgments or funding section is present in the paper. No grants, sponsors, or funding agencies are mentioned."
    221       },
    222       "affiliations_disclosed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Author affiliations are clearly stated: National University of Singapore, Beihang University, and Southern University of Science and Technology. No commercial affiliation with OpenAI or the Codex product being evaluated."
    226       },
    227       "funder_independent_of_outcome": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "Cannot assess funder independence because no funding source is disclosed. The authors use OpenAI's Codex API but are from independent academic institutions."
    231       },
    232       "financial_interests_declared": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No competing interests or financial disclosure statement is present in the paper."
    236       }
    237     },
    238     "contamination": {
    239       "training_cutoff_stated": {
    240         "applies": true,
    241         "answer": true,
    242         "justification": "Section II states: 'both trained on data up to Jun 2021' for both Codex code-davinci-002 and Codex-e code-davinci-edit-001."
    243       },
    244       "train_test_overlap_discussed": {
    245         "applies": true,
    246         "answer": true,
    247         "justification": "Explicitly addressed: 'To prevent the case where the collected dataset was used in the training set of Codex, we only consider contests that are released after Jun 2021.' Section VII also confirms with the Codex-e developer that training data is the same."
    248       },
    249       "benchmark_contamination_addressed": {
    250         "applies": true,
    251         "answer": true,
    252         "justification": "The temporal split (only contests after June 2021) directly addresses contamination risk. The paper explicitly discusses why existing datasets were avoided: 'Codex was already trained on GitHub where solutions for many previous programming tasks exist (e.g., APPS, CodeContest).'"
    253       }
    254     },
    255     "human_studies": {
    256       "pre_registered": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in the study. The two annotators who classified defects are researchers performing analysis, not study participants."
    260       },
    261       "irb_or_ethics_approval": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants. The study analyzes auto-generated code and applies automated/manual repair tools."
    265       },
    266       "demographics_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in the study."
    270       },
    271       "inclusion_exclusion_criteria": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in the study."
    275       },
    276       "randomization_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in the study."
    280       },
    281       "blinding_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in the study."
    285       },
    286       "attrition_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants in the study."
    290       }
    291     },
    292     "cost_and_practicality": {
    293       "inference_cost_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No API costs reported for Codex calls despite generating 50 solutions per task (113 tasks) and 50 edit attempts per incorrect solution for Codex-e. No tokens consumed or cost per task is mentioned."
    297       },
    298       "compute_budget_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "Hardware is listed (server specs, GPU) and APR timeout is 15 minutes, but total compute budget (GPU hours, API spend, total wall-clock time) is not stated."
    302       }
    303     },
    304     "experimental_rigor": {
    305       "seed_sensitivity_reported": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "Temperature 0.8 introduces stochasticity in generation but no seed sensitivity analysis is performed. Results are from a single experimental run with no variance across seeds."
    309       },
    310       "number_of_runs_stated": {
    311         "applies": true,
    312         "answer": true,
    313         "justification": "The paper states: 50 candidate solutions generated per task, top 5 selected. For Codex-e: 5 possible edits for each of 10 suspicious statements (50 attempts total). APR tools run once with 15-minute timeout."
    314       },
    315       "hyperparameter_search_budget": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "Settings are adopted from prior work without reporting a search budget. The paper states temperature 0.8 has 'the best performance' citing prior work, but no search over hyperparameters is described or budgeted for this study."
    319       },
    320       "best_config_selection_justified": {
    321         "applies": true,
    322         "answer": true,
    323         "justification": "All three Codex-e strategies are reported without cherry-picking. Configuration choices (temperature, sampling) are justified by citing prior work: 'We reuse the same setting as prior work.'"
    324       },
    325       "multiple_comparison_correction": {
    326         "applies": false,
    327         "answer": false,
    328         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    329       },
    330       "self_comparison_bias_addressed": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "The authors evaluate existing tools (TBar, Recoder, Codex-e) they did not build, which mitigates self-comparison bias. However, they do not explicitly acknowledge or discuss this potential bias."
    334       },
    335       "compute_budget_vs_performance": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "Codex-e gets 50 attempts per solution while APR tools get a 15-minute timeout. Performance is not compared at matched compute budgets, and the asymmetry in compute/attempts is not discussed as a potential confound."
    339       },
    340       "benchmark_construct_validity": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "The paper does not discuss whether LeetCode contest problems are representative of real-world auto-generated code repair scenarios. LeetCode problems are algorithmic competition tasks, which may differ substantially from production code generation use cases."
    344       },
    345       "scaffold_confound_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "TBar uses statistical fault localization, Recoder uses its own learned localization, and Codex-e uses three different instruction strategies. These scaffolding differences are not controlled when comparing tools — performance differences could stem from the scaffolding rather than the core repair capability."
    349       }
    350     },
    351     "data_leakage": {
    352       "temporal_leakage_addressed": {
    353         "applies": true,
    354         "answer": true,
    355         "justification": "The paper explicitly uses temporal split: only LeetCode contests released after June 2021 (Codex training cutoff) to prevent the model from having seen solutions."
    356       },
    357       "feature_leakage_addressed": {
    358         "applies": true,
    359         "answer": true,
    360         "justification": "The study cleanly separates public tests (used for repair guidance) from private tests (used for validation). The evaluation setup is transparent about what information each tool receives."
    361       },
    362       "non_independence_addressed": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No discussion of whether LeetCode problems from the same contests share structural similarities, or whether 5 solutions from the same task are treated as independent observations when they come from the same prompt."
    366       },
    367       "leakage_detection_method": {
    368         "applies": true,
    369         "answer": true,
    370         "justification": "Temporal split is used as a concrete prevention method. Additionally, the authors 'confirmed with the developer of Codex-e that Codex and Codex-e use the same dataset for training,' verifying no additional training contamination."
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "Auto-generated code from Codex shares common programming mistakes with human-crafted solutions, with 57% being algorithm-related bugs and 11% syntax errors.",
    377       "evidence": "Section III, Table II: Manual classification of 335 incorrect solutions into 14 defect categories by two annotators. Defect overlap with Codeflaws benchmark confirmed.",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "Existing APR tools (TBar and Recoder) can fix only a small number of bugs in auto-generated code — 9 and 11 correct patches respectively.",
    382       "evidence": "Section IV, Table III: TBar produces 6 easy + 3 medium correct patches (6 tasks fixed); Recoder produces 6 easy + 5 medium correct patches (8 tasks fixed). Neither tool fixes any multi-hunk bugs.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Codex edit mode (Codex-estm) outperforms TBar and Recoder by producing 16 correct patches, including multi-hunk fixes.",
    387       "evidence": "Section V, Table V: Codex-estm fixes 14 single-hunk + 2 multi-hunk = 16 solutions total. Figure 7 Venn diagram shows TBar patches are a proper subset of Codex-estm ∪ Recoder.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "The effectiveness of Codex-e with specific fault location guidance (Codex-estm, 16 fixes) is nearly comparable to its effectiveness without any location guidance (Codex-ebug, 15 fixes).",
    392       "evidence": "Section V, Table V: Codex-ebug fixes 15 solutions, Codex-estm fixes 16. The fixed defect categories differ — Codex-ebug fixes more multi-hunk, Codex-estm fixes more single-hunk.",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "Combining patch spaces of TBar and Codex-e produces required ingredients for more complex fixes (9 solutions vs 4 and 5 individually).",
    397       "evidence": "Section V-B, Table VI: TBar+Codex-e covers 9 incorrect solutions' patch ingredients, with 2 not coverable by either tool alone. TBar+Codex covers 12 total.",
    398       "supported": "moderate"
    399     }
    400   ],
    401   "red_flags": [
    402     {
    403       "flag": "Small sample sizes for comparisons",
    404       "detail": "Tool comparisons are based on very small fix counts (e.g., TBar: 9, Recoder: 11, Codex-estm: 16). Differences of 2-7 fixes without statistical testing make it difficult to distinguish signal from noise."
    405     },
    406     {
    407       "flag": "No statistical testing or uncertainty quantification",
    408       "detail": "All comparisons between tools are based on raw counts without any significance tests, confidence intervals, or repeated runs. The stochastic nature of Codex generation (temperature 0.8) means results could vary substantially across runs."
    409     },
    410     {
    411       "flag": "Uncontrolled compute asymmetry",
    412       "detail": "Codex-e gets 50 attempts per solution (10 statements × 5 edits) while APR tools get a 15-minute timeout. The compute budget differs substantially across tools, making direct comparison of 'effectiveness' potentially misleading."
    413     },
    414     {
    415       "flag": "Dataset not actually released",
    416       "detail": "The paper proposes LMDefects as a contribution and states that the dataset is released, but provides no URL or archive link; the scripts are only promised for future release ('we will make our scripts available upon acceptance'). The dataset availability claim is unverifiable."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "Evaluating large language models trained on code",
    422       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    423       "year": 2021,
    424       "arxiv_id": "2107.03374",
    425       "relevance": "Introduces Codex and the HumanEval benchmark for evaluating LLM code generation — the primary model evaluated in this study."
    426     },
    427     {
    428       "title": "Competition-level code generation with AlphaCode",
    429       "authors": ["Y. Li", "D. Choi", "J. Chung"],
    430       "year": 2022,
    431       "arxiv_id": "2203.07814",
    432       "relevance": "Major LLM code generation system evaluated on competitive programming, compared in this study's introduction."
    433     },
    434     {
    435       "title": "Measuring coding challenge competence with APPS",
    436       "authors": ["D. Hendrycks", "S. Basart", "S. Kadavath"],
    437       "year": 2021,
    438       "relevance": "Proposes the APPS benchmark for evaluating code generation on programming challenges — establishes pass@k evaluation methodology used in this study."
    439     },
    440     {
    441       "title": "TBar: Revisiting template-based automated program repair",
    442       "authors": ["K. Liu", "A. Koyuncu", "D. Kim", "T. F. Bissyandé"],
    443       "year": 2019,
    444       "relevance": "State-of-the-art pattern-based APR tool used as primary baseline for fixing LLM-generated code."
    445     },
    446     {
    447       "title": "A syntax-guided edit decoder for neural program repair",
    448       "authors": ["Q. Zhu", "Z. Sun", "Y.-a. Xiao"],
    449       "year": 2021,
    450       "relevance": "Recoder — learning-based APR tool with syntax-guided decoder, used as primary baseline and best-performing DL-based repair tool on Defects4J."
    451     },
    452     {
    453       "title": "Automatically finding patches using genetic programming",
    454       "authors": ["W. Weimer", "T. Nguyen", "C. L. Goues", "S. Forrest"],
    455       "year": 2009,
    456       "relevance": "GenProg — foundational search-based APR technique that established the generate-and-validate repair paradigm."
    457     },
    458     {
    459       "title": "Can OpenAI Codex and other large language models help us fix security bugs?",
    460       "authors": ["H. Pearce", "B. Tan", "B. Ahmad", "R. Karri", "B. Dolan-Gavitt"],
    461       "year": 2021,
    462       "arxiv_id": "2112.02125",
    463       "relevance": "Directly relevant study on using LLMs for bug fixing, focusing on security vulnerabilities."
    464     },
    465     {
    466       "title": "Can OpenAI's Codex fix bugs?: An evaluation on QuixBugs",
    467       "authors": ["J. A. Prenner", "H. Babii", "R. Robbes"],
    468       "year": 2022,
    469       "relevance": "Closely related work evaluating Codex's ability to fix bugs, using the QuixBugs benchmark."
    470     },
    471     {
    472       "title": "An empirical evaluation of GitHub Copilot's code suggestions",
    473       "authors": ["N. Nguyen", "S. Nadi"],
    474       "year": 2022,
    475       "relevance": "Evaluates Copilot code quality on LeetCode tasks — directly comparable to this study's evaluation setup."
    476     },
    477     {
    478       "title": "Jigsaw: Large language models meet program synthesis",
    479       "authors": ["N. Jain", "S. Vaidyanath", "A. Iyer"],
    480       "year": 2022,
    481       "relevance": "Combines program synthesis with LLMs for code repair, an approach this paper recommends for future work."
    482     },
    483     {
    484       "title": "CoCoNuT: combining context-aware neural translation models using ensemble for program repair",
    485       "authors": ["T. Lutellier", "H. V. Pham", "L. Pang"],
    486       "year": 2020,
    487       "relevance": "Context-aware neural APR approach that encodes program context for patch generation."
    488     },
    489     {
    490       "title": "CURE: Code-aware neural machine translation for automatic program repair",
    491       "authors": ["N. Jiang", "T. Lutellier", "L. Tan"],
    492       "year": 2021,
    493       "relevance": "DL-based APR tool using a programming language model for repair, representative of learning-based approaches."
    494     },
    495     {
    496       "title": "Program synthesis with large language models",
    497       "authors": ["J. Austin", "A. Odena", "M. Nye"],
    498       "year": 2021,
    499       "arxiv_id": "2108.07732",
    500       "relevance": "Foundational study on using LLMs for program synthesis, evaluating multiple models on code generation benchmarks."
    501     }
    502   ],
    503   "engagement_factors": {
    504     "practical_relevance": {
    505       "score": 2,
    506       "justification": "The idea of using APR to fix LLM-generated code is practically relevant to anyone using Copilot, but no tool is released for direct use."
    507     },
    508     "surprise_contrarian": {
    509       "score": 1,
    510       "justification": "The finding that Codex-e outperforms dedicated APR tools is mildly surprising, but the general limitations of APR tools are well known."
    511     },
    512     "fear_safety": {
    513       "score": 0,
    514       "justification": "No safety or security concerns raised; the study focuses on code correctness for programming contest problems."
    515     },
    516     "drama_conflict": {
    517       "score": 0,
    518       "justification": "No controversy — straightforward empirical comparison with balanced discussion of tool strengths and weaknesses."
    519     },
    520     "demo_ability": {
    521       "score": 0,
    522       "justification": "No code, tool, or demo is released; the dataset is stated as released but no link is provided, and scripts are only promised upon acceptance."
    523     },
    524     "brand_recognition": {
    525       "score": 2,
    526       "justification": "Evaluates OpenAI's Codex (the model behind GitHub Copilot), which is a well-known product."
    527     }
    528   }
    529 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs