scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30022B)
      1 {
      2   "paper": {
      3     "title": "RePair: Automated Program Repair with Process-based Feedback",
      4     "authors": [
      5       "Yuze Zhao",
      6       "Zhenya Huang",
      7       "Yixiao Ma",
      8       "Rui Li",
      9       "Kai Zhang",
     10       "Hao Jiang",
     11       "Qi Liu",
     12       "Linbo Zhu",
     13       "Yu Su"
     14     ],
     15     "year": 2024,
     16     "venue": "Annual Meeting of the Association for Computational Linguistics (Findings)",
     17     "arxiv_id": "2408.11296",
     18     "doi": "10.18653/v1/2024.findings-acl.973"
     19   },
     20   "scan_version": 3,
     21   "active_modules": ["experimental_rigor", "data_leakage"],
     22   "methodology_tags": ["benchmark-eval"],
     23   "key_findings": "RePair demonstrates that process-based feedback (SFT + PPO with a reward model critic) enables a 15B parameter model to achieve competitive performance with commercial LLMs like GPT-3.5 and Claude2 on automated program repair. The ablation study shows both process supervision and reward model feedback are necessary — process supervision without feedback actually degrades performance significantly (pass@1 drops from 36.32 to 20.43). Pairwise ranking outperforms point-wise and list-wise alternatives for reward modeling. Small-scale models (< 20B parameters) struggle to utilize explicit prompt-based feedback from compilers and test cases, motivating the RL approach.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper states 'Code and data are publicly available at https://github.com/TnTWoW/RePair' in the abstract footnote."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The same GitHub URL claims data availability. The CodeNet4Repair dataset is described as Apache-2.0 licensed (Section 2.3) and the underlying CodeNet dataset is also public."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper mentions Python 3.11.3 for testing, ZeRO++ for distributed training, and AdamW optimizer, but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No step-by-step reproduction instructions are provided in the paper. Training details are described but there is no 'Reproducing Results' section or instructions for running the experiments."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Table 3 and Table 4 report only point estimates for pass@k. No confidence intervals, error bars, or ± notation are provided."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper claims their model 'outperforms all other open-source models' and achieves 'competitive results' with commercial LLMs based solely on raw number comparisons. No statistical significance tests are performed."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Results are reported as raw pass@k numbers in tables. The paper does not explicitly report effect sizes, percentage improvements, or Cohen's d. Readers must compute differences themselves from the tables."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The test set contains 61 problems and 10,144 repair records (Table 2). No justification is given for why this size is adequate, and no power analysis is discussed."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be from single training runs with the unbiased pass@k estimator applied to generated samples, but no cross-run variance is reported."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 3 includes extensive baselines: 4 closed-source models (PaLM, GPT-3.5, Claude2, ChatGLM-Pro) and 6 open-source models (StarCoderBase/Chat, CodeGen2, CodeGeeX2, LLaMA2, LLaMA2-Chat)."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Baselines include GPT-3.5 (gpt-3.5-turbo-0613), Claude2, LLaMA2, and CodeGeeX2, which were all contemporary models at the time of the paper (2024)."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Table 4 presents a thorough ablation study examining process supervision, feedback, and reward function design (pairwise, pointwise, listwise ranking) across six different configurations."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper reports pass@1, pass@3, and pass@5, each at three difficulty levels (Easy, Medium, Hard) as well as overall. Per the schema, these count as multiple metrics."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No human evaluation of the system's outputs is performed. Evaluation is entirely automated via pass@k using test case execution."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 4.1 states: 'To prevent data leakage, we divided CodeNet4Repair based on the problem ID in a ratio of 9:1.' The test set's 61 problems are disjoint from the training set's 563 problems."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Table 3 breaks down all results by difficulty level (Easy, Medium, Hard) in addition to overall scores."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "The qualitative examples in Appendix A.3 and Figure 6 show only successful repairs. No failure cases or error analysis of where the model breaks down are discussed."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Table 4 shows that process supervision without feedback severely degrades performance (pass@1 drops from 36.32 to 20.43). Section 4.8 shows small models struggle with explicit prompt feedback. These are genuine negative findings."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The abstract claims process-based feedback 'outperforms larger outcome-based generation methods' (Table 3: 44.34 pass@1 vs LLaMA2-Chat-70B's 24.63) and 'nearly matches...closed-source commercial large-scale LMs' (44.34 vs GPT-3.5's 46.39 and Claude2's 43.98). Both claims are supported by the results."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper claims process-based feedback improves repair performance. The ablation study in Table 4 uses controlled single-variable manipulation (adding/removing process supervision and feedback independently), which is adequate for the causal claims made."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Section 7 (Limitation) explicitly acknowledges scope limits: 'Due to time and resource constraints, we were unable to collect all repair processes across different languages and software engineering domains.' The authors restrict their claims to Python competition-level programs."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper does not discuss alternative explanations for the observed performance gains beyond the ablation study. For example, it does not consider whether the improvement comes from additional training data exposure, the specific properties of CodeNet problems, or other confounding factors."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper measures pass@k (programs passing test cases) and frames results as program repair capability. The measurement closely matches the claim. Section 7 acknowledges the gap between competition and engineering scenarios."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "StarCoderBase 15.5B is specified. Baselines include specific versions: 'gpt-3.5-turbo-0613' (Section 4.3), 'chat-bison-001' for PaLM, and specific model sizes for all open-source models."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Figure 7 (Appendix A.1) provides the full prompt templates used for both standard SFT and explicit feedback experiments, including system prompt, user prompt, and assistant response format."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 4.2 reports detailed hyperparameters: learning rate 2e-5 for SFT, 9.6e-6 for RM, 9e-6 for RL, cosine schedule, K=9 programs for RM, batch size 512, KL penalty β=0.02, top_p=0.95, top_k=50, temperature=0.2."
    163       },
    164       "scaffolding_described": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "No agentic scaffolding is used. The system is a trained RL policy with iterative generation guided by a reward model — this is a training/inference procedure, not agentic scaffolding with tools, memory, or agent frameworks."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Sections 2.1–2.3 document the full pipeline: HTML extraction of problem descriptions, preliminary filtering (removing duplicates, malicious submissions, privacy breaches), fine filtering with Python 3.11.3 re-execution (1,227,259 → 278,408 programs), and organization into procedural format."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 7 is titled 'Limitation' and provides substantive discussion of the work's scope constraints."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section 7 identifies specific threats: limited to Python language, limited to competition scenarios, unable to collect repair processes across different languages and engineering domains. These are specific to this study rather than generic disclaimers."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 7 states: 'Due to time and resource constraints, we were unable to collect all repair processes across different languages and software engineering domains.' It explicitly identifies what was not tested (other languages, engineering scenarios)."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The paper claims code and data are publicly available at https://github.com/TnTWoW/RePair, and the underlying CodeNet dataset is also publicly available (Puri et al., 2021)."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Sections 2.1–2.3 describe the data collection procedure in detail: sourcing from CodeNet, HTML extraction of problem descriptions, filtering criteria, Python 3.11.3 re-execution for status verification, and test case collection."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": false,
    205         "answer": false,
    206         "justification": "No human participants. Data comes from CodeNet, a public programming competition dataset."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "The pipeline is documented with specific numbers at each stage: 1,227,259 programs after preliminary filtering → 278,408 after fine filtering with Python 3.11.3 re-execution → 94,062 training records and 10,144 test records. Filtering criteria are explained at each stage."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Section 8 (Acknowledgement) lists funding: NSFC (No. 62106244), Anhui Provincial Natural Science Foundation (No. 2308085QF229), Fundamental Research Funds for Central Universities (No. WK2150110034), and CIPSC-SMP-Zhipu.AI Large Model Cross-Disciplinary Fund."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations are clearly listed: USTC (State Key Laboratory of Cognitive Intelligence), Institute of AI Comprehensive National Science Center, and Hefei Normal University."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "Funders are government research agencies (NSFC, Anhui Provincial Foundation) and university funds with no financial stake in whether process-based APR outperforms alternatives."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests or financial interests statement is included in the paper."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "StarCoderBase is described as trained on The Stack (1 trillion tokens) but no training data cutoff date is stated. The training cutoffs for baseline models (GPT-3.5, Claude2, PaLM) are also not stated."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "The paper splits its own dataset by problem ID to prevent internal leakage (Section 4.1), but does not discuss whether the pre-trained baseline models (GPT-3.5, Claude2, StarCoderBase) may have seen CodeNet solutions in their training data."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "CodeNet was published in 2021 (Puri et al., 2021) and its solutions are publicly available. Any model trained after 2021 may have encountered these solutions, but this risk is not discussed."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in this study."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No inference cost, latency, or per-example cost is reported. The paper mentions 1,364 Core·hours for data preparation but does not report inference costs for the model."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "Training details include 32k episodes with batch size 512 and use of ZeRO++ across accelerators, but the total GPU hours, number of GPUs, or wall-clock training time are not reported."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from a single training run."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The number of independent training runs is not stated. The pass@k estimator uses n generated samples per problem, but n is not specified and no multi-run variance is reported."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Hyperparameters are reported (Section 4.2) but no search budget, search method, or number of configurations tried is described."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The ablation study in Table 4 shows different configurations but does not explain how the final configuration was selected or whether selection was based on validation data rather than test data."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Multiple comparisons are made across 10+ baselines at 3 difficulty levels, but no statistical tests are performed at all, let alone corrections for multiple comparisons."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The authors evaluate their own system against baselines without acknowledging author-evaluation bias. No discussion of whether their baseline implementations are representative."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The paper compares a 15B fine-tuned+RL model against 70B and 540B few-shot models without controlling for or discussing compute budget differences."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "The paper does not discuss whether pass@k on competition-level Python problems measures real-world program repair capability. The limitations section actually overclaims: 'there are significant similarities between competition and engineering scenarios' without evidence."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "The main comparison (Table 3) compares their model with multi-step iterative inference + reward model feedback against baselines using single-pass generation. This inference-time advantage is a confound. Section 4.5 acknowledges 'some performance gains come from supervised fine-tuning' but does not isolate the multi-step inference advantage."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "CodeNet was published in 2021 with publicly available solutions. Models like GPT-3.5 and StarCoderBase (trained on The Stack) may have encountered these solutions. This temporal leakage risk is not discussed."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether the evaluation setup leaks information. The prompt includes problem descriptions that may provide implicit hints about expected solutions."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": true,
    363         "justification": "Section 4.1: 'To prevent data leakage, we divided CodeNet4Repair based on the problem ID in a ratio of 9:1.' This ensures train and test problems are completely separate."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": true,
    368         "justification": "Problem-ID-based splitting is a concrete prevention method ensuring no overlap between train and test problems within their dataset."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Process-based feedback with a 15B model outperforms larger outcome-based open-source models including LLaMA2-70B on program repair.",
    375       "evidence": "Table 3: RePair achieves 44.34 pass@1 vs LLaMA2-70B's 7.59 and LLaMA2-Chat-70B's 24.63 on CodeNet4Repair.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "RePair nearly matches the performance of closed-source commercial LLMs like GPT-3.5 and Claude2.",
    380       "evidence": "Table 3: RePair achieves 44.34 pass@1 vs GPT-3.5's 46.39 and Claude2's 43.98. At pass@5, RePair gets 65.66 vs GPT-3.5's 66.67 and Claude2's 69.44.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Both process supervision and feedback are necessary — process supervision without feedback significantly degrades performance.",
    385       "evidence": "Table 4 ablation: process+no feedback gets 20.43 pass@1 vs no process+no feedback (SFT only) at 36.32 pass@1. Process+feedback achieves 44.34.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Pairwise ranking is the most effective reward function design compared to pointwise and listwise alternatives.",
    390       "evidence": "Table 4: Pairwise achieves 44.34 pass@1 vs pointwise 40.11 and listwise 39.45.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Small-scale models (< 20B parameters) struggle to receive effective explicit feedback through prompts.",
    395       "evidence": "Figure 5 shows StarCoder and CodeGen2 performance does not consistently improve with additional explicit feedback steps, peaking at step 3 then declining, while ChatGPT can effectively use explicit feedback.",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "SFT alone is not the primary factor for performance improvement — process-based feedback significantly amplifies the model's potential beyond SFT.",
    400       "evidence": "Figure 4 shows that while SFT improves several open-source models, the full RePair system (SFT+RL) substantially outperforms SFT-only variants across all metrics.",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "red_flags": [
    405     {
    406       "flag": "Unfair main comparison",
    407       "detail": "Table 3 compares their fine-tuned + RL model (with multi-step iterative inference and reward model feedback) against baseline models using zero/few-shot prompting. The comparison conflates model capability with training and inference procedure advantages. While Section 4.7 partially addresses this with SFT comparisons, the main results table presents a fundamentally unequal comparison."
    408     },
    409     {
    410       "flag": "No error bars or variance",
    411       "detail": "All results are reported as point estimates without confidence intervals, standard deviations, or any indication of result stability across runs. For RL-trained models, which are known to have high variance across seeds (Henderson et al., 2018), this is a significant omission."
    412     },
    413     {
    414       "flag": "Contamination risk unaddressed",
    415       "detail": "CodeNet solutions are publicly available since 2021. StarCoderBase was trained on The Stack (public code), and commercial models like GPT-3.5 could have encountered these solutions. The baseline comparisons may therefore unfairly advantage or disadvantage certain models depending on their training data."
    416     },
    417     {
    418       "flag": "Single dataset evaluation",
    419       "detail": "All experiments are conducted solely on CodeNet4Repair. No evaluation on other program repair benchmarks (e.g., Defects4J, QuixBugs, or other APR datasets) is performed, limiting evidence for generalization."
    420     },
    421     {
    422       "flag": "Compute costs unreported",
    423       "detail": "Training a 15B parameter model with SFT and PPO-based RL for 32k episodes requires substantial compute. No GPU hours, number of accelerators, or wall-clock time is reported, making it impossible to assess practical feasibility."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "Evaluating large language models trained on code",
    429       "authors": ["Mark Chen", "Jerry Tworek"],
    430       "year": 2021,
    431       "arxiv_id": "2107.03374",
    432       "relevance": "Introduced HumanEval and the pass@k metric used in this paper for evaluating code generation capabilities."
    433     },
    434     {
    435       "title": "CodeRL: Mastering code generation through pretrained models and deep reinforcement learning",
    436       "authors": ["Hung Le", "Yue Wang", "Akhilesh Deepak Gotmare", "Silvio Savarese", "Steven Hoi"],
    437       "year": 2022,
    438       "relevance": "Pioneered using reinforcement learning for code generation, directly relevant to process-based feedback in code tasks."
    439     },
    440     {
    441       "title": "Training language models to follow instructions with human feedback",
    442       "authors": ["Long Ouyang", "Jeffrey Wu"],
    443       "year": 2022,
    444       "relevance": "The RLHF methodology that this paper's approach is built upon — aligning language models using reward models and PPO."
    445     },
    446     {
    447       "title": "StarCoder: may the source be with you!",
    448       "authors": ["Raymond Li", "Loubna Ben Allal"],
    449       "year": 2023,
    450       "relevance": "The foundation model (StarCoderBase 15.5B) used as the base for RePair's fine-tuning pipeline."
    451     },
    452     {
    453       "title": "Let's verify step by step",
    454       "authors": ["Hunter Lightman", "Vineet Kosaraju", "Yura Burda"],
    455       "year": 2023,
    456       "relevance": "Key prior work on process-based reward models for mathematical reasoning, directly motivating this paper's approach to process supervision."
    457     },
    458     {
    459       "title": "Solving math word problems with process- and outcome-based feedback",
    460       "authors": ["Jonathan Uesato", "Nate Kushman", "Ramana Kumar"],
    461       "year": 2022,
    462       "relevance": "Compared process-based and outcome-based supervision methods on math problems, providing theoretical motivation for this paper's process-based APR approach."
    463     },
    464     {
    465       "title": "CRITIC: Large language models can self-correct with tool-interactive critiquing",
    466       "authors": ["Zhibin Gou", "Zhihong Shao", "Yeyun Gong"],
    467       "year": 2024,
    468       "relevance": "Related work on LLM self-correction with tool feedback, relevant to the feedback-based repair paradigm."
    469     },
    470     {
    471       "title": "Project CodeNet: A large-scale AI for code dataset for learning a diversity of coding tasks",
    472       "authors": ["Ruchir Puri", "David S Kung"],
    473       "year": 2021,
    474       "relevance": "Source dataset from which CodeNet4Repair was derived; a major code benchmark for evaluating program analysis tasks."
    475     },
    476     {
    477       "title": "DeepFix: Fixing common C language errors by deep learning",
    478       "authors": ["Rahul Gupta", "Soham Pal", "Aditya Kanade", "Shirish Shevade"],
    479       "year": 2017,
    480       "relevance": "Early deep learning approach to automated program repair; one of the baseline datasets compared in Table 1."
    481     },
    482     {
    483       "title": "Copiloting the copilots: Fusing large language models with completion engines for automated program repair",
    484       "authors": ["Yuxiang Wei", "Chunqiu Steven Xia", "Lingming Zhang"],
    485       "year": 2023,
    486       "relevance": "Contemporary LLM-based automated program repair approach relevant to the survey scope of AI-assisted programming."
    487     },
    488     {
    489       "title": "Competition-level code generation with AlphaCode",
    490       "authors": ["Yujia Li", "David Choi"],
    491       "year": 2022,
    492       "relevance": "Major work on competition-level code generation using large-scale sampling and filtering, relevant to AI coding capabilities."
    493     },
    494     {
    495       "title": "DeepSeek-Coder: When the large language model meets programming",
    496       "authors": ["Daya Guo", "Qihao Zhu"],
    497       "year": 2024,
    498       "relevance": "Contemporary code LLM relevant to the survey scope of LLM capabilities for programming tasks."
    499     },
    500     {
    501       "title": "Code LLaMA: Open foundation models for code",
    502       "authors": ["Baptiste Rozière", "Jonas Gehring", "Fabian Gloeckle"],
    503       "year": 2024,
    504       "relevance": "Major open-source code LLM that establishes baselines for code generation and repair capabilities."
    505     }
    506   ],
    507   "engagement_factors": {
    508     "practical_relevance": {
    509       "score": 1,
    510       "justification": "Requires training a 15B model with RL — not immediately usable by practitioners, though the dataset and approach could inform APR tool development."
    511     },
    512     "surprise_contrarian": {
    513       "score": 1,
    514       "justification": "A 15B model matching GPT-3.5 on program repair is mildly surprising, but the paradigm of process-based feedback improving results is expected from prior work on math reasoning."
    515     },
    516     "fear_safety": {
    517       "score": 0,
    518       "justification": "No safety or security implications — focused on improving program repair, a benign application."
    519     },
    520     "drama_conflict": {
    521       "score": 0,
    522       "justification": "No controversy or conflict; standard research contribution."
    523     },
    524     "demo_ability": {
    525       "score": 1,
    526       "justification": "Code and data released on GitHub, but running the system requires training a 15B model with RL, making it impractical to demo casually."
    527     },
    528     "brand_recognition": {
    529       "score": 0,
    530       "justification": "From USTC (University of Science and Technology of China), not a widely recognized AI lab in the broader tech community."
    531     }
    532   }
    533 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs