scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29296B)
      1 {
      2   "paper": {
      3     "title": "Exploring Data-Efficient Adaptation of Large Language Models for Code Generation",
      4     "authors": [
      5       "Xue Jiang",
      6       "Yihong Dong",
      7       "Zhiyuan Fan",
      8       "Zhi Jin",
      9       "Wenpin Jiao",
     10       "Ge Li"
     11     ],
     12     "year": 2025,
     13     "venue": "ACM Transactions on Software Engineering and Methodology",
     14     "arxiv_id": "2403.00046",
     15     "doi": "10.1145/3772721"
     16   },
     17   "scan_version": 2,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval"],
     20   "key_findings": "DEED, an error-driven learning approach for data-efficient LLM adaptation, achieves an average 46.2% relative improvement in Pass@1 over fine-tuning baselines across five Python code generation benchmarks with limited training data. Self-revision by the same model produces better training data than revision by more powerful external LLMs like ChatGPT, suggesting data suitability matters more than quantity. The method generalizes across four LLMs (CodeGen-2B/6B, Llama-7B, CodeLlama-7B) and shows iterative adaptation stabilizes after two rounds.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "All five evaluation datasets (HumanEval, MBPP, HumanEval-ET, MBPP-ET, DataScience) are publicly available benchmarks that were not modified by the authors."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper mentions 'a single A6000 GPU' (Section 4.2) but provides no requirements.txt, library versions, or environment setup details beyond the hardware."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions, README, or scripts to replicate experiments are provided."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "All tables report point estimates only (e.g., '38.6%'). Despite averaging over five test runs (Section 7), no confidence intervals, error bars, or ± notation are reported."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper claims DEED 'performs significantly better' (Section 5.1) and reports 'significant relative improvements' but uses no statistical tests (no p-values, t-tests, or bootstrap tests) to support these claims."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Relative improvements are reported with baseline context throughout (e.g., 'relative improvements of 29.5%, 33.0%, 27.1%, 37.6%, and 103.8%, respectively, when compared to the best-performing baseline' in Section 5.1). Tables provide both baseline and DEED numbers."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No justification for why min(200, 40%*D) was chosen as the training split size, or why 5 datasets were selected. No power analysis."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "Section 7 states 'each experiment is run five times, and its average result is reported' but no standard deviation, variance, or spread measure is reported across these runs."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Six baselines are compared: Direct Generation, Fine-tuning (Full), Fine-tuning (LoRA), Few-shot Prompting, Self-Refine, and Self-Debug (Section 5.1, Table 1)."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Self-Refine (Madaan et al., 2023) and Self-Debug (Chen et al., 2023) are contemporary methods. Fine-tuning and LoRA are standard and still widely used."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "RQ6 (Section 5.6, Table 6) performs ablation on Self-Revise input components (correct solution, error messages, failed test cases). RQ3 (Table 3) compares training data variants. RQ4 (Table 4) studies iteration count."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Pass@1, Pass@5, and Pass@10 are reported across all experiments (Tables 1-6). Pass@any is additionally used for automatic code revision evaluation."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "Evaluation is entirely automated via test case execution (Pass@k). While manual inspection of Self-Revise outputs appears in Figure 4, this examines the revision process rather than systematically evaluating the final model's output quality."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Section 4.1 states: 'We sample min(200, 40%*D) problems from the datasets as D_train, while the remaining problems serve as D_test.' Explicit train/test separation."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Results are broken down by dataset (5 benchmarks in Table 1), by model (4 LLMs in Table 2), by training data variant (Table 3), by iteration count (Table 4), and by revision model (Table 5)."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 5.5 discusses why ChatGPT-based revision underperforms self-revision (tendency to make large changes, capability gap). Section 5.1 notes Self-Refine and Self-Debug 'underperform on small LLMs.' Figure 4 shows Self-Revise (FSP) copying code rather than minimally revising."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Multiple negative results: Fine-tuning (LoRA) is less effective than Fine-tuning (Full) (Section 5.1); DEED ∪ D_train is not as effective as DEED alone showing some samples have 'negative effects' (Section 5.3); ChatGPT revision doesn't improve the final model as expected (Section 5.5); Pass@10 oscillates in iterations 2-4 (Table 4)."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims 'average relative improvement of 46.2% in Pass@1.' The five relative improvements from Table 1 (29.5%, 33.0%, 27.1%, 37.6%, 103.8%) average to exactly 46.2%. Other claims about Self-Revise effectiveness and cross-LLM applicability are supported by Tables 3 and 2 respectively."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Causal claims ('DEED improves performance') are supported by controlled ablation studies (RQ6, Table 6) that isolate individual component contributions through single-variable removal. The training variant study (RQ3, Table 3) controls for data differences. Ablation design is adequate for the causal claims made."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title claims 'Code Generation' broadly, but all five benchmarks are Python-only (HumanEval, MBPP, MBPP-ET, HumanEval-ET, DataScience). The paper never acknowledges this language limitation. The Limitations section (Section 8) bounds to 'low-resource scenarios' and 'requires test cases' but does not bound by programming language."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Section 7 discusses threats to validity including hyperparameter sensitivity and dataset generalizability. Section 5.5 considers two alternative explanations for ChatGPT revision underperformance: (1) tendency to disregard minimal revision instructions, and (2) capability gap between models. Section 2 provides a representation-space analysis as mechanistic explanation."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper measures Pass@k (test case passing rate) and claims improvement in 'code generation performance.' The claims match the measurement granularity — they do not overclaim 'code quality' or 'developer productivity' from Pass@k numbers."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "Open-source models are identified by name and size (CodeGen-2B/6B, Llama-7B, CodeLlama-7B) with references, but 'ChatGPT' and 'GPT-3.5-turbo' used in RQ5 lack specific API versions or snapshot dates."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "Appendix C provides the instruction text for code revision, and Figure 3 shows the template structure with placeholders {r_i}, {g_i}, {c'_i}, {m_i}, {t_i}. However, the few-shot prompting baseline prompt is not provided, and Self-Revise (FSP) examples used in the prompt are not fully listed."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 4.2 reports comprehensive hyperparameters: AdamW optimizer with β1=0.9, β2=0.9, learning rate 5e-6 (full) / 2e-4 (LoRA), batch size 1, gradient accumulation 32, 10 epochs, LoRA rank 128, α=8, temperature 0.8, sampling counts (5 for error collection, 30 for revision), 2 iterations, max generation length 1024."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "No agentic scaffolding is used. DEED is a fine-tuning pipeline, not an agentic system."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 4.1 documents the data split: 'We sample min(200, 40%*D) problems from the datasets as D_train.' Section 3.1-3.2 details the error code collection and revision pipeline including sampling strategy, selection criteria (highest generation probability for errors, minimum Levenshtein distance for revisions), and filtering (test-case-based acceptance/rejection)."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 7 (Threats to Validity) provides substantive discussion across three categories (external, internal, construct validity). Section 8 (Limitations) discusses two specific limitations of the approach."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 7 discusses threats specific to this study: use of five specific public benchmarks for generalizability, sensitivity to hyperparameters with only a small-range grid search performed, and reliance on the Pass@k metric with its unbiased estimator."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 8 states two specific scope boundaries: (1) 'our approach requires test cases' during training, and (2) 'DEED is only used in low-resource scenarios.' However, the Python-only limitation is not explicitly stated."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "No raw experimental data (generated error codes, revisions, model outputs, fine-tuned model weights) is made available for independent verification."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 3.1 describes error code collection via rejection sampling with test evaluation. Section 3.2 describes the revision process via acceptance sampling. Section 4.1 describes dataset sourcing from public benchmarks with specific references."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. All data comes from standard public code generation benchmarks (HumanEval, MBPP, DataScience, EvoCodeBench)."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The full pipeline is documented: dataset splitting (Section 4.1), error code collection with selection criteria (Section 3.1), automatic revision with acceptance sampling and filtering (Section 3.2), model optimization (Section 3.3), and iterative adaptation with experience replay (Section 3.4). Algorithm 1 summarizes the complete process."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "Acknowledgments section lists: National Key R&D Program (Grant No. 2023YFB4503801), National Natural Science Foundation of China (Grant No. 62192733, 62192730, 62192731), and Major Program of Hubei Province (No. 2023BAA024)."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "All authors are affiliated with Key Laboratory of High Confidence Software Technologies (Peking University), Ministry of Education; School of Computer Science, Peking University. No product of theirs is being evaluated."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Funding is from Chinese government research agencies (NSFC, National Key R&D Program, Hubei Province) which have no financial stake in whether DEED outperforms baselines."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement is present in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No training data cutoff date is stated for any of the models used (CodeGen-2B/6B, Llama-7B, CodeLlama-7B). Cannot assess whether benchmark data was in pre-training."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No discussion of whether HumanEval, MBPP, or other benchmark problems appeared in the pre-training data of the models used."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "HumanEval (2021) and MBPP (2021) were published before the training of CodeGen (2022-2023) and other models used. No contamination analysis is performed despite this clear temporal risk."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "The paper notes DEED incurs 'no additional resource or time costs' at inference compared to direct generation (Section 6.2), but no actual inference latency, cost per example, or token counts are reported."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Section 4.2 states 'a single A6000 GPU' was used, but total GPU hours, training time, or compute budget are not quantified."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "Section 7 states experiments are 'run five times' and averaged, but neither per-run results nor a seed sensitivity analysis is reported. The five runs appear to be for the unbiased Pass@k estimator sampling, not separate training runs with different seeds."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": true,
    308         "justification": "Section 7 states: 'each experiment is run five times, and its average result is reported.' Section 4.3 also clarifies n=50 samples generated per problem for Pass@k calculation."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "Section 7 states 'we only do a small-range grid search on hyperparameters, including iterations of DEED, learning rates, and training epochs' but the number of configurations tried and total compute spent on search are not reported."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "The iteration count is justified via RQ4 (Table 4, selecting 2 iterations based on Pass@10 oscillation). However, for other hyperparameters (learning rate, epochs, LoRA rank), no selection methodology is described — it's unclear whether selection used validation or test performance."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "The paper makes many comparisons across 5 datasets, 4 models, 6 baselines, and multiple ablation variants without any statistical tests, let alone multiple comparison corrections."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors implement all baselines themselves and compare against their own DEED method. No acknowledgment of author-evaluation bias or independent evaluation is provided."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "No analysis of performance as a function of compute budget. DEED involves iterative fine-tuning, error collection, and revision sampling which likely uses substantially more compute than single-pass fine-tuning, but this is not quantified or compared."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "No discussion of whether HumanEval, MBPP, or DataScience actually measure 'code generation capability in specific scenarios.' The paper uses these benchmarks to simulate data-scarce scenarios without questioning whether synthetic scarcity reflects real-world constraints."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "No scaffolding is involved. DEED is a fine-tuning method, not a scaffolded agent system."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "HumanEval (2021) and MBPP (2021) were published before the models' training periods. No discussion of whether model pre-training data included benchmark solutions."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "Self-Revise uses correct solutions and test cases from the training split during the revision process. While this is inherent to the method's design, the paper does not discuss whether the evaluation setup (using the same benchmark's held-out problems) leaks information through structural similarity."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "The paper splits datasets into train/test by random sampling but does not discuss whether problems in the same benchmark share structural similarities, common patterns, or come from the same source distributions that could create non-independence."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No concrete leakage detection or prevention method is used (no canary strings, membership inference, n-gram overlap analysis, or decontamination)."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "DEED achieves an average relative improvement of 46.2% in Pass@1 compared to the best-performing baseline across five code generation benchmarks.",
    372       "evidence": "Table 1 shows relative improvements of 29.5% (HumanEval), 33.0% (HumanEval-ET), 27.1% (MBPP), 37.6% (MBPP-ET), and 103.8% (DataScience) over Fine-tuning (Full), averaging 46.2%.",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "Self-Revise produces revised code that is more efficient for model optimization than code samples from datasets.",
    377       "evidence": "Table 3 (Section 5.3) shows DEED (32.8% Pass@1) outperforms Raw D_train (25.8%) despite using less training data. Also, DEED ∪ D_train (29.2%) underperforms DEED alone, suggesting some dataset samples have negative training effects.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "DEED consistently enhances performance across different LLMs of varying sizes and architectures.",
    382       "evidence": "Table 2 (Section 5.2) shows improvements on CodeGen-2B (+27.1%), CodeGen-6B (+25.6%), Llama-7B (+32.5%), and CodeLlama-7B (+25.2%) on MBPP dataset.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Using the same LLM for revision (Self-Revise) yields better final model performance than using more powerful external LLMs like ChatGPT.",
    387       "evidence": "Table 5 (Section 5.5) shows Self-Revise (FT) achieves 32.8% Pass@1 on the final model vs. 27.0% for ChatGPT-based revision and 29.0% for GPT-3.5-turbo, despite ChatGPT achieving far higher revision accuracy (61.4% vs 3.9% Pass@1 on MRevise).",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "Revised codes are closer to error codes than dataset samples in the model's representation space, explaining why error-driven learning is more efficient.",
    392       "evidence": "Section 2 preliminary study shows average Euclidean distance between error codes and revisions is 6.39, vs. 12.35 between error codes and dataset samples, using CodeGen-2B on MBPP.",
    393       "supported": "weak"
    394     }
    395   ],
    396   "red_flags": [
    397     {
    398       "flag": "No uncertainty quantification despite multiple runs",
    399       "detail": "The paper runs each experiment five times but reports only averaged point estimates with no standard deviation, confidence intervals, or error bars. Claims of 'significant improvement' are made without any statistical tests."
    400     },
    401     {
    402       "flag": "Most ablations conducted on a single dataset",
    403       "detail": "RQ2 (different LLMs), RQ3 (training variants), RQ4 (iterations), RQ5 (revision models), and RQ6 (ablation) are all conducted exclusively on MBPP. Only RQ1 uses all five benchmarks. Generalizability of component-level findings is unverified."
    404     },
    405     {
    406       "flag": "No contamination analysis",
    407       "detail": "HumanEval and MBPP (both 2021) were published before CodeGen's training. The paper does not analyze whether benchmark solutions appeared in pre-training data, which could inflate absolute performance numbers for all methods."
    408     },
    409     {
    410       "flag": "Simulated data scarcity may not reflect real scenarios",
    411       "detail": "The paper claims to address 'specific scenarios' like aerospace and medical devices (Section 1) but evaluates on artificially subsampled general Python benchmarks. Whether findings transfer to genuine domain-specific code generation with inherently scarce data is untested."
    412     },
    413     {
    414       "flag": "No code or model release",
    415       "detail": "Despite proposing a novel method, no source code, trained models, or experimental artifacts are released, preventing independent verification of the results."
    416     }
    417   ],
    418   "cited_papers": [
    419     {
    420       "title": "Evaluating Large Language Models Trained on Code",
    421       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    422       "year": 2021,
    423       "arxiv_id": "2107.03374",
    424       "relevance": "Introduced HumanEval benchmark and Codex; foundational for LLM code generation evaluation methodology."
    425     },
    426     {
    427       "title": "CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis",
    428       "authors": ["Erik Nijkamp", "Bo Pang", "Hiroaki Hayashi"],
    429       "year": 2023,
    430       "relevance": "Open-source code LLM used as the primary base model in all experiments."
    431     },
    432     {
    433       "title": "Code Llama: Open Foundation Models for Code",
    434       "authors": ["Baptiste Rozière", "Jonas Gehring", "Fabian Gloeckle"],
    435       "year": 2023,
    436       "arxiv_id": "2308.12950",
    437       "relevance": "Open code LLM used as one of the evaluated models; key baseline for code generation capability."
    438     },
    439     {
    440       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    441       "authors": ["Edward J. Hu", "Yelong Shen", "Phillip Wallis"],
    442       "year": 2022,
    443       "relevance": "Parameter-efficient fine-tuning method used as both a baseline and an adaptation technique within DEED."
    444     },
    445     {
    446       "title": "Self-Refine: Iterative Refinement with Self-Feedback",
    447       "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"],
    448       "year": 2023,
    449       "relevance": "Prompting-based iterative code refinement baseline; demonstrates LLM self-improvement capabilities."
    450     },
    451     {
    452       "title": "Teaching Large Language Models to Self-Debug",
    453       "authors": ["Xinyun Chen", "Maxwell Lin", "Nathanael Schärli"],
    454       "year": 2023,
    455       "arxiv_id": "2304.05128",
    456       "relevance": "LLM self-debugging using execution feedback; directly compared as a baseline for code correction approaches."
    457     },
    458     {
    459       "title": "Self-Edit: Fault-Aware Code Editor for Code Generation",
    460       "authors": ["Kechi Zhang", "Zhuo Li", "Jia Li"],
    461       "year": 2023,
    462       "relevance": "Trains a separate editor for code correction; related approach using error-aware training for code generation."
    463     },
    464     {
    465       "title": "CYCLE: Learning to Self-Refine the Code Generation",
    466       "authors": ["Yangruibo Ding", "Marcus J. Min", "Gail E. Kaiser"],
    467       "year": 2024,
    468       "relevance": "Concurrent work enhancing LLM self-refinement capability for code correction using test results."
    469     },
    470     {
    471       "title": "EvoCodeBench: An Evolving Code Generation Benchmark with Domain-Specific Evaluations",
    472       "authors": ["Jia Li", "Ge Li", "Xuanming Zhang"],
    473       "year": 2024,
    474       "relevance": "Project-level code generation benchmark addressing data leakage; used for supplementary evaluation in Appendix B."
    475     },
    476     {
    477       "title": "WizardCoder: Empowering Code Large Language Models with Evol-Instruct",
    478       "authors": ["Ziyang Luo", "Can Xu", "Pu Zhao"],
    479       "year": 2023,
    480       "arxiv_id": "2306.08568",
    481       "relevance": "Data augmentation approach for code LLMs via evolved instructions; contrasting approach to DEED's error-driven learning."
    482     },
    483     {
    484       "title": "Magicoder: Empowering Code Generation with OSS-Instruct",
    485       "authors": ["Yuxiang Wei", "Zhe Wang", "Jiawei Liu"],
    486       "year": 2024,
    487       "relevance": "Instruction synthesis from open-source code for LLM training; alternative data-efficient approach for code generation."
    488     },
    489     {
    490       "title": "Improving Code Generation by Training with Natural Language Feedback",
    491       "authors": ["Angelica Chen", "Jérémy Scheurer", "Tomasz Korbak"],
    492       "year": 2023,
    493       "arxiv_id": "2303.16749",
    494       "relevance": "ILF method using human feedback to refine code generation; related approach requiring more human involvement than DEED."
    495     },
    496     {
    497       "title": "Competition-level code generation with AlphaCode",
    498       "authors": ["Yujia Li", "David Choi", "Junyoung Chung"],
    499       "year": 2022,
    500       "relevance": "Landmark LLM code generation system demonstrating competition-level performance; established sampling and filtering methodology."
    501     }
    502   ]
    503 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs