scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (35688B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Exploring Data-Efficient Adaptation of Large Language Models for Code Generation",
      6     "authors": [
      7       "Xue Jiang",
      8       "Yihong Dong",
      9       "Zhiyuan Fan",
     10       "Zhi Jin",
     11       "Wenpin Jiao",
     12       "Ge Li"
     13     ],
     14     "year": 2024,
     15     "venue": "ACM Transactions on Software Engineering and Methodology",
     16     "arxiv_id": "2403.00046",
     17     "doi": "10.1145/3772721"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract claims 'average relative improvement of 46.2% in Pass@1.' The five relative improvements from Table 1 (29.5%, 33.0%, 27.1%, 37.6%, 103.8%) average to exactly 46.2%. Other claims about Self-Revise effectiveness and cross-LLM applicability are supported by Tables 3 and 2 respectively.",
     25         "source": "opus"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Causal claims ('DEED improves performance') are supported by controlled ablation studies (RQ6, Table 6) that isolate individual component contributions through single-variable removal. The training variant study (RQ3, Table 3) controls for data differences. Ablation design is adequate for the causal claims made.",
     31         "source": "opus"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The title claims 'Code Generation' broadly, but all five benchmarks are Python-only (HumanEval, MBPP, MBPP-ET, HumanEval-ET, DataScience). The paper never acknowledges this language limitation. The Limitations section (Section 8) bounds to 'low-resource scenarios' and 'requires test cases' but does not bound by programming language.",
     37         "source": "opus"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Section 7 discusses threats to validity including hyperparameter sensitivity and dataset generalizability. Section 5.5 considers two alternative explanations for ChatGPT revision underperformance: (1) tendency to disregard minimal revision instructions, and (2) capability gap between models. Section 2 provides a representation-space analysis as mechanistic explanation.",
     43         "source": "opus"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper measures Pass@k (test case passing rate) and claims improvement in 'code generation performance.' The claims match the measurement granularity — they do not overclaim 'code quality' or 'developer productivity' from Pass@k numbers.",
     49         "source": "opus"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 7 (Threats to Validity) provides substantive discussion across three categories (external, internal, construct validity). Section 8 (Limitations) discusses two specific limitations of the approach.",
     57         "source": "opus"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Section 7 discusses threats specific to this study: use of five specific public benchmarks for generalizability, sensitivity to hyperparameters with only small-range grid search performed, and reliance on Pass@k metric with its unbiased estimator.",
     63         "source": "opus"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Section 8 states two specific scope boundaries: (1) 'our approach requires test cases' during training, and (2) 'DEED is only used in low-resource scenarios.' However, the Python-only limitation is not explicitly stated.",
     69         "source": "opus"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Acknowledgments section lists: National Key R&D Program (Grant No. 2023YFB4503801), National Natural Science Foundation of China (Grant No. 62192733, 62192730, 62192731), and Major Program of Hubei Province (No. 2023BAA024).",
     77         "source": "opus"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "All authors are affiliated with Key Laboratory of High Confidence Software Technologies (Peking University), Ministry of Education; School of Computer Science, Peking University. No product of theirs is being evaluated.",
     83         "source": "opus"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Funding is from Chinese government research agencies (NSFC, National Key R&D Program, Hubei Province) which have no financial stake in whether DEED outperforms baselines.",
     89         "source": "opus"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests or financial interests statement is present in the paper.",
     95         "source": "opus"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Key terms are operationalized: 'data-efficient adaptation' is defined as fine-tuning with limited labeled data, 'error-driven learning' is explained by analogy and formalized in Section 3, and 'specific scenarios' is illustrated with concrete examples (aerospace, medical devices).",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 1 explicitly lists three contributions: proposing error-driven learning for LLM adaptation, the DEED method itself, and empirical validation across five benchmarks and four LLMs.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 6 explicitly compares DEED to Self-Refine, Self-Debug, Self-Edit, CYCLE, and ILF, explaining mechanistic differences (training-time vs inference-time refinement, no human involvement, different from distillation), not merely listing references.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": false,
    125           "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper.",
    126           "source": "opus"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "All five evaluation datasets (HumanEval, MBPP, HumanEval-ET, MBPP-ET, DataScience) are publicly available benchmarks that were not modified by the authors.",
    132           "source": "opus"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "The paper mentions 'a single A6000 GPU' (Section 4.2) but provides no requirements.txt, library versions, or environment setup details beyond the hardware.",
    138           "source": "opus"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No step-by-step reproduction instructions, README, or scripts to replicate experiments are provided.",
    144           "source": "opus"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "All tables report point estimates only (e.g., '38.6%'). Despite averaging over five test runs (Section 7), no confidence intervals, error bars, or ± notation are reported.",
    152           "source": "opus"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "The paper claims DEED 'performs significantly better' (Section 5.1) and reports 'significant relative improvements' but uses no statistical tests (no p-values, t-tests, or bootstrap tests) to support these claims.",
    158           "source": "opus"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Relative improvements are reported with baseline context throughout (e.g., 'relative improvements of 29.5%, 33.0%, 27.1%, 37.6%, and 103.8%, respectively, when compared to the best-performing baseline' in Section 5.1). Tables provide both baseline and DEED numbers.",
    164           "source": "opus"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "No justification for why min(200, 40%*D) was chosen as the training split size, or why 5 datasets were selected. No power analysis.",
    170           "source": "opus"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "Section 7 states 'each experiment is run five times, and its average result is reported' but no standard deviation, variance, or spread measure is reported across these runs.",
    176           "source": "opus"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Six baselines are compared: Direct Generation, Fine-tuning (Full), Fine-tuning (LoRA), Few-shot Prompting, Self-Refine, and Self-Debug (Section 5.1, Table 1).",
    184           "source": "opus"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Self-Refine (Madaan et al., 2023) and Self-Debug (Chen et al., 2023) are contemporary methods. Fine-tuning and LoRA are standard and still widely used.",
    190           "source": "opus"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "RQ6 (Section 5.6, Table 6) performs ablation on Self-Revise input components (correct solution, error messages, failed test cases). RQ3 (Table 3) compares training data variants. RQ4 (Table 4) studies iteration count.",
    196           "source": "opus"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Pass@1, Pass@5, and Pass@10 are reported across all experiments (Tables 1-6). Pass@any is additionally used for automatic code revision evaluation.",
    202           "source": "opus"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": false,
    207           "justification": "Evaluation is entirely automated via test case execution (Pass@k). While manual inspection of Self-Revise outputs appears in Figure 4, this examines the revision process rather than systematically evaluating the final model's output quality.",
    208           "source": "opus"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "Section 4.1 states: 'We sample min(200, 40%*D) problems from the datasets as D_train, while the remaining problems serve as D_test.' Explicit train/test separation.",
    214           "source": "opus"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Results are broken down by dataset (5 benchmarks in Table 1), by model (4 LLMs in Table 2), by training data variant (Table 3), by iteration count (Table 4), and by revision model (Table 5).",
    220           "source": "opus"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Section 5.5 discusses why ChatGPT-based revision underperforms self-revision (tendency to make large changes, capability gap). Section 5.1 notes Self-refine and Self-debug 'underperform on small LLMs.' Figure 4 shows Self-Revise (FSP) copying code rather than minimally revising.",
    226           "source": "opus"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "Multiple negative results: Fine-tuning (LoRA) is less effective than Fine-tuning (Full) (Section 5.1); DEED ∪ D_train is not as effective as DEED alone showing some samples have 'negative effects' (Section 5.3); ChatGPT revision doesn't improve the final model as expected (Section 5.5); Pass@10 oscillates in iterations 2-4 (Table 4).",
    232           "source": "opus"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": false,
    239           "justification": "Open-source models are identified by name and size (CodeGen-2B/6B, Llama-7B, CodeLlama-7B) with references, but 'ChatGPT' and 'GPT-3.5-turbo' used in RQ5 lack specific API versions or snapshot dates.",
    240           "source": "opus"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": false,
    245           "justification": "Appendix C provides the instruction text for code revision, and Figure 3 shows the template structure with placeholders {r_i}, {g_i}, {c'_i}, {m_i}, {t_i}. However, the few-shot prompting baseline prompt is not provided, and Self-Revise (FSP) examples used in the prompt are not fully listed.",
    246           "source": "opus"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "Section 4.2 reports comprehensive hyperparameters: AdamW optimizer with β1=0.9, β2=0.9, learning rate 5e-6 (full) / 2e-4 (LoRA), batch size 1, gradient accumulation 32, 10 epochs, LoRA rank 128, α=8, temperature 0.8, sampling counts (5 for error collection, 30 for revision), 2 iterations, max generation length 1024.",
    252           "source": "opus"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "No agentic scaffolding is used. DEED is a fine-tuning pipeline, not an agentic system.",
    258           "source": "opus"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Section 4.1 documents the data split: 'We sample min(200, 40%*D) problems from the datasets as D_train.' Section 3.1-3.2 details the error code collection and revision pipeline including sampling strategy, selection criteria (highest generation probability for errors, minimum Levenshtein distance for revisions), and filtering (test-case-based acceptance/rejection).",
    264           "source": "opus"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": false,
    271           "justification": "No raw experimental data (generated error codes, revisions, model outputs, fine-tuned model weights) is made available for independent verification.",
    272           "source": "opus"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Section 3.1 describes error code collection via rejection sampling with test evaluation. Section 3.2 describes the revision process via acceptance sampling. Section 4.1 describes dataset sourcing from public benchmarks with specific references.",
    278           "source": "opus"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": false,
    282           "answer": false,
    283           "justification": "No human participants. All data comes from standard public code generation benchmarks (HumanEval, MBPP, HumanEval-ET, MBPP-ET, DataScience).",
    284           "source": "opus"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "The full pipeline is documented: dataset splitting (Section 4.1), error code collection with selection criteria (Section 3.1), automatic revision with acceptance sampling and filtering (Section 3.2), model optimization (Section 3.3), and iterative adaptation with experience replay (Section 3.4). Algorithm 1 summarizes the complete process.",
    290           "source": "opus"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": false,
    297           "justification": "No training data cutoff date is stated for any of the models used (CodeGen-2B/6B, Llama-7B, CodeLlama-7B). Cannot assess whether benchmark data was in pre-training.",
    298           "source": "opus"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "No discussion of whether HumanEval, MBPP, or other benchmark problems appeared in the pre-training data of the models used.",
    304           "source": "opus"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "HumanEval (2021) and MBPP (2021) were published before the training of CodeGen (2022-2023) and other models used. No contamination analysis is performed despite this clear temporal risk.",
    310           "source": "opus"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "No human participants in this study.",
    318           "source": "opus"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human participants in this study.",
    324           "source": "opus"
    325         },
    326         "demographics_reported": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participants in this study.",
    330           "source": "opus"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants in this study.",
    336           "source": "opus"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human participants in this study.",
    342           "source": "opus"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participants in this study.",
    348           "source": "opus"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participants in this study.",
    354           "source": "opus"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "The paper notes DEED incurs 'no additional resource or time costs' at inference compared to direct generation (Section 6.2), but no actual inference latency, cost per example, or token counts are reported.",
    362           "source": "opus"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "Section 4.2 states 'a single A6000 GPU' was used, but total GPU hours, training time, or compute budget are not quantified.",
    368           "source": "opus"
    369         }
    370       },
    371       "experimental_rigor": {
    372         "seed_sensitivity_reported": {
    373           "applies": true,
    374           "answer": false,
    375           "justification": "Section 7 states experiments are 'run five times' and averaged, but no results across individual runs or seed sensitivity analysis is reported. The five runs appear to be for the unbiased Pass@k estimator sampling, not separate training runs with different seeds.",
    376           "source": "opus"
    377         },
    378         "number_of_runs_stated": {
    379           "applies": true,
    380           "answer": true,
    381           "justification": "Section 7 states: 'each experiment is run five times, and its average result is reported.' Section 4.3 also clarifies n=50 samples generated per problem for Pass@k calculation.",
    382           "source": "opus"
    383         },
    384         "hyperparameter_search_budget": {
    385           "applies": true,
    386           "answer": false,
    387           "justification": "Section 7 states 'we only do a small-range grid search on hyperparameters, including iterations of DEED, learning rates, and training epochs' but the number of configurations tried and total compute spent on search are not reported.",
    388           "source": "opus"
    389         },
    390         "best_config_selection_justified": {
    391           "applies": true,
    392           "answer": false,
    393           "justification": "The iteration count is justified via RQ4 (Table 4, selecting 2 iterations based on Pass@10 oscillation). However, for other hyperparameters (learning rate, epochs, LoRA rank), no selection methodology is described — it's unclear whether selection used validation or test performance.",
    394           "source": "opus"
    395         },
    396         "multiple_comparison_correction": {
    397           "applies": true,
    398           "answer": false,
    399           "justification": "The paper makes many comparisons across 5 datasets, 4 models, 6 baselines, and multiple ablation variants without any statistical tests, let alone multiple comparison corrections.",
    400           "source": "opus"
    401         },
    402         "self_comparison_bias_addressed": {
    403           "applies": true,
    404           "answer": false,
    405           "justification": "The authors implement all baselines themselves and compare against their own DEED method. No acknowledgment of author-evaluation bias or independent evaluation is provided.",
    406           "source": "opus"
    407         },
    408         "compute_budget_vs_performance": {
    409           "applies": true,
    410           "answer": false,
    411           "justification": "No analysis of performance as a function of compute budget. DEED involves iterative fine-tuning, error collection, and revision sampling which likely uses substantially more compute than single-pass fine-tuning, but this is not quantified or compared.",
    412           "source": "opus"
    413         },
    414         "benchmark_construct_validity": {
    415           "applies": true,
    416           "answer": false,
    417           "justification": "No discussion of whether HumanEval, MBPP, or DataScience actually measure 'code generation capability in specific scenarios.' The paper uses these benchmarks to simulate data-scarce scenarios without questioning whether synthetic scarcity reflects real-world constraints.",
    418           "source": "opus"
    419         },
    420         "scaffold_confound_addressed": {
    421           "applies": false,
    422           "answer": false,
    423           "justification": "No scaffolding is involved. DEED is a fine-tuning method, not a scaffolded agent system.",
    424           "source": "opus"
    425         }
    426       },
    427       "data_leakage": {
    428         "temporal_leakage_addressed": {
    429           "applies": true,
    430           "answer": false,
    431           "justification": "HumanEval (2021) and MBPP (2021) were published before the models' training periods. No discussion of whether model pre-training data included benchmark solutions.",
    432           "source": "opus"
    433         },
    434         "feature_leakage_addressed": {
    435           "applies": true,
    436           "answer": false,
    437           "justification": "Self-Revise uses correct solutions and test cases from the training split during the revision process. While this is the method design, no discussion of whether the evaluation setup (using the same benchmark's held-out problems) leaks information through structural similarity.",
    438           "source": "opus"
    439         },
    440         "non_independence_addressed": {
    441           "applies": true,
    442           "answer": false,
    443           "justification": "The paper splits datasets into train/test by random sampling but does not discuss whether problems in the same benchmark share structural similarities, common patterns, or come from the same source distributions that could create non-independence.",
    444           "source": "opus"
    445         },
    446         "leakage_detection_method": {
    447           "applies": true,
    448           "answer": false,
    449           "justification": "No concrete leakage detection or prevention method is used (no canary strings, membership inference, n-gram overlap analysis, or decontamination).",
    450           "source": "opus"
    451         }
    452       }
    453     }
    454   },
    455   "claims": [
    456     {
    457       "claim": "DEED achieves an average relative improvement of 46.2% in Pass@1 on five code generation benchmarks compared to the best-performing baseline (Fine-tuning Full).",
    458       "evidence": "Table 1 shows per-dataset improvements of 29.5%, 33.0%, 27.1%, 37.6%, and 103.8% over Fine-tuning (Full), averaging 46.2%.",
    459       "supported": "strong"
    460     },
    461     {
    462       "claim": "Training on revisions of model-generated error codes is more data-efficient than training on original dataset samples, even when using fewer total examples.",
    463       "evidence": "Table 3: DEED achieves 32.8% Pass@1 vs Raw D_train 25.8% using fewer training samples; also outperforms Human-revised D_train (28.0%).",
    464       "supported": "strong"
    465     },
    466     {
    467       "claim": "Revised codes are representationally closer to error codes in embedding space than dataset samples (Euclidean distance 6.39 vs 12.35), explaining DEED's efficiency.",
    468       "evidence": "Section 2 preliminary study using CodeGen-2B's final hidden layer representations on MBPP, computing Euclidean distances for three sets: errors, revisions, and dataset samples.",
    469       "supported": "moderate"
    470     },
    471     {
    472       "claim": "Self-Revise using the same base model under fine-tuning achieves better final model performance than using more powerful models like ChatGPT for revision.",
    473       "evidence": "Table 5: Self-Revise (FT) with base model yields M_θ* Pass@1=32.8%, vs ChatGPT-based revision 27.0% and GPT-3.5-turbo 29.0%.",
    474       "supported": "strong"
    475     },
    476     {
    477       "claim": "DEED consistently improves performance across LLMs of different sizes and architectures.",
    478       "evidence": "Table 2 shows relative improvements over fine-tuning: CodeGen-2B +27.1%, CodeGen-6B +25.6%, Llama-7B +32.5%, CodeLlama-7B +25.2%.",
    479       "supported": "strong"
    480     },
    481     {
    482       "claim": "Two iterations of DEED capture over 98% of the best Pass@1 observed within four iterations, with diminishing returns thereafter.",
    483       "evidence": "Table 4: Pass@1 on MBPP progresses 15.6% (0)→31.6% (1)→32.8% (2)→33.0% (3)→33.2% (4 iterations).",
    484       "supported": "moderate"
    485     }
    486   ],
    487   "methodology_tags": [
    488     "benchmark-eval"
    489   ],
    490   "key_findings": "DEED, a fine-tuning approach using error-driven learning, outperforms standard fine-tuning (full-parameter, LoRA) and prompting baselines by 27-104% relative improvement in Pass@1 under data-scarce conditions across five code generation benchmarks and four LLMs. The core insight is that revised model error codes are representationally closer to the original errors (embedding distance 6.39) than dataset samples (12.35), making them more efficient training targets that require smaller optimization steps. Counterintuitively, Self-Revise using the same base model under fine-tuning outperforms revision by stronger models like ChatGPT, because capability mismatches between reviser and student model produce training data misaligned with the base model's expectations. Two iterations of error-collect-revise-optimize capture nearly all available improvement.",
    491   "red_flags": [
    492     {
    493       "flag": "No code release",
    494       "detail": "No repository or implementation is released, making independent reproduction impossible despite detailed methodology description."
    495     },
    496     {
    497       "flag": "No statistical significance testing",
    498       "detail": "All comparisons are point estimates without significance tests. Small differences between settings (e.g., the 0.2pp Pass@1 steps across iterations 2-4 in Table 4: 32.8%→33.0%→33.2%) may not be statistically meaningful."
    499     },
    500     {
    501       "flag": "No variance reporting",
    502       "detail": "Results are averaged over five runs but no standard deviations are reported, hiding whether small differences between methods are reliable."
    503     },
    504     {
    505       "flag": "Benchmark contamination only partially addressed",
    506       "detail": "HumanEval and MBPP (both 2021) almost certainly appear in pretraining data for CodeGen (2023), Llama-2 (2023), and CodeLlama (2023). The main experiments do not control for this potential inflation of the reported numbers; contamination is addressed only through an additional EvoCodeBench evaluation in Appendix B."
    507     },
    508     {
    509       "flag": "Simulated low-resource scenarios",
    510       "detail": "Low-resource conditions are simulated by subsampling public benchmarks (40% of dataset) rather than using actual proprietary industrial datasets, weakening real-world applicability claims."
    511     },
    512     {
    513       "flag": "ChatGPT version unspecified",
    514       "detail": "ChatGPT is used as a revision baseline with no version date, snapshot identifier, or API version, making this comparison irreproducible."
    515     },
    516     {
    517       "flag": "Small absolute gains on DataScience",
    518       "detail": "The 103.8% relative improvement headline on DataScience is 2.6%→5.3% Pass@1 absolute — very low absolute performance that raises questions about practical utility of the method on this domain."
    519     }
    520   ],
    521   "cited_papers": [
    522     {
    523       "title": "Evaluating Large Language Models Trained on Code (Codex/HumanEval)",
    524       "relevance": "Primary benchmark (HumanEval) and evaluation metric (Pass@k) used throughout; establishes the standard for LLM code generation evaluation"
    525     },
    526     {
    527       "title": "Program Synthesis with Large Language Models (MBPP)",
    528       "relevance": "Primary training and evaluation dataset used for most experiments including all iteration and ablation studies"
    529     },
    530     {
    531       "title": "Self-Refine: Iterative Refinement with Self-Feedback",
    532       "relevance": "Direct baseline compared in RQ1; representative prompting-based iterative code refinement approach that DEED aims to outperform"
    533     },
    534     {
    535       "title": "Teaching Large Language Models to Self-Debug",
    536       "relevance": "Direct baseline in RQ1; execution-feedback-based code refinement approach used for comparison"
    537     },
    538     {
    539       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    540       "relevance": "Parameter-efficient fine-tuning method used as both a baseline and as a component in DEED when full fine-tuning is too expensive"
    541     },
    542     {
    543       "title": "CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis",
    544       "relevance": "Primary base model (CodeGen-2B) for all main experiments including ablations"
    545     },
    546     {
    547       "title": "Code Llama: Open Foundation Models for Code",
    548       "relevance": "One of four LLMs tested for cross-architecture generalization in RQ2 and RQ5"
    549     },
    550     {
    551       "title": "Improving Code Generation by Training with Natural Language Feedback (ILF)",
    552       "relevance": "Related work using human feedback for code refinement training; DEED proposes automated alternative with comparable or superior performance"
    553     },
    554     {
    555       "title": "EvoCodeBench: An Evolving Code Generation Benchmark with Domain-Specific Evaluations",
    556       "relevance": "Additional benchmark used in Appendix B specifically chosen to address data contamination concerns in evaluation"
    557     },
    558     {
    559       "title": "Generalization or Memorization: Data Contamination and Trustworthy Evaluation for Large Language Models",
    560       "relevance": "Cited as motivation for using EvoCodeBench, acknowledging data leakage as a concern in benchmark evaluation"
    561     }
    562   ],
    563   "engagement_factors": {
    564     "practical_relevance": {
    565       "score": 2,
    566       "justification": "Addresses a real bottleneck — adapting LLMs to proprietary domains with limited labeled data — applicable to organizations in specialized industries."
    567     },
    568     "surprise_contrarian": {
    569       "score": 1,
    570       "justification": "The finding that self-revision with the same base model beats ChatGPT-based revision is counterintuitive and challenges the assumption that more powerful teachers always produce better fine-tuning data."
    571     },
    572     "fear_safety": {
    573       "score": 0,
    574       "justification": "No AI safety or risk concerns raised."
    575     },
    576     "drama_conflict": {
    577       "score": 0,
    578       "justification": "Standard ML systems paper with no controversial claims or field conflicts."
    579     },
    580     "demo_ability": {
    581       "score": 1,
    582       "justification": "Method is clearly described and implementable in principle, but no code is released and the base models tested (CodeGen-2B, Llama-7B) are now outdated relative to frontier models."
    583     },
    584     "brand_recognition": {
    585       "score": 1,
    586       "justification": "Peking University authors, published in ACM TOSEM — a respected SE venue. No major industry lab involvement."
    587     }
    588   },
    589   "hn_data": {
    590     "threads": [
    591       {
    592         "hn_id": "39651926",
    593         "title": "An all-optical general-purpose CPU and optical computer architecture",
    594         "points": 197,
    595         "comments": 103,
    596         "url": "https://news.ycombinator.com/item?id=39651926",
    597         "created_at": "2024-03-09T14:49:53Z"
    598       },
    599       {
    600         "hn_id": "33426789",
    601         "title": "Yoneda Hacking: The Algebra of Attacker Actions",
    602         "points": 9,
    603         "comments": 0,
    604         "url": "https://news.ycombinator.com/item?id=33426789",
    605         "created_at": "2022-11-01T20:10:20Z"
    606       },
    607       {
    608         "hn_id": "42496507",
    609         "title": "Online Advertising Is a Regrettable Necessity",
    610         "points": 6,
    611         "comments": 2,
    612         "url": "https://news.ycombinator.com/item?id=42496507",
    613         "created_at": "2024-12-23T18:27:49Z"
    614       },
    615       {
    616         "hn_id": "41961564",
    617         "title": "Easy real-time collision detection",
    618         "points": 4,
    619         "comments": 0,
    620         "url": "https://news.ycombinator.com/item?id=41961564",
    621         "created_at": "2024-10-27T11:06:23Z"
    622       },
    623       {
    624         "hn_id": "39610408",
    625         "title": "Polyamorous Scheduling is NP-hard",
    626         "points": 3,
    627         "comments": 0,
    628         "url": "https://news.ycombinator.com/item?id=39610408",
    629         "created_at": "2024-03-05T23:27:01Z"
    630       },
    631       {
    632         "hn_id": "39329353",
    633         "title": "Training microrobots to swim by a large language model",
    634         "points": 2,
    635         "comments": 1,
    636         "url": "https://news.ycombinator.com/item?id=39329353",
    637         "created_at": "2024-02-10T19:21:39Z"
    638       },
    639       {
    640         "hn_id": "41537027",
    641         "title": "Towards Battery-Free Wireless Sensing via Radio-Frequency Energy Harvesting",
    642         "points": 2,
    643         "comments": 0,
    644         "url": "https://news.ycombinator.com/item?id=41537027",
    645         "created_at": "2024-09-14T02:26:33Z"
    646       },
    647       {
    648         "hn_id": "39352140",
    649         "title": "Detecting Multimedia Generated by Large AI Models: A Survey",
    650         "points": 2,
    651         "comments": 0,
    652         "url": "https://news.ycombinator.com/item?id=39352140",
    653         "created_at": "2024-02-12T23:36:45Z"
    654       },
    655       {
    656         "hn_id": "45763351",
    657         "title": "VaultDB: A Real-World Pilot of SMPC Within a Clinical Research Network",
    658         "points": 1,
    659         "comments": 0,
    660         "url": "https://news.ycombinator.com/item?id=45763351",
    661         "created_at": "2025-10-30T18:24:42Z"
    662       },
    663       {
    664         "hn_id": "41981519",
    665         "title": "Easy real-time collision detection",
    666         "points": 1,
    667         "comments": 0,
    668         "url": "https://news.ycombinator.com/item?id=41981519",
    669         "created_at": "2024-10-29T09:41:11Z"
    670       }
    671     ],
    672     "top_points": 197,
    673     "total_points": 227,
    674     "total_comments": 106
    675   }
    676 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs