scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (34490B)
      1 {
      2   "paper": {
      3     "title": "Learning Code Preference via Synthetic Evolution",
      4     "authors": [
      5       "Jiawei Liu",
      6       "Thanh Nguyen",
      7       "Mingyue Shang",
      8       "Hantian Ding",
      9       "Xiaopeng Li",
     10       "Yu Yu",
     11       "Varun Kumar",
     12       "Zijian Wang"
     13     ],
     14     "year": 2024,
     15     "venue": "arXiv.org",
     16     "arxiv_id": "2410.03837",
     17     "doi": "10.48550/arXiv.2410.03837"
     18   },
     19   "scan_version": 3,
     20   "active_modules": ["experimental_rigor", "data_leakage"],
     21   "methodology_tags": ["benchmark-eval"],
     22   "key_findings": "CODEFAVOR, a framework for training pairwise code preference models from synthetic code evolution data (commits and LLM critiques), improves small model (7-12B) preference accuracy by up to 28.8%, matching models 6-9× larger at 34× lower cost. Human annotators outperform all models on code correctness preference (84.9%) but are sub-optimal for non-functional objectives like efficiency (74.9%) and especially security (59.7%). Code comments can negatively impact preference accuracy, and using different draft/critic models for data generation outperforms same-model setups.",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper states 'We release the data and code at https://github.com/amazon-science/llm-code-preference' in Section 1 (Contribution #2) and provides a project page at https://llm-code-preference.github.io."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The paper releases CODEPREFBENCH (1,364 tasks) and training datasets (Commit-Instruct-EditPack and Critic-Evol-SOSS) at the GitHub repository. The underlying benchmarks (EvalPlus, EvalPerf, CyberSecEval, LBPP, BigCodeBench) are all publicly available."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Appendix A.2 mentions 'eight NVIDIA A100-40G GPUs based on Axolotl using DeepSpeed ZeRO-3 offloading and flash attention' and A.3 lists specific vLLM versions (v0.5.1, v0.6.1.post2, v0.5.3.post1). However, no Python version, PyTorch version, CUDA version, requirements.txt, or Dockerfile is provided — insufficient to recreate the environment."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub repository is linked but the paper itself contains no README-style commands or 'Reproducing Results' section."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The ± numbers in Table 3 are not confidence intervals or error bars from multiple experimental runs. The table caption states 'Bracketed numbers denote the ranges of uncertain responses, half of whose ratio is accounted for the final accuracy score' — these represent ambiguity in tied/undecidable responses, not statistical uncertainty from repeated measurements."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No statistical significance tests are used anywhere in the paper. Claims like 'CODEFAVOR improves their overall performance by 9.3∼28.8%' and all model comparisons are based on raw accuracy numbers with no p-values, t-tests, or any hypothesis testing."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "The paper consistently reports effect sizes with baseline context: 'CODEFAVOR improves their overall performance by 9.3∼28.8% relatively' (Section 3.3), '6∼9× more parameters while being cheaper by 34×' (Table 4), and percentage improvements in each category with before/after values visible in Tables 3 and 5."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No power analysis or justification for benchmark size (1,364 tasks), number of human annotators (18), or annotations per task (3). The training dataset sizes (20,641 and 41,595 samples) are not justified either. Appendix A.6 acknowledges the 'modest' training scale but does not justify the evaluation sample size."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "CODEFAVOR model results in Tables 3 and 5 report single-run numbers with no standard deviation, variance, or spread measure across runs. The paper does not state how many training runs were performed or report any inter-run variability."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Table 3 compares CODEFAVOR against 10 existing LLMs (proprietary and open-weight), human developer agreement, and base models before fine-tuning. Both strong (Llama-3.1-405B, Mistral Large 2) and size-matched baselines are included."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Baselines include contemporary models as of 2024: Claude 3.5 Sonnet, Gemini 1.5 Pro/Flash, Llama-3.1-405B-Instruct, Mistral Large 2, DeepSeek V2.5. These are competitive and recent."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Section 3.4 presents comprehensive controlled experiments: Table 5 ablates training data sources (Commit-Instruct vs Critic-Evol vs mixture vs merging) and output design (classification vs generation). Table 6 ablates criteria and code comments. Table 7 ablates draft/critic model choices."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The paper evaluates across four distinct metrics: correctness accuracy, efficiency accuracy, security accuracy, and human preference alignment, plus cost-effectiveness (Table 4). Per-category and overall ('Avg.') scores are reported."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Section 3.2 reports a human evaluation with 18 software developers annotating code preferences. Each task received 3 annotations with majority voting. Developer expertise, confidence, and annotation time are analyzed."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "CODEPREFBENCH is constructed from separate benchmark sources (EvalPlus, EvalPerf, CyberSecEval, LBPP, BigCodeBench) entirely distinct from the training data (EditPackFT-Multi, Self-OSS-Instruct). Appendix A.5 confirms only 0.1-1.7% similarity overlap above threshold."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "All results tables (Tables 3, 5, 6, 7) break down performance by correctness, efficiency, security, and human preference. Individual model comparisons are shown for each category."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Appendix A.4 provides extensive case studies of faulty preferences from both LLMs and human annotators across correctness (Figures 6-8), efficiency (Figures 9-11), and security (Figures 12-15). Each case study analyzes the specific reasoning errors."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Table 6 shows code comments negatively impact preferences (6-10% correctness drop). Table 7 shows same draft/critic models lead to 2.5-9.4% performance drops. Section 3.4 reports that empty criteria 'substantially decrease' accuracy. Some configurations underperform baselines."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Abstract claims of 'up to 28.8%' improvement (Table 3: Gemma-2-9B from 60.1% to 77.4% average accuracy, a 28.8% relative gain), '6∼9× more parameters' matching (7-12B matching 70B), '34× more cost-effective' (Table 4), and '15.1∼40.3% of tasks remain unsolved' by humans (Table 3: 84.9% correctness, 59.7% security) are all supported by results."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The causal claim 'CODEFAVOR improves accuracy' is supported by before/after fine-tuning comparisons on the same base models, plus controlled ablations isolating individual factors (data source, modeling approach, criteria, comments, draft/critic models) in Tables 5-7. The ablation design provides controlled single-variable manipulation."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The title 'Learning Code Preference' and abstract frame results broadly for code generation, but all experiments are Python-only (all benchmarks: EvalPlus, EvalPerf, CyberSecEval, LBPP, BigCodeBench are Python). The paper does not bound claims to Python. Appendix A.6 mentions 'self-contained code snippets' limitation but never mentions the Python-only scope."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper does not discuss alternative explanations for why CODEFAVOR works beyond the proposed mechanisms. No confound analysis is provided — for example, whether improvements come from data quantity rather than data quality, or whether the synthetic data teaches surface-level heuristics rather than genuine code understanding."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper clearly defines what is measured: accuracy on CODEPREFBENCH tasks with verifiable oracles (test execution for correctness, CPU instructions for efficiency, static analyzer for security) plus human agreement. The framing matches the measurement granularity — they say 'code preference accuracy' rather than claiming broader code quality alignment."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Many models lack specific version identifiers: 'Claude 3.5 Sonnet' without snapshot date, 'GPT-4o' without version string (used for security data generation), 'DeepSeek V2.5' without version date. The schema states marketing names without snapshot dates do not count. Only the Gemini models include '001' version suffixes; separately, the vLLM versions are specified in A.3."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Full prompt text is provided: Listing 1 shows the evaluation prompt template, Figure 3 shows complete Commit-Instruct prompts with examples, Figure 5 shows Critic-Evol prompts with few-shot examples. Appendix A.1 provides detailed prompt implementations."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Table 8 lists training hyperparameters (batch size 32, sequence length 2048, learning rate 5e-6, cosine annealing with 40 warmup steps). Section 3.1 states temperature 0.8 for sampling and greedy decoding for evaluation. Appendix A.3 specifies bfloat16 precision."
    162       },
    163       "scaffolding_described": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "No agentic scaffolding is used. The approach is a standard fine-tuning pipeline with direct model inference."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 2.3 documents preprocessing: positional bias augmentation by flipping code pairs (doubling samples), code comment clipping in Critic-Evol, filtering of non-permissive code from EditPackFT-Multi (retaining 22,469 commits from the larger set). Filtering rates are reported: 8.1% filtered in Commit-Instruct, 17.9% in Critic-Evol."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Appendix A.6 'Limitation and Future Work' provides a dedicated section discussing three specific limitations: synthetic data scale, self-contained code focus, and benchmark diversity."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Appendix A.6 discusses specific threats: the 62,236 training sample size 'may be modest for model fine-tuning,' the focus on 'self-contained code snippets' limits real-world applicability, and CODEPREFBENCH has 'potential limitations related to the diversity and practicality of candidate code samples due to their synthetic nature.'"
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "While A.6 notes self-contained code and benchmark limitations, the paper does not explicitly state what the results do NOT show or what claims the authors are NOT making. The Python-only scope is never acknowledged. There is no explicit bounding of the generalizability of results to specific settings."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "The paper releases both CODEPREFBENCH evaluation data and synthetic training datasets (Commit-Instruct-EditPack, Critic-Evol-SOSS) at https://github.com/amazon-science/llm-code-preference, enabling independent verification."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Sections 2.2.1-2.2.2 describe data generation in detail (Commit-Instruct from EditPackFT-Multi commits, Critic-Evol from Self-OSS-Instruct with Llama3-8B/70B). Section 3.1 describes benchmark construction from specific sources with specific procedures for each category."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "Section 3.2 describes annotator expertise (18 developers, 2/3 CS degrees, 95% with 2+ years experience, 43% self-rated advanced Python) but does not describe how they were recruited — whether internal employees, external contractors, what channels were used, or whether the recruitment method could introduce bias."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "The full pipeline is documented with counts at each stage: Commit-Instruct starts with 22,469 commits → 91.9% transformed → 20,641 samples; Critic-Evol starts with 50,661 instructions → 82.1% revised → 41,595 samples. CODEPREFBENCH construction details are provided per category with final counts."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No formal funding disclosure or acknowledgment of grants/sponsors. The paper notes 'Work done during a research internship at AWS AI Labs' as a footnote but has no funding statement."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Author affiliations are clearly listed: University of Illinois Urbana-Champaign and AWS AI Labs, with corresponding email addresses at illinois.edu and amazon.com."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "The work was conducted at AWS AI Labs. AWS/Amazon has direct commercial interest in code generation tools (Amazon CodeWhisperer, mentioned in the introduction). The research develops techniques that could improve AWS's code generation products, meaning the funder has a financial stake in the outcome."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests statement or financial interest declaration is included anywhere in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No training data cutoff dates are stated for any of the evaluated models (Claude 3.5 Sonnet, Llama-3.1-405B, GPT-4o, etc.). This makes it impossible to assess whether benchmark examples appeared in the models' training data."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": true,
    243         "answer": true,
    244         "justification": "Appendix A.5 analyzes overlap between CODEFAVOR's training data and CODEPREFBENCH using Levenshtein similarity, finding only 0.1-1.7% of test code has similarity score >80 with training code. Figure 16 provides CDF distributions."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "While A.5 addresses contamination between CODEFAVOR training data and CODEPREFBENCH, it does not address whether the base LLMs (Llama, Claude, GPT-4o, etc.) have seen the underlying benchmark problems (HumanEval published 2021, MBPP, CyberSecEval) in their pre-training data. Only CODEFAVOR-specific contamination is checked."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": true,
    255         "answer": false,
    256         "justification": "No pre-registration is mentioned. The human annotation study with 18 developers was not registered on OSF, AsPredicted, or any other platform."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": true,
    260         "answer": false,
    261         "justification": "No IRB or ethics board approval is mentioned despite collecting data from 18 human developers performing annotation tasks."
    262       },
    263       "demographics_reported": {
    264         "applies": true,
    265         "answer": true,
    266         "justification": "Section 3.2 reports annotator demographics: 18 software developers, two-thirds hold CS degrees, 95% have 2+ years programming experience, 43% self-rated as advanced Python, remainder middle-level."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": true,
    270         "answer": false,
    271         "justification": "The paper states '18 software developers' were used but provides no inclusion or exclusion criteria, screening process, or eligibility requirements for annotator selection."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "This is an annotation/labeling study, not an experimental study with treatment/control conditions. All annotators perform the same task. Randomization of participants to conditions does not apply."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "This is a labeling study, not an experimental study with conditions requiring blinding. The code pair order is shuffled to prevent positional bias (Section 3.1), but traditional blinding is not applicable to this study design."
    282       },
    283       "attrition_reported": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "For human preference tasks, 148+161=309 tasks were sampled but only 145 preference pairs were obtained 'without conflicting preferences out of three annotations per pair.' The paper does not report the full attrition: how many were ties, how many had conflicting preferences, or how many annotators dropped individual tasks."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Table 4 reports normalized per-sample costs and dollar amounts: human preference costs $6.1 per task, Llama-3.1-405B is 1.2×10³ normalized units, CODEFAVOR (Mistral Nemo) is 1 unit — 34× cheaper than the Llama-3-70B model."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "The paper mentions 8× NVIDIA A100-40G GPUs for training (Appendix A.2) but does not report total GPU hours, total training time, or total API spend for synthetic data generation with GPT-4o and Llama models."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No mention of multiple random seeds. CODEFAVOR model results appear to be from single training runs with no seed sensitivity analysis."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The number of training runs is never stated. Results in Tables 3 and 5 are presented without indicating how many runs produced them."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No hyperparameter search budget is reported. Table 8 lists fixed hyperparameters (mostly following Dong et al., 2024) with only a note about a lower learning rate for Gemma-2, but the number of configurations tried is not stated."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": true,
    320         "justification": "Tables 5-7 transparently report all configurations tried (Commit-Instruct, Critic-Evol, Data Mixture, Model Merging) × (Classification, Generation) for all four base models. The final recommended approach (model merging) is selected based on comparing all variants."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": false,
    324         "answer": false,
    325         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The authors train CODEFAVOR models and evaluate them on their own CODEPREFBENCH benchmark without acknowledging author-evaluation bias or potential systematic advantages in the benchmark design."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "Comparisons between 7-12B CODEFAVOR models and 70-405B baselines are not normalized by compute. Table 4 compares cost-effectiveness but does not report performance as a function of compute budget for training."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "The paper does not discuss whether CODEPREFBENCH actually measures code preference as experienced in real-world development. There is no analysis of construct validity — whether comparing code pairs with verifiable oracles (test execution, static analysis) adequately captures the broader concept of code preference."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": false,
    344         "answer": false,
    345         "justification": "No scaffolding is involved in the evaluation. Models are evaluated via direct prompting or fine-tuned classification/generation."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the evaluated models' training data includes solutions to HumanEval (published 2021), MBPP, or other benchmarks underlying CODEPREFBENCH. The A.5 analysis only covers CODEFAVOR training data overlap, not temporal leakage from base model pre-training."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of whether the evaluation setup leaks answer information. For example, the criterion statement provided to models could hint at the correct answer — a security criterion implies one code is insecure, potentially biasing the model. This is not analyzed."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": true,
    362         "justification": "Appendix A.5 analyzes independence between training and evaluation data using Levenshtein similarity scores, finding low overlap (0.1-1.7% above 80 similarity). They note seed datasets were decontaminated upon creation."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": true,
    367         "justification": "Appendix A.5 applies Levenshtein similarity scoring following Riddell et al. (2024), measuring top-1 similarity between all training-evaluation code pairs. Figure 16 visualizes the CDF of similarity scores across positive/negative sample combinations."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "CODEFAVOR improves the accuracy of model-based code preferences by up to 28.8%",
    374       "evidence": "Table 3 shows Gemma-2-9B-Instruct improves from 60.1% to 77.4% average accuracy with CODEFAVOR generation + model merging, a 28.8% relative improvement. Similar improvements observed across all four base models (9.3-28.8% relative).",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "CODEFAVOR models match the performance of models 6-9× larger while being 34× more cost-effective",
    379       "evidence": "Table 3 shows CODEFAVOR 7-12B models achieving 76.9-77.7% average, slightly outperforming Llama-3-70B-Instruct (76.1%). Table 4 shows CODEFAVOR (Mistral Nemo) at 1 normalized cost unit vs Llama-3-70B at 34.1 units.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Human preference is more accurate for code correctness but sub-optimal for non-functional objectives",
    384       "evidence": "Table 3 shows human agreement at 84.9% for correctness (best by 23% over models) but 74.9% for efficiency (below Mistral Large 2 at 81.2%) and 59.7% for security (far below most models scoring 90%+).",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Despite spending 23.4 person-minutes per task, 15.1-40.3% of tasks remain unsolved by humans",
    389       "evidence": "Section 3.2 reports 7.8 minutes average per annotator × 3 annotators = 23.4 person-minutes. Unsolved rates: 15.1% correctness (100% − 84.9%), 25.1% efficiency, 40.3% security. However, the security figure is inflated by the 73.9% of pairs that were annotated as equally secure, each of which receives 0.5 credit.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Code comments negatively impact model preferences, possibly due to LLMs' self-bias",
    394       "evidence": "Table 6 shows 6.2-10.4% correctness drops when evaluating with comments on models trained without comments, and 6-7% overall drops when both training and evaluating with comments.",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "Using different draft and critic models in Critic-Evol is better than using the same model",
    399       "evidence": "Table 7 shows 2.5-9.4% overall performance drop when using same draft/critic models, and higher filtering rates (21.6-27.2% vs 17.9%) indicating self-bias in same-model setups.",
    400       "supported": "moderate"
    401     }
    402   ],
    403   "red_flags": [
    404     {
    405       "flag": "AWS employees evaluating their own framework",
    406       "detail": "Seven of eight authors are from AWS AI Labs. They designed both the training framework (CODEFAVOR) and the evaluation benchmark (CODEPREFBENCH), creating potential self-comparison bias. No independent evaluation was conducted and no competing interests statement is included."
    407     },
    408     {
    409       "flag": "No statistical significance testing",
    410       "detail": "Despite extensive comparative claims ('improves by up to 28.8%', 'matching models 6-9× larger'), no statistical significance tests are performed. All comparisons rely on point estimates from what appear to be single runs. Results within 1-2 percentage points are treated as meaningfully different."
    411     },
    412     {
    413       "flag": "Uncertain response handling inflates some scores",
    414       "detail": "The ± ranges in Table 3 represent uncertain/tied responses credited at 0.5, not experimental variance. This makes score differences harder to interpret. For example, security scores for Gemma-2-9B range from 5.4% to 100% depending on tie handling (52.7% ±47.3%)."
    415     },
    416     {
    417       "flag": "Human annotation attrition not fully transparent",
    418       "detail": "309 tasks were generated for human preference annotation but only 145 pairs survived. The paper says 'without conflicting preferences' but does not break down how many were ties, how many had conflicts, or analyze potential bias in the surviving subset."
    419     },
    420     {
    421       "flag": "Python-only evaluation presented as general code preference",
    422       "detail": "All benchmarks (EvalPlus, EvalPerf, CyberSecEval, LBPP, BigCodeBench) and training data are Python-only, but the paper's title and claims are framed as general 'code preference' without bounding to Python."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "Evaluating large language models trained on code",
    428       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    429       "year": 2021,
    430       "relevance": "Foundational HumanEval benchmark for evaluating LLM code generation, used as a source for CODEPREFBENCH correctness tasks."
    431     },
    432     {
    433       "title": "Is your code generated by chatgpt really correct? Rigorous evaluation of large language models for code generation",
    434       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    435       "year": 2023,
    436       "relevance": "EvalPlus benchmark providing rigorous test cases for LLM code evaluation, directly used to construct CODEPREFBENCH correctness tasks."
    437     },
    438     {
    439       "title": "CodeUltraFeedback: An LLM-as-a-judge dataset for aligning large language models to coding preferences",
    440       "authors": ["Martin Weyssow", "Aton Kamanda", "Houari Sahraoui"],
    441       "year": 2024,
    442       "arxiv_id": "2403.09032",
    443       "relevance": "Closely related work on using LLMs as judges for code preferences; CODEFAVOR extends beyond prompting to training dedicated preference models."
    444     },
    445     {
    446       "title": "LLM critics help catch LLM bugs",
    447       "authors": ["Nat McAleese", "Rai Michael Pokorny", "Juan Felipe Ceron Uribe"],
    448       "year": 2024,
    449       "arxiv_id": "2407.00215",
    450       "relevance": "CriticGPT work on using LLMs to catch code bugs; confirms findings about human preference being imperfect, related to the Critic-Evol approach."
    451     },
    452     {
    453       "title": "Training language models to follow instructions with human feedback",
    454       "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"],
    455       "year": 2022,
    456       "relevance": "Foundational RLHF work establishing human preference as the standard for LLM alignment, which this paper challenges for code domains."
    457     },
    458     {
    459       "title": "Direct preference optimization: Your language model is secretly a reward model",
    460       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"],
    461       "year": 2023,
    462       "relevance": "DPO method for preference optimization that could directly use CODEFAVOR's preference data for code LLM alignment."
    463     },
    464     {
    465       "title": "Purple llama CyberSecEval: A secure coding benchmark for language models",
    466       "authors": ["Manish Bhatt", "Sahana Chennabasappa", "Cyrus Nikolaidis"],
    467       "year": 2023,
    468       "arxiv_id": "2312.04724",
    469       "relevance": "Code security benchmark providing vulnerability-labeled code pairs used to construct CODEPREFBENCH security tasks."
    470     },
    471     {
    472       "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions",
    473       "authors": ["Terry Yue Zhuo", "Minh Chien Vu", "Jenny Chim"],
    474       "year": 2024,
    475       "arxiv_id": "2406.15877",
    476       "relevance": "Code generation benchmark used as a source for CODEPREFBENCH human preference tasks."
    477     },
    478     {
    479       "title": "Evaluating language models for efficient code generation",
    480       "authors": ["Jiawei Liu", "Songrun Xie", "Junhao Wang"],
    481       "year": 2024,
    482       "relevance": "EvalPerf benchmark evaluating LLM code efficiency, used to construct CODEPREFBENCH efficiency tasks with performance-exercising test inputs."
    483     },
    484     {
    485       "title": "Magicoder: Empowering code generation with OSS-Instruct",
    486       "authors": ["Yuxiang Wei", "Zhe Wang", "Jiawei Liu"],
    487       "year": 2024,
    488       "relevance": "Code instruction tuning methodology related to synthetic data generation approaches used in CODEFAVOR."
    489     },
    490     {
    491       "title": "WizardCoder: Empowering code large language models with Evol-Instruct",
    492       "authors": ["Ziyang Luo", "Can Xu", "Pu Zhao"],
    493       "year": 2024,
    494       "relevance": "Evol-Instruct technique for code LLM training, related to CODEFAVOR's evolutionary approach for synthetic data generation."
    495     },
    496     {
    497       "title": "Quantifying contamination in evaluating code generation capabilities of language models",
    498       "authors": ["Martin Riddell", "Ansong Ni", "Arman Cohan"],
    499       "year": 2024,
    500       "arxiv_id": "2403.04811",
    501       "relevance": "Contamination quantification methodology applied in this paper's Appendix A.5 to measure training-evaluation data overlap."
    502     }
    503   ],
    504   "engagement_factors": {
    505     "practical_relevance": {
    506       "score": 2,
    507       "justification": "Released framework and data for training code preference models, useful for teams building code evaluation or RLHF pipelines for code LLMs."
    508     },
    509     "surprise_contrarian": {
    510       "score": 1,
    511       "justification": "Finding that human developers perform poorly on security preference (59.7%) and are sub-optimal for non-functional properties is mildly surprising but not paradigm-shifting."
    512     },
    513     "fear_safety": {
    514       "score": 0,
    515       "justification": "No AI safety or security concerns raised; the paper is about code preference evaluation methodology."
    516     },
    517     "drama_conflict": {
    518       "score": 0,
    519       "justification": "No controversy or conflict angle; the paper presents constructive methodology work."
    520     },
    521     "demo_ability": {
    522       "score": 1,
    523       "justification": "Code and data released on GitHub but not a pip-installable tool or interactive demo."
    524     },
    525     "brand_recognition": {
    526       "score": 1,
    527       "justification": "AWS AI Labs is a recognized lab but not at the viral recognition level of OpenAI or Anthropic for AI code tools."
    528     }
    529   }
    530 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs