scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26540B)
      1 {
      2   "paper": {
      3     "title": "Qwen2.5-Coder Technical Report",
      4     "authors": ["Binyuan Hui", "Jian Yang", "Zeyu Cui", "Jiaxi Yang", "Dayiheng Liu", "Lei Zhang", "Tianyu Liu", "Jiajun Zhang", "Bowen Yu", "Keming Lu", "Kai Dang", "Yang Fan", "Yichang Zhang", "An Yang", "Rui Men", "Fei Huang", "Bo Zheng", "Yibo Miao", "Shanghaoran Quan", "Yunlong Feng", "Xingzhang Ren", "Xuancheng Ren", "Jingren Zhou", "Junyang Lin"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2409.12186"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "checklist": {
     12     "artifacts": {
     13       "code_released": {
     14         "applies": true,
     15         "answer": true,
     16         "justification": "Model weights released on HuggingFace (Table 4, Table 15) and evaluation code released on GitHub: https://github.com/QwenLM/Qwen2.5-Coder (Section 6, Section 7)."
     17       },
     18       "data_released": {
     19         "applies": true,
     20         "answer": false,
     21         "justification": "The pretraining dataset 'Qwen2.5-Coder-Data' (5.5T tokens) is not released. Only the model weights and evaluation code are public. The instruction tuning data is also not released."
     22       },
     23       "environment_specified": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "No requirements.txt, Dockerfile, or environment specification is mentioned in the paper. Hardware and software setup for training/evaluation are not detailed."
     27       },
     28       "reproduction_instructions": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "While evaluation code is released on GitHub, there are no step-by-step reproduction instructions for the training process or detailed evaluation setup in the paper itself."
     32       }
     33     },
     34     "statistical_methodology": {
     35       "confidence_intervals_or_error_bars": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "All results across all tables (Tables 5-20) are reported as single point estimates with no confidence intervals, error bars, or ± notation."
     39       },
     40       "significance_tests": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper makes numerous claims that Qwen2.5-Coder 'outperforms' other models based solely on comparing raw numbers. No statistical significance tests are reported anywhere."
     44       },
     45       "effect_sizes_reported": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper reports absolute scores with baseline context throughout (e.g., 'Qwen2.5-Coder-32B achieves state-of-the-art performance with an average improvement of 7.9% EM and 4.2% ES compared to DS-Coder-33B-Base' in Section 6.2). Percentage improvements are given with baselines."
     49       },
     50       "sample_size_justified": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No justification is provided for the choice of benchmarks, number of test examples, or why these particular evaluation sets are sufficient to support the claims."
     54       },
     55       "variance_reported": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "No variance, standard deviation, or spread measures are reported for any results. All numbers appear to be single-run results with no indication of result stability."
     59       }
     60     },
     61     "evaluation_design": {
     62       "baselines_included": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Extensive baselines are included: StarCoder2 series, DeepSeek-Coder series, CodeLlama series, CodeQwen1.5, CodeStral, Yi-Coder, and closed-source APIs (GPT-4o, Claude 3.5 Sonnet, o1-mini/preview)."
     66       },
     67       "baselines_contemporary": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "Baselines include contemporary models such as DeepSeek-Coder-V2 (2024), Claude-3.5-Sonnet (2024), GPT-4o (2024), and o1-mini/preview. These represent the state of the art at time of publication."
     71       },
     72       "ablation_study": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Table 3 shows an ablation of data mixture ratios (100:0:0, 85:10:5, 70:20:10 for Code:Text:Math). Figure 1 shows the effect of different data cleaning stages on performance."
     76       },
     77       "multiple_metrics": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Multiple metrics are used: Pass@1, Pass@2, Exact Match (EM), Edit Similarity (ES), win rate (CodeArena, CodeEditorBench). Results span code generation, completion, reasoning, editing, and math tasks."
     81       },
     82       "human_evaluation": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Section 7.1 describes CodeArena, 'an internal annotated evaluation benchmark called CodeArena, including nearly 400 human-curated samples' used for human preference alignment evaluation."
     86       },
     87       "held_out_test_set": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Standard held-out benchmarks are used (HumanEval, MBPP, etc.). LiveCodeBench uses problems from 2024.07-2024.11 specifically to avoid contamination. Section 5 describes decontamination procedures."
     91       },
     92       "per_category_breakdown": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Results are broken down per-language (MultiPL-E Table 6, 17), per-task type (code generation, completion, reasoning, editing), per-model size, and per-completion granularity (line, function, API in RepoEval Table 10)."
     96       },
     97       "failure_cases_discussed": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "No failure cases or error analysis is presented. The paper only reports success metrics without discussing where or why the model fails."
    101       },
    102       "negative_results_reported": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Table 3 shows that the 100:0:0 code-only data mixture performs worse than mixed ratios, which is a somewhat surprising negative finding. The paper reports that 'larger models did not yield significant benefits' for the text-code grounding data filter (Section 3.1.1)."
    106       }
    107     },
    108     "claims_and_evidence": {
    109       "abstract_claims_supported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The abstract claims SOTA performance 'across more than 10 benchmarks' and 'consistently outperforming larger models of the same model size.' The results tables (5-20) support these claims with the 32B model matching or exceeding GPT-4o on most benchmarks."
    113       },
    114       "causal_claims_justified": {
    115         "applies": true,
    116         "answer": false,
    117         "justification": "The paper makes causal claims like 'Math and Text data may positively contribute to code performance' (Section 3.1.2) based on comparing three data mixtures. However, only three ratios are tested with no variance reporting or significance tests, making the causal inference weak."
    118       },
    119       "generalization_bounded": {
    120         "applies": true,
    121         "answer": false,
    122         "justification": "The title and abstract frame results broadly ('code generation capabilities', 'code intelligence'). While results are on specific benchmarks, claims like 'the most powerful open-source code model to date' (Section 7.1) are not bounded to the tested benchmarks."
    123       },
    124       "alternative_explanations_discussed": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "No alternative explanations are discussed for the performance gains. The improvements could stem from scale, data quality, architecture, or other confounds, but these are not disentangled or discussed."
    128       },
    129       "proxy_outcome_distinction": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper measures pass@1 on coding benchmarks but frames results as 'code intelligence' and 'coding capabilities' broadly. No discussion of whether benchmark performance proxies real-world coding ability."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Exact model names with sizes are specified for all Qwen models. For closed-source APIs, specific versions are given: 'GPT-4o-2024-08-06', 'Claude-3.5-Sonnet-20240620', 'Claude-3.5-Sonnet-20241022' (Table 16)."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": false,
    144         "justification": "The paper mentions using 'a unified prompt template' for Text-to-SQL (Section 7.4) following prior work, but does not provide the actual prompts used for evaluations across all benchmarks."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Architecture hyperparameters are reported in Table 1. Training details include sequence lengths (8,192 for file-level, 32,768 for repo-level), RoPE base frequency (10,000 → 1,000,000), and context limits for evaluation (e.g., 'maximum sequence length of 8192 tokens', 'maximum output length of 50 tokens'). FIM format is specified."
    150       },
    151       "scaffolding_described": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No agentic scaffolding is used. The paper evaluates base and instruct models directly on benchmarks without scaffolding."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 3.1 describes data composition and cleaning in detail: rule-based filtering, coarse-to-fine hierarchical filtering with fastText classifiers, 4-stage iteration process (Figure 1), code removal from text data, and decontamination via 10-gram overlap (Section 5)."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "There is no limitations section in the paper. The conclusion (Section 9) mentions future directions but does not discuss limitations of the current work."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": false,
    171         "justification": "No threats to validity are discussed anywhere in the paper."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do NOT show or what settings are excluded."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "The pretraining data is not released. Evaluation results are presented as aggregated scores with no raw per-example outputs available."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 3.1 describes data collection: 'public repositories from GitHub created before February 2024, spanning 92 programming languages', Common Crawl text-code data, synthetic data from CodeQwen1.5, and data from Qwen2.5-Math and Qwen2.5."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": false,
    192         "answer": false,
    193         "justification": "No human participants in the study. Data sources are standard benchmarks and public repositories."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The data pipeline is documented: Figure 2 shows the three-stage training pipeline, Section 3.1.1 describes filtering stages with validation (Figure 1 shows token counts at each stage: 582B → 370B → 147B → 118B), and Section 5 describes decontamination."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "No funding information or acknowledgments section is present in the paper."
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "All authors are listed as affiliated with 'Qwen Team, Alibaba Group' on the first page."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "Alibaba has a direct commercial interest in demonstrating that Qwen2.5-Coder is superior to competitors. The funder (Alibaba) has a clear stake in the outcomes."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests or financial interests statement is provided. Authors are Alibaba employees evaluating Alibaba's product but no explicit conflict declaration is made."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "Section 3.1.1 states 'public repositories from GitHub created before February 2024.' This establishes the training data cutoff for the code data."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": true,
    231         "answer": true,
    232         "justification": "Section 5 describes decontamination: 'We removed key datasets such as HumanEval, MBPP, GSM8K, and MATH. The filtering was done using a 10-gram overlap method.'"
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": true,
    236         "answer": true,
    237         "justification": "Section 5 explicitly addresses benchmark contamination with 10-gram decontamination. LiveCodeBench uses problems from 2024.07-2024.11 to 'strictly avoid test data contamination' (Section 8)."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No human participants study. CodeArena uses human-curated samples but is a benchmark, not a human subjects study."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants study."
    250       },
    251       "demographics_reported": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants study."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants study."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants study."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants study."
    270       },
    271       "attrition_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants study."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": false,
    281         "justification": "No inference cost, latency, or tokens consumed per evaluation is reported."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "Training compute budget (GPU hours, hardware, training time) is not stated despite training on 5.5T tokens across six model sizes."
    287       }
    288     },
    289     "experimental_rigor": {
    290       "seed_sensitivity_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No results are reported across multiple random seeds. All results appear to be single-run."
    294       },
    295       "number_of_runs_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "The number of experimental runs is never stated for any evaluation."
    299       },
    300       "hyperparameter_search_budget": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No hyperparameter search budget is reported for training or evaluation. The data mixture ablation (Table 3) tests only 3 ratios with no search budget stated."
    304       },
    305       "best_config_selection_justified": {
    306         "applies": true,
    307         "answer": true,
    308         "justification": "Table 3 shows the data mixture selection with results for 3 configurations, and the selected 70:20:10 ratio is justified by its superior average performance across coding, math, and general benchmarks."
    309       },
    310       "multiple_comparison_correction": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied despite comparing across many benchmarks and model sizes."
    314       },
    315       "self_comparison_bias_addressed": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "Alibaba authors evaluate their own Qwen2.5-Coder models against competitors. No acknowledgment of self-evaluation bias. They note evaluation code is public, which partially mitigates but does not address the bias."
    319       },
    320       "compute_budget_vs_performance": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "No performance-vs-compute analysis. Different models have vastly different compute budgets (e.g., DS-Coder-V2 at 236B MoE vs Qwen2.5-Coder-32B dense) but this is not discussed."
    324       },
    325       "benchmark_construct_validity": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "No discussion of whether benchmarks like HumanEval, MBPP actually measure real-world coding ability. The paper uses benchmarks without questioning their construct validity."
    329       },
    330       "scaffold_confound_addressed": {
    331         "applies": false,
    332         "answer": false,
    333         "justification": "No scaffolding is used; models are evaluated directly on benchmarks. The Aider evaluation is the closest to scaffolded, but is a standard benchmark setup."
    334       }
    335     },
    336     "data_leakage": {
    337       "temporal_leakage_addressed": {
    338         "applies": true,
    339         "answer": true,
    340         "justification": "Section 5 addresses decontamination and Section 8 uses LiveCodeBench (2024.07-2024.11) specifically to test OOD capabilities after the training cutoff, directly addressing temporal leakage."
    341       },
    342       "feature_leakage_addressed": {
    343         "applies": true,
    344         "answer": false,
    345         "justification": "No discussion of whether evaluation setups leak information through context or hints not available in real usage scenarios."
    346       },
    347       "non_independence_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "No discussion of whether training repositories overlap with benchmark repositories or whether similar problems exist across train and test sets beyond the 10-gram overlap filter."
    351       },
    352       "leakage_detection_method": {
    353         "applies": true,
    354         "answer": true,
    355         "justification": "Section 5 describes a concrete decontamination method: '10-gram overlap method, where any training data with a 10-gram word-level overlap with the test data was removed.' This is an active prevention method."
    356       }
    357     }
    358   },
    359   "claims": [
    360     {
    361       "claim": "Qwen2.5-Coder-32B-Instruct achieves state-of-the-art performance among open-source code models, matching GPT-4o coding capabilities.",
    362       "evidence": "Table 16 shows Qwen2.5-Coder-32B-Instruct achieving 92.7 HumanEval (vs GPT-4o 92.1), 90.2 MBPP (vs 86.8), 49.6 BigCodeBench-Full (vs 50.1), 31.4 LiveCodeBench (vs 34.6).",
    363       "supported": "moderate"
    364     },
    365     {
    366       "claim": "A 70:20:10 Code:Text:Math data mixture outperforms code-only training.",
    367       "evidence": "Table 3 shows the 70:20:10 ratio achieving 55.0 average vs 31.3 for 100:0:0, tested on Qwen2.5-Coder-7B.",
    368       "supported": "moderate"
    369     },
    370     {
    371       "claim": "Qwen2.5-Coder-7B outperforms the larger DS-Coder-33B across code generation benchmarks.",
    372       "evidence": "Table 5 shows Qwen2.5-Coder-7B (61.6 HE, 53.0 HE+, 76.9 MBPP) exceeding DS-Coder-33B (54.9, 47.6, 74.2) on all five metrics.",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "The coarse-to-fine text-code filtering yields progressive improvement through 4 stages.",
    377       "evidence": "Figure 1 shows average performance on HumanEval/MBPP increasing from 41.6% to 46.8% across 4 filtering stages, tested on Qwen2.5-Coder-1.5B.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "Qwen2.5-Coder supports 128K token context length.",
    382       "evidence": "Section 6.6 and Figure 6 show 'Needle in the Code' evaluation demonstrating correct retrieval across 10K-128K context lengths, though the task is very simple (retrieving a single custom function).",
    383       "supported": "weak"
    384     }
    385   ],
    386   "methodology_tags": ["benchmark-eval"],
    387   "key_findings": "Qwen2.5-Coder is a series of six code-specific LLMs (0.5B-32B) built on Qwen2.5 and trained on 5.5T tokens. The 32B-Instruct model matches GPT-4o on many coding benchmarks while being open-source. A key finding is that a 70:20:10 Code:Text:Math data mixture outperforms code-only training. The models demonstrate strong scaling properties across model sizes, with each size achieving SOTA among open-source models of comparable scale.",
    388   "red_flags": [
    389     {
    390       "flag": "Company evaluating its own product",
    391       "detail": "All authors are Alibaba employees evaluating Alibaba's Qwen2.5-Coder. No independent evaluation or acknowledgment of self-evaluation bias. The paper consistently highlights favorable comparisons while not discussing failures."
    392     },
    393     {
    394       "flag": "No error bars or variance reporting",
    395       "detail": "Across 20+ tables and figures with hundreds of benchmark numbers, not a single result includes error bars, confidence intervals, or variance across runs. Claims of 'outperforming' are based entirely on comparing point estimates."
    396     },
    397     {
    398       "flag": "No limitations section",
    399       "detail": "The paper contains no discussion of limitations, threats to validity, or scope boundaries. This is unusual for a 32-page technical report and suggests selective presentation."
    400     },
    401     {
    402       "flag": "Missing compute budget",
    403       "detail": "Training six models on 5.5T tokens each represents massive compute, but no training time, GPU hours, or hardware specifications are reported, making it impossible to assess practical replicability."
    404     },
    405     {
    406       "flag": "Internal benchmark without public validation",
    407       "detail": "CodeArena is described as an 'internal annotated evaluation benchmark' with ~400 samples. Since it's not public, these results cannot be independently verified."
    408     },
    409     {
    410       "flag": "Cherry-picked favorable framing",
    411       "detail": "On LiveCodeBench the 32B model (31.4) trails GPT-4o (34.6) and o1-mini (60.0) substantially, yet the abstract claims the model 'matches the coding capabilities of GPT-4o' based on other benchmarks where it performs better."
    412     }
    413   ],
    414   "cited_papers": [
    415     {
    416       "title": "Evaluating large language models trained on code",
    417       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    418       "year": 2021,
    419       "arxiv_id": "2107.03374",
    420       "relevance": "Introduces HumanEval, the foundational code generation benchmark used throughout."
    421     },
    422     {
    423       "title": "StarCoder: may the source be with you!",
    424       "authors": ["Raymond Li", "Loubna Ben Allal"],
    425       "year": 2023,
    426       "arxiv_id": "2305.06161",
    427       "relevance": "Major open-source code LLM and baseline compared against."
    428     },
    429     {
    430       "title": "StarCoder 2 and The Stack v2: The Next Generation",
    431       "authors": ["Anton Lozhkov", "Raymond Li"],
    432       "year": 2024,
    433       "arxiv_id": "2402.19173",
    434       "relevance": "StarCoder2 series used as key baseline; data curation methodology influenced this work."
    435     },
    436     {
    437       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    438       "authors": ["Daya Guo", "Qihao Zhu"],
    439       "year": 2024,
    440       "arxiv_id": "2401.14196",
    441       "relevance": "DeepSeek-Coder series is the primary competitive baseline throughout the paper."
    442     },
    443     {
    444       "title": "Code Llama: Open Foundation Models for Code",
    445       "authors": ["Baptiste Rozière", "Jonas Gehring"],
    446       "year": 2023,
    447       "arxiv_id": "2308.12950",
    448       "relevance": "Code LLM baseline from Meta, compared across multiple benchmarks."
    449     },
    450     {
    451       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    452       "authors": ["Jiawei Liu", "Chunqiu Steven Xia"],
    453       "year": 2023,
    454       "relevance": "EvalPlus benchmark (HumanEval+, MBPP+) used as primary evaluation tool."
    455     },
    456     {
    457       "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
    458       "authors": ["Terry Yue Zhuo"],
    459       "year": 2024,
    460       "arxiv_id": "2406.15877",
    461       "relevance": "BigCodeBench used as challenging code generation benchmark evaluating tool-use capabilities."
    462     },
    463     {
    464       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    465       "authors": ["Naman Jain", "King Han"],
    466       "year": 2024,
    467       "arxiv_id": "2403.07974",
    468       "relevance": "Contamination-free code benchmark using competitive programming problems, key for addressing data leakage."
    469     },
    470     {
    471       "title": "MultiPL-E: A Scalable and Extensible Approach to Benchmarking Neural Code Generation",
    472       "authors": ["Federico Cassano"],
    473       "year": 2022,
    474       "arxiv_id": "2208.08227",
    475       "relevance": "Multilingual code generation benchmark used across multiple evaluations."
    476     },
    477     {
    478       "title": "CRUXEval: A Benchmark for Code Reasoning, Understanding and Execution",
    479       "authors": ["Alex Gu", "Baptiste Rozière"],
    480       "year": 2024,
    481       "arxiv_id": "2401.03065",
    482       "relevance": "Code reasoning benchmark used to evaluate whether models understand code execution flow."
    483     },
    484     {
    485       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
    486       "authors": ["Rafael Rafailov"],
    487       "year": 2023,
    488       "arxiv_id": "2305.18290",
    489       "relevance": "DPO training methodology used in the post-training alignment stage of Qwen2.5-Coder."
    490     },
    491     {
    492       "title": "Efficient Training of Language Models to Fill in the Middle",
    493       "authors": ["Mohammad Bavarian"],
    494       "year": 2022,
    495       "arxiv_id": "2207.14255",
    496       "relevance": "Fill-in-the-Middle (FIM) training strategy adopted for code completion capabilities."
    497     }
    498   ]
    499 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs