scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25858B)
      1 {
      2   "paper": {
      3     "title": "Saber: An Efficient Sampling with Adaptive Acceleration and Backtracking Enhanced Remasking for Diffusion Language Model",
      4     "authors": [
      5       "Yihong Dong",
      6       "Zhaoyu Ma",
      7       "Xue Jiang",
      8       "Zhiyuan Fan",
      9       "Jiaru Qian",
     10       "Yongmin Li",
     11       "Jianha Xiao",
     12       "Zhi Jin",
     13       "Rongyu Cao",
     14       "Binhua Li",
     15       "Fei Huang",
     16       "Yongbin Li",
     17       "Ge Li"
     18     ],
     19     "year": 2025,
     20     "venue": "arXiv",
     21     "arxiv_id": "2510.18165",
     22     "doi": "10.48550/arXiv.2510.18165"
     23   },
     24   "scan_version": 2,
     25   "active_modules": ["experimental_rigor", "data_leakage"],
     26   "checklist": {
     27     "artifacts": {
     28       "code_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Footnote on page 1 states 'Our code is available at https://github.com/zhaoyMa/Saber.' with a working URL provided."
     32       },
     33       "data_released": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "All datasets used (HumanEval, MBPP, HumanEval-ET, MBPP-ET, LiveCodeBench) are standard publicly available benchmarks. No proprietary data was created."
     37       },
     38       "environment_specified": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper mentions 'A6000 GPU (48GB)' and 'LLaDA-8B-Instruct' (§5.4, Appendix B.4) but provides no software dependency specifications (requirements.txt, conda env, library versions)."
     42       },
     43       "reproduction_instructions": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No step-by-step reproduction instructions are provided in the paper. Code is released but the paper itself contains no README-level guidance on how to replicate experiments."
     47       }
     48     },
     49     "statistical_methodology": {
     50       "confidence_intervals_or_error_bars": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "All tables (Tables 1-3) report point estimates only. Despite averaging over five trials (§5.4), no confidence intervals, error bars, or ± notation is provided."
     54       },
     55       "significance_tests": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The paper claims Saber 'outperforms' baselines (§6.1) based solely on comparing point estimates in Table 1 without any statistical significance tests."
     59       },
     60       "effect_sizes_reported": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper reports absolute Pass@1 scores alongside percentage improvements: 'boosts Pass@1 accuracy by an average improvement of 1.9%' and '251.4% inference speedup' (Abstract). Table 1 provides baselines and Saber scores allowing direct comparison."
     64       },
     65       "sample_size_justified": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No justification for why five trials were chosen, nor any power analysis. The number of benchmark problems per dataset is not discussed as a potential limitation."
     69       },
     70       "variance_reported": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "Section 5.4 states 'we report the average results of five trials' but no standard deviation, IQR, or any spread measure is reported. The reader cannot assess result stability."
     74       }
     75     },
     76     "evaluation_design": {
     77       "baselines_included": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Table 1 compares against 9 baselines: Random, Entropy, Confidence, Confidence (p=2), SAR, Fast-dLLM, Fast-dLLM (+parallel), ReMDM, and WINO."
     81       },
     82       "baselines_contemporary": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Baselines include recent 2025 methods: WINO (Hong et al., 2025), Fast-dLLM (Wu et al., 2025), ReMDM (Wang et al., 2025), and EB-Sampler (Ben-Hamu et al., 2025)."
     86       },
     87       "ablation_study": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Table 3 presents a thorough ablation study removing each component (adaptive acceleration, backtracking remasking) individually and together, plus a variant replacing dynamic thresholding."
     91       },
     92       "multiple_metrics": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Three metrics are reported: Pass@1 accuracy, average decoding Steps, and total generation Time (Table 1)."
     96       },
     97       "human_evaluation": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "Evaluation is entirely automated via pass/fail on test suites. Section 6.4 provides qualitative code examples but no systematic human evaluation of code quality."
    101       },
    102       "held_out_test_set": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Results are reported on standard benchmark test sets (HumanEval, MBPP, LiveCodeBench). LiveCodeBench specifically is described as 'contamination-free' (§5.1). The paper does not describe any tuning on these test sets, although it also does not state how hyperparameters were selected."
    106       },
    107       "per_category_breakdown": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Results are broken down across 5 benchmarks (HumanEval, MBPP, HumanEval-ET, MBPP-ET, LiveCodeBench) in Table 1, and across 3 different DLMs in Table 2."
    111       },
    112       "failure_cases_discussed": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 6.4 (Qualitative Analysis) and Figure 4 show side-by-side comparisons including failure cases of the default sampler vs Saber's correct outputs on specific HumanEval problems."
    116       },
    117       "negative_results_reported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The ablation study (Table 3) shows that removing backtracking causes severe quality collapse (45.1% → 35.2%), effectively reporting a negative result about purely aggressive acceleration."
    121       }
    122     },
    123     "claims_and_evidence": {
    124       "abstract_claims_supported": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Abstract claims of '1.9% average improvement' and '251.4% inference speedup' are supported by Table 1 results across benchmarks."
    128       },
    129       "causal_claims_justified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The ablation study (Table 3) provides controlled single-variable manipulation: removing adaptive acceleration or backtracking individually, supporting causal claims that each component contributes to performance."
    133       },
    134       "generalization_bounded": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The paper scopes claims to 'DLM sampling in code generation' (§6.1) and tests across 3 DLM architectures (Table 2) and 5 code benchmarks. The title specifically says 'Diffusion Language Model' rather than making broader claims."
    138       },
    139       "alternative_explanations_discussed": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No discussion of confounds or alternative explanations for the observed improvements. The paper does not consider whether improvements could be due to other factors beyond the proposed mechanisms."
    143       },
    144       "proxy_outcome_distinction": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The paper measures Pass@1 and claims it evaluates code generation correctness. Pass@1 directly measures functional correctness — no proxy gap exists between measurement and claim."
    148       }
    149     },
    150     "setup_transparency": {
    151       "model_versions_specified": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Specific model versions are stated: 'LLaDA-8B-Instruct' (§5.4), 'Dream-v0-Instruct-7B', and 'DiffuCoder-7B-cpGRPO' (§6.2, Table 2)."
    155       },
    156       "prompts_provided": {
    157         "applies": false,
    158         "answer": false,
    159         "justification": "The paper proposes a decoding/sampling algorithm for DLMs. Inputs are standard benchmark problems (docstrings/function signatures from HumanEval, MBPP, etc.) — no custom prompting is involved."
    160       },
    161       "hyperparameters_reported": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 5.4 and Appendix B.4 report: temperature=0, generation length=256, block length=128 for SAR, and hyperparameter µ for the backtracking mechanism."
    165       },
    166       "scaffolding_described": {
    167         "applies": false,
    168         "answer": false,
    169         "justification": "No agentic scaffolding is used. Saber is a decoding algorithm operating directly on the DLM's output probabilities."
    170       },
    171       "data_preprocessing_documented": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Appendix B.1 states 'For all datasets, tasks are presented in a zero-shot format.' Standard benchmarks are used as-is with no preprocessing."
    175       }
    176     },
    177     "limitations_and_scope": {
    178       "limitations_section_present": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Appendix C 'Limitation' provides a dedicated section discussing two specific limitations of the work."
    182       },
    183       "threats_to_validity_specific": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Appendix C identifies specific threats: (1) Saber demands slightly more computational resources per step, and (2) hyperparameter exploration was limited to reasonable ranges with room for further adjustment."
    187       },
    188       "scope_boundaries_stated": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The paper does not explicitly state what the results do NOT show. No mention that all benchmarks are Python-only, that models are limited to 7-8B scale, or that the method was not tested on natural language generation tasks."
    192       }
    193     },
    194     "data_integrity": {
    195       "raw_data_available": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No raw experimental data (per-trial results, per-problem pass/fail outcomes) is provided. Only aggregate results appear in tables."
    199       },
    200       "data_collection_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Section 5.1 and Appendix B.1 describe each benchmark dataset used, its purpose, and source. All are standard public benchmarks with well-documented provenance."
    204       },
    205       "recruitment_methods_described": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No human participants. All data comes from standard code generation benchmarks."
    209       },
    210       "data_pipeline_documented": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The pipeline is straightforward and documented: load benchmark problems → generate code with DLM using sampling method → execute against test cases → compute Pass@1. Sections 5.1-5.4 cover this."
    214       }
    215     },
    216     "conflicts_of_interest": {
    217       "funding_disclosed": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No acknowledgments section or funding disclosure. Authors are from Peking University and Tongyi Lab (Alibaba Group) but no funding sources are mentioned."
    221       },
    222       "affiliations_disclosed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Author affiliations are clearly listed: School of Computer Science, Peking University and Tongyi Lab, Alibaba Group."
    226       },
    227       "funder_independent_of_outcome": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No funding is disclosed, making independence assessment impossible. Alibaba-affiliated authors evaluate open-source DLMs, not Alibaba products, but the lack of any funding disclosure is a gap."
    231       },
    232       "financial_interests_declared": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No competing interests or financial interests statement is present in the paper."
    236       }
    237     },
    238     "contamination": {
    239       "training_cutoff_stated": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No training data cutoff dates are stated for LLaDA-8B-Instruct, Dream-v0-Instruct-7B, or DiffuCoder-7B-cpGRPO."
    243       },
    244       "train_test_overlap_discussed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No analysis of whether HumanEval (published 2021) or MBPP examples appeared in the training data of the models used."
    248       },
    249       "benchmark_contamination_addressed": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "LiveCodeBench is included as a 'contamination-free benchmark' (§5.1), but contamination risk for HumanEval and MBPP — the primary benchmarks — is not discussed despite these being widely available since 2021."
    253       }
    254     },
    255     "human_studies": {
    256       "pre_registered": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study."
    260       },
    261       "irb_or_ethics_approval": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "demographics_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "inclusion_exclusion_criteria": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       },
    276       "randomization_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study."
    280       },
    281       "blinding_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study."
    285       },
    286       "attrition_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants in this study."
    290       }
    291     },
    292     "cost_and_practicality": {
    293       "inference_cost_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Table 1 reports total generation time and average decoding steps for every method across all benchmarks. Wall-clock inference time is central to the paper's claims."
    297       },
    298       "compute_budget_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Appendix B.4 states: '8 NVIDIA A6000 GPUs (48GB each) and 1TB RAM.' Total generation times per experiment are reported in Table 1."
    302       }
    303     },
    304     "experimental_rigor": {
    305       "seed_sensitivity_reported": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "Results are averaged over 5 trials (§5.4) but no per-trial or per-seed variance is reported. The reader cannot assess seed sensitivity."
    309       },
    310       "number_of_runs_stated": {
    311         "applies": true,
    312         "answer": true,
    313         "justification": "Section 5.4 explicitly states: 'To mitigate the instability of the model sampling, we report the average results of five trials in the experiments.'"
    314       },
    315       "hyperparameter_search_budget": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "The hyperparameter µ controls backtracking aggressiveness but no search budget (how many configurations tried, search method) is reported. Appendix C acknowledges 'we only explore the choice of hyperparameters within reasonable ranges.'"
    319       },
    320       "best_config_selection_justified": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "The paper does not explain how the value of µ was selected or whether it was tuned on validation vs. test data."
    324       },
    325       "multiple_comparison_correction": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "Multiple comparisons are made across 8+ baselines and 5 benchmarks without any correction for multiple testing."
    329       },
    330       "self_comparison_bias_addressed": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "The authors implement and evaluate all baseline sampling methods themselves without acknowledging potential implementation bias (Lucic et al., 2018)."
    334       },
    335       "compute_budget_vs_performance": {
    336         "applies": true,
    337         "answer": true,
    338         "justification": "Table 1 reports both quality (Pass@1) and compute (Steps, Time) side-by-side for all methods, allowing direct speed-quality tradeoff comparison. Figure 1 also shows this tradeoff."
    339       },
    340       "benchmark_construct_validity": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether HumanEval, MBPP, or LiveCodeBench actually measure real-world code generation capability vs. narrow algorithmic puzzle-solving."
    344       },
    345       "scaffold_confound_addressed": {
    346         "applies": false,
    347         "answer": false,
    348         "justification": "No scaffolding is involved. The method is a decoding algorithm; all comparisons use the same model with different sampling strategies."
    349       }
    350     },
    351     "data_leakage": {
    352       "temporal_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of whether the DLMs' training data includes HumanEval or MBPP solutions, which were published in 2021 — well before these models were trained."
    356       },
    357       "feature_leakage_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of whether the evaluation setup leaks information that would not be available in real usage."
    361       },
    362       "non_independence_addressed": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No analysis of whether training and test data share structural similarities or overlap."
    366       },
    367       "leakage_detection_method": {
    368         "applies": true,
    369         "answer": false,
    370         "justification": "No concrete leakage detection method is applied. LiveCodeBench is contamination-free by design but no detection method was used by the authors for the other benchmarks."
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "Saber boosts Pass@1 accuracy by an average improvement of 1.9% over mainstream DLM sampling methods while achieving an average 251.4% inference speedup.",
    377       "evidence": "Table 1 shows Saber achieves highest Pass@1 across all 5 benchmarks (HumanEval 45.1%, MBPP 44.7%, LiveCodeBench 11.0%) with substantially fewer steps and lower time than standard sampling.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "Saber is model-agnostic and shows consistent improvements across different DLMs.",
    382       "evidence": "Table 2 shows improvements on LLaDA-8B-Instruct (43.3%→45.1%), Dream-v0-Instruct-7B (28.1%→29.3%), and DiffuCoder-7B-cpGRPO (56.7%→57.3%), all with reduced inference time.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Backtracking-enhanced remasking is essential for maintaining generation quality during aggressive acceleration.",
    387       "evidence": "Ablation study (Table 3): removing backtracking drops Pass@1 from 45.1% to 35.2% despite faster inference (65.67 steps vs 118.92), showing error propagation without the corrective mechanism.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Adaptive acceleration is the primary driver of inference speedup.",
    392       "evidence": "Table 3: removing adaptive acceleration reverts steps to 256 (from 118.92) and increases time from 41:55 to 1:32:33 while maintaining similar Pass@1 (44.5% vs 45.1%).",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "Saber significantly narrows the performance gap between DLMs and autoregressive models in code generation.",
    397       "evidence": "The abstract and conclusion make this claim, but no direct comparison with autoregressive models is provided in the results. DiffuCoder achieves 57.3% on HumanEval — the gap with ARMs is not quantified.",
    398       "supported": "weak"
    399     }
    400   ],
    401   "methodology_tags": ["benchmark-eval"],
    402   "key_findings": "Saber, a training-free sampling algorithm for diffusion language models, combines adaptive acceleration (dynamically adjusting unmasking rate based on evolving confidence) with backtracking-enhanced remasking (reverting likely-error tokens) to improve both speed and quality in code generation. On HumanEval, it achieves 45.1% Pass@1 (vs 43.3% for confidence-based sampling) while reducing inference time by ~70%. Ablation shows the two components are synergistic: acceleration alone causes quality collapse (35.2%), while backtracking alone loses all speed gains. The method generalizes across three DLM architectures.",
    403   "red_flags": [
    404     {
    405       "flag": "No variance reported despite averaging over 5 trials",
    406       "detail": "Section 5.4 states results are averaged over 5 trials, but no standard deviation, confidence intervals, or error bars are reported in any table. The 1.9% average improvement claim could fall within noise without this information."
    407     },
    408     {
    409       "flag": "Contamination risk unaddressed for primary benchmarks",
    410       "detail": "HumanEval (2021) and MBPP (2021) are the primary evaluation benchmarks. Models trained after 2021 likely saw these problems in training data. Only LiveCodeBench is noted as contamination-free, but it is not the main benchmark."
    411     },
    412     {
    413       "flag": "Very low absolute performance on LiveCodeBench",
    414       "detail": "All methods score 0-11% Pass@1 on LiveCodeBench (the contamination-free benchmark), compared to 35-45% on HumanEval. This raises questions about whether the higher HumanEval scores reflect genuine capability or contamination."
    415     },
    416     {
    417       "flag": "Narrow improvement margin without statistical testing",
    418       "detail": "The claimed 1.9% average improvement over baselines is small enough that it could be within noise. Without significance tests on 5-trial averages, it is impossible to determine if improvements are reliable."
    419     },
    420     {
    421       "flag": "Heavy self-citation",
    422       "detail": "At least 9 of the references are to papers by the first author (Dong et al., 2023a/b, 2024a/b/c, 2025a/b/c/d), raising questions about breadth of literature engagement."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "Large Language Diffusion Models",
    428       "authors": ["Shen Nie", "Fengqi Zhu", "Zebin You", "Xiaolu Zhang"],
    429       "year": 2025,
    430       "arxiv_id": "2502.09992",
    431       "relevance": "LLaDA is the primary model used in experiments; represents the state of the art in large-scale diffusion language models."
    432     },
    433     {
    434       "title": "Dream 7b: Diffusion large language models",
    435       "authors": ["Jiacheng Ye", "Zhihui Xie", "Lin Zheng"],
    436       "year": 2025,
    437       "arxiv_id": "2508.15487",
    438       "relevance": "One of three DLMs used to validate Saber's model-agnostic claims."
    439     },
    440     {
    441       "title": "DiffuCoder: Understanding and improving masked diffusion models for code generation",
    442       "authors": ["Shansan Gong", "Ruixiang Zhang", "Huangjie Zheng"],
    443       "year": 2025,
    444       "arxiv_id": "2506.20639",
    445       "relevance": "Code-specific diffusion model achieving the highest Pass@1 in experiments; relevant to DLM-based code generation."
    446     },
    447     {
    448       "title": "Evaluating large language models trained on code",
    449       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    450       "year": 2021,
    451       "arxiv_id": "2107.03374",
    452       "relevance": "Introduces HumanEval, the primary benchmark used in this paper and a foundational code generation evaluation dataset."
    453     },
    454     {
    455       "title": "Livecodebench: Holistic and contamination free evaluation of large language models for code",
    456       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    457       "year": 2024,
    458       "arxiv_id": "2403.07974",
    459       "relevance": "Contamination-free code generation benchmark used to validate Saber's generalization."
    460     },
    461     {
    462       "title": "Fast-dLLM: Training-free acceleration of diffusion LLM by enabling KV cache and parallel decoding",
    463       "authors": ["Chengyue Wu", "Hao Zhang", "Shuchen Xue"],
    464       "year": 2025,
    465       "arxiv_id": "2505.22618",
    466       "relevance": "Key baseline for efficient DLM sampling; represents the state of the art Saber compares against."
    467     },
    468     {
    469       "title": "Remasking discrete diffusion models with inference-time scaling",
    470       "authors": ["Guanghan Wang", "Yair Schiff", "Subham Sekhar Sahoo"],
    471       "year": 2025,
    472       "arxiv_id": "2503.00307",
    473       "relevance": "Proposes remasking for DLMs; directly related baseline that Saber's backtracking mechanism extends."
    474     },
    475     {
    476       "title": "Generalization or memorization: Data contamination and trustworthy evaluation for large language models",
    477       "authors": ["Yihong Dong", "Xue Jiang", "Huanyu Liu"],
    478       "year": 2024,
    479       "relevance": "Data contamination study by first author; relevant to benchmark validity and contamination concerns in LLM evaluation."
    480     },
    481     {
    482       "title": "Starcoder: may the source be with you!",
    483       "authors": ["Raymond Li", "Loubna Ben Allal", "Yangtian Zi"],
    484       "year": 2023,
    485       "arxiv_id": "2305.06161",
    486       "relevance": "Major open-source code LLM; relevant to the autoregressive baseline landscape that DLMs compete against."
    487     },
    488     {
    489       "title": "Code llama: Open foundation models for code",
    490       "authors": ["Baptiste Rozière", "Jonas Gehring", "Fabian Gloeckle"],
    491       "year": 2023,
    492       "arxiv_id": "2308.12950",
    493       "relevance": "Major code LLM representing the autoregressive paradigm that DLMs aim to rival."
    494     },
    495     {
    496       "title": "DeepSeek-Coder: When the large language model meets programming",
    497       "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang"],
    498       "year": 2024,
    499       "arxiv_id": "2401.14196",
    500       "relevance": "State-of-the-art code LLM; relevant to understanding the capability gap between autoregressive and diffusion approaches."
    501     }
    502   ]
    503 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs