ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (26444B)


      1 {
      2   "paper": {
      3     "title": "AnCoder: Anchored Code Generation via Discrete Diffusion Models",
      4     "authors": ["Anton Xue", "Litu Rout", "Constantine Caramanis", "Sanjay Shakkottai"],
      5     "year": 2026,
      6     "venue": "Preprint",
      7     "arxiv_id": "2602.17688"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL or code archive is provided anywhere in the paper, footnotes, or appendix. No mention of a GitHub link or Zenodo archive."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses publicly available datasets: HumanEval (MIT License) and MBPP (Apache 2.0), and training data from OpenCoder (opc-fineweb-code, opc-annealing-corpus). Appendix A.2 explicitly states 'All datasets used in this work are publicly available and licensed for research use.'"
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section is provided. The paper mentions using AdamW and cosine schedules but does not specify software versions, library dependencies, or hardware-software stack details beyond mentioning the Vista GPU Cluster."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided. While training hyperparameters are described in Section 5 and Appendix A, there are no concrete instructions a researcher could follow to replicate the experiments."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results in Tables 1, 2, 3, and 5 are reported as point estimates (e.g., '5.45%' Pass@1) with no confidence intervals, error bars, or ± notation."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims 'AnCoder consistently outperforms the MDLM and anchoring baselines' (Section 5.1) based on comparing raw numbers without any statistical significance tests (no p-values, no bootstrap tests, no paired tests)."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Section 5.1 reports improvements with baseline context: 'Pass@1 from 3.29% to 5.45% (+2.16 absolute, +65.7% relative)' and 'syntactic validity from 65.58% to 74.72% (+9.14 absolute, +13.9% relative).' These provide both absolute and relative effect sizes with clear baseline context."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is given for the number of samples generated per problem (20 for HumanEval, 10 for MBPP) or for the benchmark sizes. No power analysis or discussion of whether these sample sizes are sufficient to detect meaningful differences."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. It is not stated whether results are from single runs or averaged over multiple seeds. All tables show single-point estimates only."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Table 1 compares AnCoder against MDLM (494M and 809M), the AR baseline Qwen2.5-Coder-0.5B, and ablated anchoring variants (Null, Keyword, Identifier). This provides both prior-work and ablation baselines."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper compares against MDLM [38] (2024) and Qwen2.5-Coder-0.5B, but omits comparison with other recent diffusion-based code generation models mentioned in the related work: DiffuCoder [16], Dream-Coder [48], CoDA [9], Mercury [22], and Gemini Diffusion [13]. These are all from 2025 and represent the state of the art for DLM code generation."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section 5.3 and Figure 6/Table 5 present a systematic ablation of the anchor supervision weight γ across three anchoring strategies (AnchorTree, Keyword, Identifier). Additionally, Table 1 effectively ablates anchoring strategy choice (Null vs. Keyword vs. Identifier vs. AnchorTree)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper reports two metrics: syntactic validity (Syntax %) and functional correctness (Pass@1 %) across all experiments (Tables 1, 2, 3, 5)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of generated code quality is included. Evaluation is entirely automated via syntactic parsing and unit test execution. Given claims about code quality and structural correctness, human evaluation of readability and maintainability would be relevant."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "HumanEval and MBPP are standard held-out benchmarks with predefined test splits. The paper also separates training/validation sets for its own training stages (e.g., 99%-1% split for mid-training, 95%-5% for SFT)."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "Only aggregate numbers (overall Syntax% and Pass@1%) are reported for HumanEval and MBPP. No per-problem, per-difficulty, or per-category breakdowns are provided. Qualitative examples in Appendix A.4 show individual cases but do not constitute systematic per-category analysis."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Appendix A.4 provides detailed qualitative analysis of failure cases, showing how MDLM and weaker anchoring strategies produce syntactically valid but semantically incorrect code, with specific examples from HumanEval and MBPP."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper honestly reports that 'all diffusion models fall short of the AR baseline in terms of syntactic validity and pass@1 on both benchmarks' (Section 5.1). The ablation also shows that performance can degrade with certain γ values (Table 5)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims that AnCoder is 'a family of models showing that structurally anchored diffusion offers a parameter-efficient path to high-quality code generation.' The results in Table 1 support this: AnCoder (809M) outperforms MDLM baselines. The 'parameter-efficient' claim is supported by the 809M model outperforming a comparable 809M MDLM. The abstract does not overclaim relative to AR models."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper makes causal claims about AnchorTree improving code generation quality. These are supported by controlled ablation: all 809M models use the same architecture, training recipe, and data; the only variable is the anchoring strategy. Section 4.2 also provides a controlled unmasking experiment (Figure 5) demonstrating the causal effect of AST ancestry on prediction quality."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper tests only on Python code (HumanEval and MBPP are Python-only benchmarks) and a single model scale (809M parameters), but the title and conclusion use broad language like 'Anchored Code Generation' and 'high-quality code generation' without restricting these claims to Python at 809M scale. Appendix A.5 acknowledges 'AnchorTree is explicitly focused on code hierarchies extracted from the abstract syntax tree' but does not bound the claims to Python specifically."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for the observed improvements. For example, the remask rate (0.1 for AnCoder vs. 0.0 for MDLM) is a confound that is not adequately controlled. Section 5.1 notes this difference but does not disentangle the contribution of remasking from anchoring. No threats-to-validity or confound analysis is provided."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper specifies Qwen2.5-Coder-0.5B [21] as the initialization model (Section 5, 'Architecture'). Since the authors train their own model starting from this initialization, the base model is adequately specified with its full name and citation."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "The paper does not use LLM prompting in the traditional sense. AnCoder is a diffusion model that takes function signatures/docstrings as conditioning inputs, which are from the standard benchmarks (HumanEval, MBPP). Prompting in the API/chat sense is not applicable."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 5 reports: learning rate 10^-4, batch size 512, cosine LR scheduler, warmup ratio 0.01, cosine noise schedule, sampling temperature 0.8, remask rate 0.1, context length L=1024, denoising steps T in {256, 512, 1024, 2048}. Anchor weight γ values are specified per strategy. Training steps per stage are also listed."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. AnCoder is a diffusion language model evaluated on benchmark code generation tasks without any agent scaffold, tool use, or iterative prompting."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix A.1 documents the three-stage training pipeline with specific datasets, sample counts, and token counts for each stage (Table 4). The SFT stage filtering to Python-only is described. The AST-based anchor labeling preprocessing is described in Sections 4.1 and 4.3. Context length truncation/padding to L=1024 is stated."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Appendix A.5 is titled 'Limitations' and provides substantive discussion of limitations. Section 6 (Discussion) also discusses design space limitations and alternative approaches."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "The limitations in Appendix A.5 are relatively generic: 'the development of anchored diffusion models is still nascent, and there exists a large search space of architecture design' and 'AnchorTree is explicitly focused on code hierarchies extracted from the abstract syntax tree, which may limit its ability to reason about dynamic execution behavior.' These do not address specific threats to the validity of the reported results (e.g., the remask rate confound, single-run results, Python-only evaluation, HumanEval contamination)."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound its claims to Python, to 809M scale, or to the specific benchmarks used. The limitations section discusses future directions but not specific scope exclusions."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw experimental data (generated code samples, per-problem pass/fail results, training logs) is made available. Only aggregate metrics in tables are reported."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The data collection/generation procedure is well-described: benchmarks are HumanEval (164 problems) and MBPP (974 problems), with 20 and 10 samples per problem respectively. Training datasets are documented in Appendix A.1 with sources, sizes, and domains."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are involved. The data sources are standard public benchmarks and public code corpora."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The full training pipeline is documented across three stages (pretraining, mid-training, SFT) with dataset sizes, training steps, and splits at each stage (Appendix A.1). The AST-based anchor labeling pipeline is described in Sections 4.1 and 4.3."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The Acknowledgments section states: 'This research has been supported by NSF Grants 2019844 and 2112471, the UT Austin Machine Learning Lab, and computing support on the Vista GPU Cluster through the Center for Generative AI (CGAI) and the Texas Advanced Computing Center (TACC) at UT Austin.'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All authors are listed as affiliated with The University of Texas at Austin. The paper does not evaluate any commercial product from UT Austin, so there is no obvious undisclosed conflict."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Funding is from NSF and UT Austin computing resources. Neither has a financial stake in whether AnchorTree outperforms baselines."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is included in the paper. Absence of disclosure is not the same as absence of conflict."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The model is initialized from Qwen2.5-Coder-0.5B, but no training data cutoff date for Qwen is stated. The paper's own training data sources are described but without temporal cutoff information relative to the benchmark creation dates."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of potential overlap between training data and HumanEval/MBPP test problems. The training data comes from OpenCoder's web-scale code corpus which could plausibly contain HumanEval/MBPP solutions."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "HumanEval and MBPP were both published in 2021, whereas Qwen2.5-Coder was released in 2024 and the OpenCoder data is web-scale, so the benchmarks predate the initialization model by roughly three years. Both benchmarks' solutions are widely available online. The paper does not discuss contamination risk at all, despite this being a well-known issue for these benchmarks."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants are involved in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost, latency, or wall-clock time is reported. The paper studies scaling denoising steps T from 256 to 2048 but does not report the time or compute cost associated with each setting, which is critical for practical deployment of diffusion models for code generation."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total GPU hours, training time, or compute budget is stated. The paper mentions using the Vista GPU Cluster and TACC but does not quantify the computational resources consumed across three training stages totaling ~418K optimization steps."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "AnCoder with AnchorTree improves Pass@1 on HumanEval from 3.29% to 5.45% (+65.7% relative) over 809M MDLM baseline.",
    286       "evidence": "Table 1, Section 5.1. Direct comparison of 809M models with identical architecture, training recipe, and data.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "AnCoder with AnchorTree improves Pass@1 on MBPP from 6.34% to 9.10% (+43.5% relative) over 809M MDLM baseline.",
    291       "evidence": "Table 1, Section 5.1. Same controlled comparison setup.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Soft hierarchical anchoring (AnchorTree) outperforms hard anchoring (Keyword-only, Identifier-only) strategies.",
    296       "evidence": "Table 1 shows AnchorTree > Keyword > Identifier on Pass@1 for both benchmarks. Ablation in Table 5/Figure 6 confirms the pattern.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Unmasking AST ancestors of a target position improves prediction quality more than unmasking random positions.",
    301       "evidence": "Figure 5, Section 4.2. Controlled experiment on pretrained DLM with 2000 test samples showing in-out AST unmasking outperforms random and out-in orderings.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Anchored models benefit more from increased denoising steps (test-time compute scaling) than non-anchored models.",
    306       "evidence": "Tables 2 and 3 show steeper performance improvement curves for anchored models as T increases from 256 to 2048.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "All diffusion models fall short of the autoregressive baseline on both benchmarks.",
    311       "evidence": "Table 1 shows AR (494M) achieves 26.83/36.55 Pass@1 on HumanEval/MBPP vs. AnCoder AnchorTree (809M) at 5.45/9.10.",
    312       "supported": "strong"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval"],
    316   "key_findings": "AnCoder introduces AnchorTree, a soft anchoring framework that uses abstract syntax tree hierarchy to guide discrete diffusion language models for code generation. On HumanEval and MBPP, AnCoder with AnchorTree improves Pass@1 by 65.7% and 43.5% relative over MDLM baselines at 809M parameters, while also improving syntactic validity. However, all diffusion models still substantially underperform a 494M autoregressive baseline (Qwen2.5-Coder-0.5B), with AnCoder achieving 5.45% vs. 26.83% Pass@1 on HumanEval. The paper demonstrates that AST-based hierarchical information is useful for diffusion denoising through a controlled unmasking experiment.",
    317   "red_flags": [
    318     {
    319       "flag": "Remask rate confound",
    320       "detail": "AnCoder uses a remask rate of 0.1 while the MDLM baseline uses 0.0. Section 5 states this explicitly but does not adequately disentangle the contribution of remasking from anchoring. The 'Null' anchoring variant (no anchoring, but with remasking) already shows substantial improvement over MDLM, suggesting remasking may account for much of the gain."
    321     },
    322     {
    323       "flag": "No error bars or variance reporting",
    324       "detail": "All results are reported as single point estimates. For Pass@1 values in the 3-9% range, the differences between methods (often 1-3 percentage points) could plausibly be within random variation, especially without knowing whether results are from single or multiple runs."
    325     },
    326     {
    327       "flag": "Missing contemporary DLM baselines",
    328       "detail": "The paper discusses DiffuCoder, Dream-Coder, CoDA, Mercury, and Gemini Diffusion in the related work but only compares against MDLM. These contemporary methods are directly relevant competitors that are omitted from the empirical evaluation."
    329     },
    330     {
    331       "flag": "Benchmark contamination risk unaddressed",
    332       "detail": "HumanEval and MBPP solutions are widely available online. The model is initialized from Qwen2.5-Coder (trained on web-scale data) and further trained on OpenCoder's web-scale corpus. No contamination analysis is performed."
    333     },
    334     {
    335       "flag": "Absolute performance levels are very low",
    336       "detail": "The best diffusion model achieves 5.45% Pass@1 on HumanEval vs. 26.83% for the AR baseline. While the relative improvement from anchoring is notable, the absolute performance level raises questions about practical relevance. The paper's framing emphasizes relative gains while the AR gap is enormous."
    337     }
    338   ],
    339   "cited_papers": [
    340     {
    341       "title": "Evaluating large language models trained on code",
    342       "authors": ["Mark Chen"],
    343       "year": 2021,
    344       "arxiv_id": "2107.03374",
    345       "relevance": "Introduces HumanEval benchmark, a foundational evaluation tool for LLM code generation widely used across the field."
    346     },
    347     {
    348       "title": "Program synthesis with large language models",
    349       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    350       "year": 2021,
    351       "arxiv_id": "2108.07732",
    352       "relevance": "Introduces MBPP benchmark for evaluating code generation capabilities of language models."
    353     },
    354     {
    355       "title": "DiffuCoder: Understanding and improving masked diffusion models for code generation",
    356       "authors": ["Shansan Gong", "Ruixiang Zhang", "Huangjie Zheng"],
    357       "year": 2025,
    358       "arxiv_id": "2506.20639",
    359       "relevance": "Contemporary diffusion-based code generation model that improves masked diffusion for code, directly relevant as an alternative approach."
    360     },
    361     {
    362       "title": "Beyond autoregression: An empirical study of diffusion large language models for code generation",
    363       "authors": ["Chengze Li", "Yitong Zhang", "Jia Li"],
    364       "year": 2025,
    365       "arxiv_id": "2509.11252",
    366       "relevance": "Empirical comparison of diffusion vs. autoregressive LLMs for code generation, establishing DLM capabilities and limitations."
    367     },
    368     {
    369       "title": "Mercury: Ultra-fast language models based on diffusion",
    370       "authors": ["Samar Khanna", "Siddhant Kharbanda", "Shufan Li"],
    371       "year": 2025,
    372       "arxiv_id": "2506.17298",
    373       "relevance": "Diffusion-based language model focusing on speed, relevant to understanding the DLM-for-code landscape."
    374     },
    375     {
    376       "title": "Qwen2.5-coder technical report",
    377       "authors": ["Binyuan Hui", "Jian Yang", "Zeyu Cui"],
    378       "year": 2024,
    379       "arxiv_id": "2409.12186",
    380       "relevance": "Technical report for the autoregressive model used as AnCoder's initialization backbone and primary AR baseline."
    381     },
    382     {
    383       "title": "Simple and effective masked diffusion language models",
    384       "authors": ["Subham Sahoo", "Marianne Arriola", "Yair Schiff"],
    385       "year": 2024,
    386       "relevance": "MDLM is the primary baseline method in this paper; foundational work on masked discrete diffusion for language."
    387     },
    388     {
    389       "title": "Large language diffusion models",
    390       "authors": ["Shen Nie", "Fengqi Zhu", "Zebin You"],
    391       "year": 2025,
    392       "arxiv_id": "2502.09992",
    393       "relevance": "Major work on scaling diffusion language models, relevant to understanding DLM capabilities and the AR-DLM performance gap."
    394     },
    395     {
    396       "title": "Constrained decoding of diffusion LLMs with context-free grammars",
    397       "authors": ["Niels Mündler", "Jasper Dekoninck", "Martin Vechev"],
    398       "year": 2025,
    399       "arxiv_id": "2508.10111",
    400       "relevance": "Complementary approach to ensuring syntactically valid code from diffusion models via constrained decoding."
    401     },
    402     {
    403       "title": "Anchored diffusion language model",
    404       "authors": ["Litu Rout", "Constantine Caramanis", "Sanjay Shakkottai"],
    405       "year": 2025,
    406       "relevance": "Foundational work introducing anchored diffusion for language models, directly extended by this paper to code generation."
    407     },
    408     {
    409       "title": "TreeDiff: AST-guided code generation with diffusion LLMs",
    410       "authors": ["Yiming Zeng", "Jinghan Cao", "Zexin Li"],
    411       "year": 2025,
    412       "arxiv_id": "2508.01473",
    413       "relevance": "Most closely related work: also uses AST structure for diffusion code generation but modifies the forward noising schedule rather than denoising."
    414     },
    415     {
    416       "title": "Dream-coder 7b: An open diffusion language model for code",
    417       "authors": ["Zhihui Xie", "Jiacheng Ye", "Lin Zheng"],
    418       "year": 2025,
    419       "arxiv_id": "2509.01142",
    420       "relevance": "Open diffusion language model for code at larger scale (7B), relevant as a contemporary DLM code generation baseline."
    421     }
    422   ]
    423 }

Impressum · Datenschutz