ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28215B)


      1 {
      2   "paper": {
      3     "title": "REFINESTAT: Efficient Exploration for Probabilistic Program Synthesis",
      4     "authors": ["Madhav Kanda", "Shubham Ugare", "Sasa Misailovic"],
      5     "year": 2025,
      6     "venue": "ICLR 2026",
      7     "arxiv_id": "2509.01082",
      8     "doi": "10.48550/arXiv.2509.01082"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "REFINESTAT, a framework combining semantic constrained decoding with diagnostic-aware refinement, enables small language models (≤8B parameters) to generate statistically reliable probabilistic programs. It achieves ~40 percentage point improvement in run rates over unconstrained generation and ~30 points over syntax-only constraining. On several datasets, REFINESTAT with a 7B model matches or surpasses GPT-4 and OpenAI o3 on ELPD-LOO scores, demonstrating that constrained decoding can close the gap between small open-weight models and large closed-source models for probabilistic program synthesis.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Section 9 (Reproducibility Statement): 'We provide the source code of REFINESTAT as part of the supplementary material that can be used to reproduce our results, and we will also release it as open source.'"
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The datasets are from Stan PosteriorDB (Magnusson et al., 2024), a publicly available benchmark. The paper cites it explicitly and uses standard datasets from it."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Appendix E specifies hardware (48-core Intel Xeon Silver 4214R, 2x NVIDIA RTX A5000), framework (PyTorch, Itergen library), and inference setup. Specific library versions are not enumerated in a requirements file in the text, but the implementation details are provided."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper mentions source code in supplementary material but does not provide step-by-step reproduction instructions in the paper text. Prompts are provided in Appendix D, but no explicit README-style reproduction guide is described."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Tables 2, 3, and 9 all report mean ± standard deviation for all metrics across experimental runs."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims REFINESTAT 'significantly improves' over baselines but provides no statistical significance tests (no p-values, no t-tests, no bootstrap tests). Comparisons are based solely on comparing mean ± std values."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Table 1 reports absolute run rate differences (e.g., '40 percentage points higher'). Tables 2 and 3 provide absolute metric values for both systems, enabling effect size assessment. Section 5.1 quantifies improvements as '+40pp' and '+30pp'."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper uses 5 datasets and 10 seeds but does not justify why these numbers are sufficient. No power analysis is provided. The choice of 5 runs for comparison in Section 5.2 is justified only by token budget matching, not statistical power."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Tables 2, 3, 6, 8, and 9 all report standard deviation across runs. Section 5.2 states 'We repeat this process five times to compute the mean and standard deviation for all metrics.'"
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Multiple baselines: Standard (unconstrained generation), Syncode (syntax-only constraining), BoxLM (GPT-4-based), OpenAI o3, and Expert Stan programs from PosteriorDB."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "BoxLM (Li et al., 2024), Syncode (Ugare et al., 2024c), and OpenAI o3 are all recent. The expert Stan programs from PosteriorDB serve as gold-standard references."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section 5.4, Table 4: systematic ablation removing each semantic validation component one at a time, measuring impact on run rate. Each component's contribution is quantified."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Seven diagnostic metrics from the Bayesian Workflow Reliability Score (R-hat, ESS bulk, ESS tail, divergences, BFMI, Pareto k, ELPD-LOO), plus run rate and token efficiency."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is not relevant here — the quality of probabilistic programs is assessed via well-established statistical diagnostics (convergence, predictive accuracy), which are more rigorous than human judgment for this domain."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "ELPD-LOO (leave-one-out cross-validation) is inherently a held-out evaluation metric. Each data point is predicted from a model fit to all other points, providing out-of-sample assessment."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 2 provides per-dataset, per-model breakdowns for all diagnostic metrics. Tables 1, 3, 5, 6 similarly break down by dataset and/or model."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Appendix F provides detailed error analysis: search space limitations, model misfit and sampling failures, call-level hallucinations, and termination failures. Section 5.2 discusses cases where Standard achieves higher ELPD but with unreliable diagnostics."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Qwen-Coder-7B fails on GP dataset even with REFINESTAT (Table 2, marked ✗). The Dugongs ELPD is lower than BoxLM (Table 3). Section 5.2 acknowledges cases where Standard baseline achieves higher ELPD-LOO."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims that REFINESTAT produces programs 'often matching or surpassing those from closed-source large language models (e.g., OpenAI o3)' are supported by Table 3 (Peregrine and GP better than o3, others comparable). The 'syntactically sound and statistically reliable' claim is supported by Tables 1-2."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The ablation study (Table 4) supports causal claims about individual components' contributions via controlled single-variable manipulation. The claim that semantic constraining causes improved run rates is supported by systematic removal of each constraint."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper tests on only 5 datasets from PosteriorDB and 4 SLMs, all ≤8B parameters. The title claims 'Probabilistic Program Synthesis' broadly, but results are limited to PyMC/NumPyro on a small set of well-known Bayesian models. Section 5.5 shows NumPyro generalizability but scope is still narrow."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 5.4 addresses the alternative explanation that results could be due to memorization, with controlled experiments using anonymized prompts and syntactic obfuscation. Section 5.2 discusses the alternative interpretation when Standard achieves higher ELPD (unreliable sampling)."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper carefully defines what it measures (Bayesian Workflow Reliability Score, ELPD-LOO) and what these metrics represent (convergence quality, out-of-sample predictive accuracy). Definition 2 explicitly formulates the reliability score. The paper does not overclaim beyond these metrics."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 4 specifies exact models: Llama3-8B, CodeGemma-7B, Qwen2.5-Coder-7B, DeepSeek-R1-Distill-Qwen-7B. These are specific model identifiers with parameter counts, not vague names."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Appendix D provides the full prompt template and all 5 dataset-specific prompts with exact code and descriptions. The actual text sent to models is fully reproducible."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix E reports: convergence threshold αR=1.05, ESSbulk≥400, ESStail≥100, Pareto threshold Lcd=0.7, ε=0.2, β=4, α=2, Rmax=100, temperature range 0.2-0.4, 10 seeds, MCMC settings (1000 samples, 1000 tune, 4 chains)."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The REFINESTAT framework is described in detail in Section 3: Algorithm 1 specifies the full synthesis loop, Section 3.1 describes constrained decoding with 6 validity predicates, Section 3.2 describes the refinement loop with likelihood and prior resampling. Figure 1 provides a workflow diagram."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The datasets are used directly from PosteriorDB with exact data values provided in Appendix D. The prompts show exactly how data is provided to the model. No preprocessing is applied beyond formatting into the prompt template."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7 'Conclusion and Limitations' contains a dedicated limitations paragraph discussing missing prior-predictive/posterior-predictive checks, partial diagnostic coverage, and lack of global convergence guarantee."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The limitations are specific to this work: 'it does not include prior-predictive or posterior-predictive checks, which often require manual inspection and domain-specific judgment', 'the reported ELPD only partially reflect model adequacy in some cases', 'refinement strategy...does not guarantee convergence to globally optimal program.'"
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "While limitations mention what the framework doesn't include (predictive checks, convergence guarantee), the paper does not explicitly state what settings the results do NOT generalize to — e.g., larger models, more complex Bayesian models, other PPLs beyond PyMC/NumPyro, or real-world modeling problems beyond PosteriorDB."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The datasets are from PosteriorDB, a publicly available benchmark. All data values are printed in full in Appendix D. Source code is provided for reproduction."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 4 describes dataset selection: '5 benchmark datasets from Stan PosteriorDB (Magnusson et al., 2024), mirroring the selection in prior research on automated statistical modeling (Li et al., 2024).' Each dataset is described with its domain and characteristics."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data comes from standard public benchmarks (PosteriorDB)."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline is clear: prompt with dataset → LLM generation (constrained/unconstrained) → MCMC inference → diagnostic extraction → model comparison. Algorithm 1 formalizes this. Appendix E provides experimental setup details."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Section 8 (Acknowledgments): 'This research was supported in part by NSF Grants No. CCF-1846354 and CCF-2313028.'"
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors are from University of Illinois Urbana-Champaign. No commercial affiliations. The paper does not evaluate a commercial product from the authors' employer."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "NSF is a government funding agency independent of the outcome. No commercial funder with a stake in the results."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is provided. Two authors (Ugare and Misailovic) co-authored Syncode and IterGen, which are used as baselines and building blocks — this potential conflict is not disclosed."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper does not state the training data cutoff dates for any of the 4 SLMs used (Llama3-8B, CodeGemma-7B, Qwen2.5-Coder-7B, DQ-7B). PosteriorDB datasets and their Stan programs may be in the training data."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "Section 5.4 addresses memorization explicitly with controlled experiments: anonymized prompts and syntactic obfuscation to test whether models reproduce memorized solutions. They acknowledge 'measuring the memorization effect is still an open problem.'"
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "Section 5.4 directly addresses whether PosteriorDB programs could be memorized: only Eight Schools has a PyMC program in PosteriorDB (outdated version), remaining programs are in Stan. Modified prompts produce comparable results, providing evidence against pure memorization."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Appendix G.1, Table 8 reports token consumption per method per dataset, showing REFINESTAT uses ~1.9x tokens on average compared to baseline. This serves as a cost proxy for local inference."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Appendix E specifies hardware: '48-core Intel Xeon Silver 4214R CPU with 2 NVIDIA RTX A5000 GPUs.' Token budgets are reported in Table 8. Rmax=100 iterations is stated as the compute budget."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Appendix E: 'We run all experiments for 10 seeds to reduce result randomness.' Tables report mean ± std across these runs, showing seed sensitivity."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Appendix E states '10 seeds'. Section 5.2 states '5 times' for the baseline comparison with best-of-5 protocol. Section 5.4 ablation uses '10 different random seeds'."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The paper states hyperparameter values (β=4, α=2, Rmax=100) were chosen 'based on preliminary experiments' (Appendix E) but does not report how many configurations were tried or the search methodology."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "Temperature range 0.2-0.4 is used but the paper does not explain how this range was selected. The best program is selected by ELPD-LOO (Algorithm 1), which is well-defined, but overall configuration selection is not justified."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The paper makes comparisons across 4 models × 5 datasets × multiple metrics without any statistical tests, let alone multiple comparison corrections."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors evaluate REFINESTAT against baselines including Syncode, which was created by the same group (Ugare et al., 2024c). REFINESTAT also builds on IterGen (Ugare et al., 2024a) from the same authors. This self-comparison bias is not acknowledged."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Section 5.2 explicitly addresses compute fairness: 'Since we observed that REFINESTAT consumes almost twice the number of tokens used by the Baseline, we run baseline models five times with different seeds (2.5× tokens more than REFINESTAT).' Table 8 in Appendix G.1 provides token ratios."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper uses 5 PosteriorDB datasets without discussing whether these small, well-studied problems are representative of real-world probabilistic programming challenges. No analysis of whether success on these tasks predicts success on more complex models."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "The paper evaluates its own framework (REFINESTAT) as an integrated system. The scaffold IS the contribution being tested, not a confound. Comparisons use the same base models with different generation strategies."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "PosteriorDB datasets (Eight Schools from 1981, Dugongs, etc.) and their Stan reference programs have been publicly available long before any of the tested models were trained. The paper does not discuss temporal leakage despite models likely having seen these classic problems."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
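    348         "justification": "The prompts include dataset descriptions (e.g., 'A hierarchical model for the 8-schools data') which could cue memorized solutions. Section 5.4's anonymized-prompt experiment removes dataset names and metadata, which indirectly probes this, but the paper does not discuss prompt descriptions as a potential leakage pathway."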
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the 5 PosteriorDB datasets are independent or whether performance on one predicts performance on another. The datasets vary in complexity but potential correlations are not addressed."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": true,
    358         "justification": "Section 5.4 applies concrete detection methods: anonymized prompts (removing dataset names/metadata) and syntactic obfuscation (exponential notation for numbers) to test whether the model is reproducing memorized solutions rather than synthesizing new ones."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "REFINESTAT achieves ~40 percentage point higher run rates than unconstrained generation and ~30 points higher than Syncode.",
    365       "evidence": "Table 1 shows run rates across temperatures: REFINESTAT 45-50% vs Standard 10-11% vs Syncode 21%. Section 5.1.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "REFINESTAT with DQ-7B matches or surpasses GPT-4 (BoxLM) and OpenAI o3 on ELPD-LOO for several datasets.",
    370       "evidence": "Table 3 shows REFINESTAT outperforms on Peregrine (-114.29 vs -173.11 BoxLM) and GP (-23.39 vs -34.95 o3), matches on Eight Schools, but underperforms on Dugongs (8.35 vs 23.40 BoxLM).",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Parameter validity is the most critical semantic validation component, with removal causing a 14.5 percentage point drop in run rate.",
    375       "evidence": "Table 4 ablation study: removing parameter validity drops run rate from 50% to 35.5%. Other components contribute 5.5-10 percentage points each.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "REFINESTAT's effectiveness is not primarily due to memorization.",
    380       "evidence": "Section 5.4, Table 9: anonymized prompts and syntactically obfuscated inputs produce comparable reliability, convergence, and predictive metrics. However, authors note 'these studies are limited in scope' and memorization measurement 'is still an open problem.'",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "REFINESTAT generalizes across probabilistic programming backends (PyMC and NumPyro).",
    385       "evidence": "Section 5.5, Tables 5-6: NumPyro run rates double with REFINESTAT, and diagnostics improve across all datasets. GP task succeeds only with REFINESTAT.",
    386       "supported": "moderate"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "Very small benchmark suite",
    392       "detail": "Only 5 datasets are used, all from PosteriorDB, all relatively simple well-known Bayesian models. The generalizability claims ('probabilistic program synthesis') substantially exceed what this narrow benchmark can support."
    393     },
    394     {
    395       "flag": "Self-comparison with own prior work",
    396       "detail": "Two of the three authors created Syncode (Ugare et al., 2024c) and IterGen (Ugare et al., 2024a), which serve as both baselines and building blocks for REFINESTAT. This creates a dual conflict: incentive to make the baseline appear weak and the extension appear strong. This is not disclosed."
    397     },
    398     {
    399       "flag": "Claims of statistical significance without tests",
    400       "detail": "The paper repeatedly uses language like 'significantly improves' (Sections 1, 5.1, 5.2) but provides no statistical significance tests. Comparisons are based on comparing means with overlapping standard deviations."
    401     },
    402     {
    403       "flag": "BoxLM comparison uses reported numbers",
    404       "detail": "Section 5.3 states 'Since the code for BoxLM is not publicly available, we rely on the reported numbers from their paper for comparison.' This prevents fair comparison under identical conditions."
    405     }
    406   ],
    407   "cited_papers": [
    408     {
    409       "title": "Automated statistical model discovery with language models",
    410       "authors": ["Michael Y. Li", "Emily B. Fox", "Noah D. Goodman"],
    411       "year": 2024,
    412       "arxiv_id": "2402.17879",
    413       "relevance": "Direct predecessor using GPT-4 for probabilistic program synthesis; REFINESTAT's primary comparison target."
    414     },
    415     {
    416       "title": "SynCode: LLM generation with grammar augmentation",
    417       "authors": ["Shubham Ugare", "Tarun Suresh", "Hangoo Kang", "Sasa Misailovic", "Gagandeep Singh"],
    418       "year": 2024,
    419       "arxiv_id": "2403.01632",
    420       "relevance": "Grammar-constrained LLM decoding framework that REFINESTAT builds upon; key baseline for syntactic-only constraining."
    421     },
    422     {
    423       "title": "IterGen: Iterative structured LLM generation",
    424       "authors": ["Shubham Ugare", "Rohan Gumaste", "Tarun Suresh", "Gagandeep Singh", "Sasa Misailovic"],
    425       "year": 2025,
    426       "arxiv_id": "2410.07295",
    427       "relevance": "Iterative error-driven backtracking for LLM code generation; REFINESTAT's implementation infrastructure."
    428     },
    429     {
    430       "title": "The Llama 3 herd of models",
    431       "authors": ["Aaron Grattafiori et al."],
    432       "year": 2024,
    433       "arxiv_id": "2407.21783",
    434       "relevance": "One of the primary SLMs evaluated in REFINESTAT experiments."
    435     },
    436     {
    437       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    438       "authors": ["Daya Guo et al."],
    439       "year": 2025,
    440       "arxiv_id": "2501.12948",
    441       "relevance": "Source of DeepSeek-R1-Distill-Qwen-7B, the best-performing SLM in REFINESTAT evaluation."
    442     },
    443     {
    444       "title": "Qwen2.5-coder technical report",
    445       "authors": ["Binyuan Hui et al."],
    446       "year": 2024,
    447       "arxiv_id": "2409.12186",
    448       "relevance": "Code-specialized LLM evaluated in REFINESTAT; relevant to AI code generation capabilities."
    449     },
    450     {
    451       "title": "Generalization or memorization: Data contamination and trustworthy evaluation for large language models",
    452       "authors": ["Yihong Dong et al."],
    453       "year": 2024,
    454       "arxiv_id": "2402.15938",
    455       "relevance": "Addresses data contamination concerns in LLM evaluation, relevant to benchmark validity."
    456     },
    457     {
    458       "title": "Type-constrained code generation with language models",
    459       "authors": ["Niels Mündler", "Jingxuan He", "Hao Wang", "Koushik Sen", "Dawn Song", "Martin Vechev"],
    460       "year": 2025,
    461       "relevance": "Type-constrained LLM code generation approach; relevant to constrained decoding for code synthesis."
    462     },
    463     {
    464       "title": "Demystifying memorization in LLM-based program repair via a general hypothesis testing framework",
    465       "authors": ["Jiaolong Kong", "Xiaofei Xie", "Shangqing Liu"],
    466       "year": 2025,
    467       "relevance": "Proposes code mutation to detect memorization in LLM program synthesis; inspired REFINESTAT's memorization tests."
    468     },
    469     {
    470       "title": "posteriordb: Testing, benchmarking and developing Bayesian inference algorithms",
    471       "authors": ["Måns Magnusson", "Jakob Torgander", "Paul-Christian Bürkner", "Lu Zhang", "Bob Carpenter", "Aki Vehtari"],
    472       "year": 2024,
    473       "arxiv_id": "2407.04967",
    474       "relevance": "The benchmark suite used for all REFINESTAT evaluations; relevant to AI evaluation methodology."
    475     }
    476   ]
    477 }

Impressum · Datenschutz