ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27084B)


      1 {
      2   "paper": {
      3     "title": "Bottom-up Domain-specific Superintelligence: A Reliable Knowledge Graph is What We Need",
      4     "authors": [
      5       "Bhishma Dedhia",
      6       "Yuval Kansal",
      7       "Niraj K. Jha"
      8     ],
      9     "year": 2025,
     10     "venue": "Preprint (arXiv)",
     11     "arxiv_id": "2507.13966"
     12   },
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The paper provides a project website URL (https://kg-bottom-up-superintelligence.github.io/) on the title page. While the paper itself does not contain a direct GitHub link in the body text, this project page serves as the artifact portal. The paper does not explicitly state code is released but provides this URL as the project resource."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The paper describes curating 24,000 QA tasks and the ICD-Bench evaluation suite (3,675 items), but does not provide a download link for the dataset. The UMLS KG used is a publicly available resource, but the curated curriculum and ICD-Bench data are not explicitly released with a URL."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "Appendix C describes the SFT setup (8×H100 GPUs, LoRA rank=16, learning rate schedule, optimizer settings) but does not provide a requirements.txt, Dockerfile, or detailed library version specifications needed to recreate the environment."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper describes the pipeline methodology in detail (Sections 3.1, 3.2, Algorithm 1 in Appendix A.3) and the SFT setup (Appendix C), but does not provide step-by-step reproduction instructions, README with commands, or scripts to replicate the experiments."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "The paper reports 'Bootstrapped confidence intervals over 500 samples were < 0.75%' for ICD-Bench results (Section 6.1, Fig. 6 caption, Fig. 7 caption)."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper claims curriculum-tuned models 'significantly outperform' baselines across multiple sections (Sections 6.1, 6.2), but no formal statistical significance tests (p-values, t-tests, etc.) are reported. Claims of difference are based on comparing accuracy numbers."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper reports effect sizes in context, e.g., 'outperform all open-source baselines across inference budgets by 10-20%' (Section 6.2, O2.1), and provides baseline context in figures and tables (Table 1, Fig. 7) showing absolute accuracy values and improvements."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The training dataset size (24,000 tasks) and ICD-Bench size (3,675 items) are stated but not justified with a power analysis or explanation of why these specific sizes were chosen."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The paper reports bootstrapped confidence intervals for ICD-Bench results but does not report variance or standard deviation across experimental runs. The SFT models appear to be trained once (single run), and no multi-seed results are presented."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Section 6.2 compares against four baselines: QwQ-32B base model, DeepSeek-R1-Distilled Qwen, o3, and Gemini-2.5-Pro. Table 1 adds comparisons against MedGemma, Meerkat, Qwen3, and Sky-T1."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Baselines include o3 (OpenAI, 2025), Gemini-2.5-Pro (Google, 2025), DeepSeek-R1 (2025), Qwen3 (2025), and Meerkat (2025). These are contemporary state-of-the-art reasoning models."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Section 6.4 presents detailed ablations disentangling the effects of curriculum depth (1-hop vs. 3-hop), complexity sampling, and diversity sampling on model performance. Three model variants (QwQ-Med-1, -2, -3) also serve as ablations of increasing curriculum complexity."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper uses accuracy across 15 ICD-Bench categories, per-difficulty-bin accuracy (Section 6.3), recall vs. reasoning accuracy analysis (Section 6.5), and evaluation on 4 external benchmarks (MedQA, PubMedQA, MedMCQA, MMLU-Med in Table 1)."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No human evaluation of model outputs is conducted. All evaluation is automated via multiple-choice accuracy. For a paper claiming 'superintelligence' in medical reasoning, human expert evaluation of the model's reasoning traces would strengthen the claims."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "ICD-Bench is explicitly separated from training data via decontamination (Section 5.1): exact KG path overlap is excluded and an 18-gram text overlap filter is applied. ICD-Bench uses 4- and 5-hop paths while training uses 1-3 hop paths. External benchmarks (MedQA, etc.) are independent."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down across all 15 ICD-Bench categories (Fig. 1, Fig. 7), across 5 difficulty bins (Fig. 8), and across hop lengths (Fig. 10). This is a strength of the evaluation design."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 6.5 discusses failure modes: 'even curriculum-tuned models struggle to reason correctly due to insufficient recall' in certain categories like Drugs and Mediators. Appendix D provides qualitative examples comparing base model failures (Examples 5-7) with detailed analysis of reasoning-recall gaps."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 6.4 reports that 'using only three-hop chains slightly degrades performance' on easy tasks (O4.2), and Section 6.6 notes QwQ-Med-3 scores lower than the base QwQ on MedQA (82.72 vs. 85.62), acknowledging the trade-off."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims QwQ-Med-3 'significantly outperforms state-of-the-art open-source and proprietary reasoning models on all categories of ICD-Bench,' which is supported by Fig. 1, Fig. 7, and Section 6.2. The claim about transfer to external benchmarks is supported by Table 1. The claim about performance gap widening on hardest tasks is supported by Fig. 8."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The paper makes causal claims about curriculum tuning improving performance, which are supported by controlled ablation studies (Section 6.4) with single-variable manipulations (depth, complexity sampling, diversity sampling) under fixed FLOPs budgets. The three progressive models (Med-1, -2, -3) provide controlled comparisons."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper uses the term 'superintelligence' in the title and throughout, but results are limited to multiple-choice medical QA tasks derived from a single KG (UMLS). The Limitations section (Section 8) acknowledges constraints ('generalizability to other domains... remains to be fully validated' and 'we limit our focus to generating closed-ended multiple choice question tasks'), but the title 'Bottom-up Domain-specific Superintelligence' significantly overstates what closed-ended MCQ performance demonstrates."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper does not substantively discuss alternative explanations for the performance gains. For instance, it does not consider whether the gains could be attributed to data augmentation effects, exposure to medical terminology, or the specific characteristics of MCQ format rather than 'domain-specific superintelligence.' The Limitations section discusses scope constraints but not alternative causal explanations."
    130       }
    131     },
    132     "setup_transparency": {
    133       "model_versions_specified": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper uses 'Gemini 2.0 Flash' (Section 3.1.1), 'Gemini-2.5-Pro' (Section 3.1.2), 'QwQ-32B' (Section 5.2), 'Qwen 2.5-72B' (Section 3.1.2), 'o3' (Section 6.2), but none include specific API versions, snapshot dates, or model card version identifiers. These are marketing names without version specificity."
    137       },
    138       "prompts_provided": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper provides actual prompt templates in Appendix A.2 (Prompt 1 for QA generation), Appendix A.3 (Prompt 2 for thinking trace generation, Prompt 3 for correctness filtering), and Appendix F (Prompt 4 for benchmark evaluation). These contain the full text used."
    142       },
    143       "hyperparameters_reported": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Appendix C reports LoRA rank=16, α=16, batch size=16, gradient accumulation every 2 steps, cosine learning rate schedule with peak 1e-5, 5% warmup, Adam with β1=0.9, β2=0.95, weight decay=1e-4, 8 epochs. Inference: temperature=0.6, K values for parallel scaling. Table 3 in Appendix F reports inference hyperparameters for all models."
    147       },
    148       "scaffolding_described": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "The approach does not use agentic scaffolding. It is a fine-tuning and inference pipeline without tool use, retry logic, or agent-based workflow."
    152       },
    153       "data_preprocessing_documented": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section 3.1.2 and Algorithm 1 describe the full data preprocessing pipeline: diversity sampling, complexity sampling, quality filtering (API failures, formatting, distractor quality), thinking trace generation, and dual-model correctness filtering. Section 5.1 describes the decontamination steps."
    157       }
    158     },
    159     "limitations_and_scope": {
    160       "limitations_section_present": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Section 8 (Discussion) contains a dedicated 'Limitations' paragraph that discusses five specific limitations."
    164       },
    165       "threats_to_validity_specific": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "The Limitations paragraph in Section 8 identifies specific threats: (1) closed vocabulary of the KG constraining conceptual coverage, (2) limitation to closed-ended MCQ tasks, (3) difficulty heuristic relying on oracle answers, (4) efficacy demonstrated only in medicine where reliable KG exists, and (5) generalizability to domains lacking canonical KGs not validated."
    169       },
    170       "scope_boundaries_stated": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "While the Limitations section acknowledges several constraints, the paper does not explicitly state what the results do NOT show. The paper claims 'superintelligence' but does not explicitly bound this claim to MCQ performance on KG-derived tasks. The title and framing significantly exceed what the evidence demonstrates, and no explicit 'what we do not claim' statement is provided."
    174       }
    175     },
    176     "data_integrity": {
    177       "raw_data_available": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "The curated 24,000 QA training tasks, ICD-Bench data, and generated thinking traces are not made available for independent verification. Only aggregate results are presented."
    181       },
    182       "data_collection_described": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section 3.1 and Algorithm 1 describe the data collection procedure in detail: path traversal on the UMLS KG, QA generation via LLM, quality filtering, thinking trace generation, and correctness filtering with dual LLM verification."
    186       },
    187       "recruitment_methods_described": {
    188         "applies": false,
    189         "answer": false,
    190         "justification": "No human participants were involved. Data is generated synthetically from a knowledge graph using LLMs."
    191       },
    192       "data_pipeline_documented": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "The full data pipeline is documented in Section 3.1.2 (5 steps), Algorithm 1 (Appendix A.3), and Section 5.1 (decontamination). Figure 3 provides a visual overview. However, exact counts of items filtered at each stage are not provided."
    196       }
    197     },
    198     "conflicts_of_interest": {
    199       "funding_disclosed": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "The 'Acknowledgments and Disclosure of Funding' section states: 'This work was supported by NSF under Grant No. CNS-2216746' and mentions computational resources from Princeton Research Computing and the Princeton Language and Intelligence Initiative."
    203       },
    204       "affiliations_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "All authors are affiliated with the Department of Electrical and Computer Engineering at Princeton University, clearly stated on the title page. No evaluated products are from Princeton."
    208       },
    209       "funder_independent_of_outcome": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "The funder is NSF (National Science Foundation), a government agency with no financial stake in the outcome of the research. The computational resources are from Princeton University."
    213       },
    214       "financial_interests_declared": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No competing interests or financial interests statement is provided in the paper. The absence of a disclosure statement means this criterion is not satisfied."
    218       }
    219     },
    220     "contamination": {
    221       "training_cutoff_stated": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "The paper uses QwQ-32B as the base model and evaluates against models like o3, Gemini-2.5-Pro, and DeepSeek-R1, but does not state the training data cutoff dates for any of these models."
    225       },
    226       "train_test_overlap_discussed": {
    227         "applies": true,
    228         "answer": true,
    229         "justification": "Section 5.1 describes a two-fold decontamination procedure: (1) excluding QA pairs whose KG paths exactly match ICD-Bench paths, and (2) an 18-gram text overlap filter. This addresses train/test overlap between the curated curriculum and ICD-Bench."
    230       },
    231       "benchmark_contamination_addressed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "For external benchmarks (MedQA, MedMCQA, MMLU-Med, PubMedQA), no contamination analysis is provided. The base QwQ-32B model was trained on web-scale data that could include these benchmark questions. The paper does not discuss whether the base model or the curriculum-tuned model may have seen these external benchmark questions during pre-training."
    235       }
    236     },
    237     "human_studies": {
    238       "pre_registered": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants are involved in this study."
    242       },
    243       "irb_or_ethics_approval": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants are involved in this study."
    247       },
    248       "demographics_reported": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants are involved in this study."
    252       },
    253       "inclusion_exclusion_criteria": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants are involved in this study."
    257       },
    258       "randomization_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants are involved in this study."
    262       },
    263       "blinding_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants are involved in this study."
    267       },
    268       "attrition_reported": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants are involved in this study."
    272       }
    273     },
    274     "cost_and_practicality": {
    275       "inference_cost_reported": {
    276         "applies": true,
    277         "answer": true,
    278         "justification": "The paper reports average thinking tokens per question across scaling configurations (Fig. 6 x-axis, Fig. 7 x-axis), enabling assessment of inference cost. Section 6.1 systematically varies inference budgets with K values and reports accuracy vs. token consumption trade-offs."
    279       },
    280       "compute_budget_stated": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Section 5.2 states 'fine-tuned using LoRA with rank 16 on 8 H100 NVIDIA GPUs, with each run taking approximately 20 hours.' The paper also notes curriculum generation is a 'one-time cost' (Section 3.1), though it does not quantify that cost."
    284       }
    285     }
    286   },
    287   "claims": [
    288     {
    289       "claim": "QwQ-Med-3 significantly outperforms state-of-the-art open-source and proprietary reasoning models on all 15 categories of ICD-Bench.",
    290       "evidence": "Fig. 1, Fig. 7, and Section 6.2 show QwQ-Med-3 outperforms QwQ-32B base, DeepSeek-R1-Distilled Qwen, o3, and Gemini-2.5-Pro across all ICD-Bench categories, with 10-20% improvement over open-source baselines.",
    291       "supported": "moderate"
    292     },
    293     {
    294       "claim": "Performance improves with deeper and more diverse KG curricula, with curriculum depth proving especially crucial for the most challenging reasoning tasks.",
    295       "evidence": "Section 6.4 and Fig. 9 show ablations across 1-hop only, 3-hop only, balanced, and full 24K datasets. The hardest tasks benefit most from 3-hop paths, while easy tasks benefit more from diversity.",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "Curriculum-tuned models can transfer acquired KG primitives to external medical QA benchmarks beyond the original KG.",
    300       "evidence": "Table 1 shows QwQ-Med-3/parallel-scaling achieves competitive or improved results on MedQA (85.39), PubMedQA (78.19), MedMCQA (73.25), and MMLU-Med (92.90) versus baselines. However, on MedQA the base QwQ (85.62/87.09) outperforms QwQ-Med-3 (82.72/85.39).",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "Curriculum-tuned models show greater marginal gains on harder tasks, with the base model offering near-zero accuracy on the hardest tasks.",
    305       "evidence": "Section 6.3 and Fig. 8 show difficulty-stratified results where the base model drops to near-zero on the hardest bin while QwQ-Med-3 maintains meaningful accuracy. The performance gap widens with difficulty.",
    306       "supported": "strong"
    307     },
    308     {
    309       "claim": "The approach demonstrates domain-specific superintelligence, taking a step towards medical superintelligence.",
    310       "evidence": "The paper demonstrates strong MCQ performance on ICD-Bench and external benchmarks but does not demonstrate reasoning beyond closed-ended multiple-choice format. No open-ended clinical reasoning, real-world validation, or comparison to human expert performance is provided.",
    311       "supported": "weak"
    312     }
    313   ],
    314   "methodology_tags": [
    315     "benchmark-eval"
    316   ],
    317   "key_findings": "The paper proposes a task-generation pipeline that traverses knowledge graph paths to create grounded medical reasoning tasks, then fine-tunes QwQ-32B on 24,000 such tasks to produce QwQ-Med-3. On the newly introduced ICD-Bench (3,675 medical QA items across 15 specialties), QwQ-Med-3 outperforms o3, Gemini-2.5-Pro, and other baselines across all categories. Ablation studies show curriculum depth is most critical for hard tasks while diversity helps across all difficulties. The curriculum-tuned model shows competitive transfer to external medical benchmarks (MedQA, PubMedQA, MedMCQA, MMLU-Med), though it underperforms the base model on MedQA.",
    318   "red_flags": [
    319     {
    320       "flag": "Overclaimed title and framing",
    321       "detail": "The paper claims 'domain-specific superintelligence' and 'a step towards medical superintelligence' based on multiple-choice QA performance. Superintelligence implies surpassing human expert capability across the domain, but the evaluation is limited to closed-ended MCQ tasks generated from a single KG. No comparison to human expert performance is provided, and no open-ended clinical reasoning is evaluated."
    322     },
    323     {
    324       "flag": "Self-constructed benchmark advantage",
    325       "detail": "ICD-Bench is constructed by the same team using the same KG (UMLS) and similar pipeline as the training data. While decontamination is performed (path exclusion + 18-gram filter), the benchmark inherently tests the same type of KG-grounded reasoning the model was trained on, which may overstate generalization. The model's advantage on ICD-Bench vs. external benchmarks supports this concern."
    326     },
    327     {
    328       "flag": "No formal significance testing",
    329       "detail": "The paper repeatedly uses the word 'significantly' to describe performance differences but provides no formal statistical significance tests. Bootstrapped CIs are reported as <0.75% but no comparison tests between models are conducted."
    330     },
    331     {
    332       "flag": "Single training run",
    333       "detail": "Each model variant appears to be trained exactly once. No multi-seed experiments are reported, making it impossible to assess the stability of results across random initialization or data sampling."
    334     },
    335     {
    336       "flag": "Base model outperforms on external benchmark",
    337       "detail": "On MedQA (Table 1), the base QwQ model (85.62/87.09 with scaling) outperforms QwQ-Med-3 (82.72/85.39 with scaling), suggesting the curriculum tuning may cause some forgetting on standard medical tasks even while improving KG-grounded reasoning."
    338     }
    339   ],
    340   "cited_papers": [
    341     {
    342       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    343       "authors": ["DeepSeek-AI"],
    344       "year": 2025,
    345       "arxiv_id": "2501.12948",
    346       "relevance": "Major open-source reasoning model used as a baseline, representing the RL-based approach to reasoning in LLMs."
    347     },
    348     {
    349       "title": "Scaling LLM Test-Time Compute Optimally Can Be More Effective Than Scaling Model Parameters",
    350       "authors": ["Charlie Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"],
    351       "year": 2024,
    352       "arxiv_id": "2408.03314",
    353       "relevance": "Foundational work on inference-time scaling that this paper builds upon for its parallel and iterative refinement strategies."
    354     },
    355     {
    356       "title": "s1: Simple Test-Time Scaling",
    357       "authors": ["Niklas Muennighoff", "Zitong Yang", "Weijia Shi"],
    358       "year": 2025,
    359       "arxiv_id": "2501.19393",
    360       "relevance": "Inference-time scaling approach used as basis for the iterative refinement strategy in this paper."
    361     },
    362     {
    363       "title": "Emergent Abilities of Large Language Models",
    364       "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani"],
    365       "year": 2022,
    366       "arxiv_id": "2206.07682",
    367       "relevance": "Foundational work on emergent abilities in LLMs, relevant to claims about emergence of domain-specific reasoning."
    368     },
    369     {
    370       "title": "Are Emergent Abilities of Large Language Models a Mirage?",
    371       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    372       "year": 2023,
    373       "arxiv_id": "2304.15004",
    374       "relevance": "Challenges the notion of emergent abilities in LLMs, directly relevant to evaluating this paper's claims about 'emergence' of domain-specific reasoning."
    375     },
    376     {
    377       "title": "QA-GNN: Reasoning with Language Models and Knowledge Graphs for Question Answering",
    378       "authors": ["Michihiro Yasunaga", "Hongyu Ren", "Antoine Bosselut", "Percy Liang", "Jure Leskovec"],
    379       "year": 2021,
    380       "relevance": "Key prior work on integrating knowledge graphs with language models for QA, which this paper builds upon for the UMLS KG construction."
    381     },
    382     {
    383       "title": "Small Language Models Learn Enhanced Reasoning Skills from Medical Textbooks",
    384       "authors": ["Hyunjae Kim"],
    385       "year": 2025,
    386       "relevance": "Meerkat model family used as a baseline; demonstrates potential of small LMs with careful data curation for medical reasoning."
    387     },
    388     {
    389       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    390       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    391       "year": 2022,
    392       "arxiv_id": "2201.11903",
    393       "relevance": "Foundational work on chain-of-thought reasoning that underpins the thinking trace approach used in this paper."
    394     },
    395     {
    396       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    397       "authors": ["Edward J. Hu", "Yelong Shen", "Phillip Wallis"],
    398       "year": 2021,
    399       "arxiv_id": "2106.09685",
    400       "relevance": "The fine-tuning method used in this paper for curriculum-tuning the QwQ-32B model."
    401     },
    402     {
    403       "title": "Capabilities of GPT-4 on Medical Challenge Problems",
    404       "authors": ["Harsha Nori", "Nicholas King", "Scott Mayer McKinney"],
    405       "year": 2023,
    406       "arxiv_id": "2303.13375",
    407       "relevance": "Demonstrates LLM capabilities on medical exams, providing context for evaluating the medical QA benchmark results in this paper."
    408     },
    409     {
    410       "title": "Curriculum Learning",
    411       "authors": ["Yoshua Bengio", "Jerome Louradour", "Ronan Collobert", "Jason Weston"],
    412       "year": 2009,
    413       "relevance": "Foundational work on curriculum learning that motivates the progressive training approach used in this paper."
    414     },
    415     {
    416       "title": "AlphaEvolve: A Coding Agent for Scientific and Algorithmic Discovery",
    417       "authors": ["Alexander Novikov"],
    418       "year": 2025,
    419       "arxiv_id": "2506.13131",
    420       "relevance": "Example of domain-specific AI achieving expert-level performance, relevant to the broader discussion of domain-specific superintelligence."
    421     }
    422   ]
    423 }

Impressum · Datenschutz