scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27102B)
      1 {
      2   "paper": {
      3     "title": "IntroLM: Introspective Language Models via Prefilling-Time Self-Evaluation",
      4     "authors": [
      5       "Hossein Hosseini Kasnavieh",
      6       "Gholamreza Haffari",
      7       "Chris Leckie",
      8       "Adel N. Toosi"
      9     ],
     10     "year": 2026,
     11     "venue": "arXiv",
     12     "arxiv_id": "2601.03511",
     13     "doi": "10.48550/arXiv.2601.03511"
     14   },
     15   "scan_version": 2,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "IntroLM enables causal language models to predict their own output quality during the prefilling phase using special [CPX] tokens and token-conditional LoRA, achieving ROC-AUC of 86-90% across QA and chat benchmarks, outperforming DeBERTa-v3-Large baselines by 4-14 points. When integrated into multi-model routing, IntroLM reduces large-model usage by up to 50% and end-to-end latency by up to 34% at matched reliability. Ablation studies show token-conditional LoRA is critical for strong performance, especially on long-context tasks, and that introspective signals improve with backbone model capacity.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "All evaluation datasets are publicly available: MMLU, MMLU-PRO, GSM8K, HotpotQA, and LMSYS-Chat-1M are standard public benchmarks referenced with citations."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "Hardware is mentioned (two H100 GPUs, vLLM) and model names are given, but no requirements.txt, Dockerfile, or detailed dependency specification is provided."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided. Training details are in Appendix C but are not sufficient to reproduce without code."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "All results in Tables 1-5 report single point estimates (e.g., 'ROC-AUC 89.1', 'PR-AUC 63.4') with no confidence intervals or error bars."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper claims IntroLM 'outperforms' baselines based solely on comparing point estimates. No statistical significance tests (t-tests, bootstrap, etc.) are reported for any comparison."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Effect sizes are reported with baseline context throughout: '14 points' improvement over DeBERTa (Section 1), 'up to 50%' reduction in large-model calls, 'up to 34%' latency reduction (Section 6.2.2), and absolute differences are visible in tables."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Dataset sizes are stated (136,515 for General QA, 97,074 for HotpotQA, 100K for LMSYS) but no justification is given for why these sizes are sufficient and no power analysis is discussed."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No variance, standard deviation, or spread measures are reported. All results appear to be from single training runs with no repeated experiments."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "DeBERTa-v3-Base (184M) and DeBERTa-v3-Large (435M) are included as baselines for complexity evaluation. Random routing serves as a baseline for routing evaluation (Figure 5)."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "DeBERTa-v3 models are strong encoder-based classifiers widely used in prior routing work. The paper justifies that BERT-based classifiers are the standard approach in prior routing studies (Section 2)."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Extensive ablations: LoRA target modules (Table 4), effect of [CPX] tokens vs backbone-only (Table 3), backbone model capacity (Table 2), and layer-wise introspection (Table 5, Appendix D)."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Complexity evaluation uses ROC-AUC and PR-AUC. Routing evaluation uses reliability, large-model call rate, and end-to-end latency (Section 5.3)."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No human evaluation of any kind. All evaluation is automated: ROC-AUC/PR-AUC for classification, and analytical latency/cost formulas for routing."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Appendix B.2 states: 'All datasets use a consistent 80/10/10 train/validation/test split. Splits are constructed at the prompt level and fixed across all experiments.'"
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are broken down across three distinct benchmarks (General QA, HotpotQA, Chat) in Table 1, and routing results shown separately for General QA and HotpotQA in Figure 5."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "No qualitative analysis of failure cases. The paper does not show examples where IntroLM makes incorrect predictions or discuss what types of prompts are misclassified."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The 'No LoRA' variant in Table 4 shows degraded performance (85.7 vs 89.1 ROC-AUC). Table 3 shows backbone-only is suboptimal. Table 2 shows the smaller 1.7B model performs worse. Gains on chat-style data are acknowledged as smaller (Section 6.2.1)."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The abstract claims 'ROC-AUC of 90%' and 'outperforming by 14%' in the same sentence. The 90% comes from the Chat benchmark (90.1, Table 1), where the gap over DeBERTa-v3-Large is only 3.8 points, while the 14% gap comes from General QA (89.1 - 75.8 = 13.3). These are cherry-picked best-case numbers from different benchmarks combined into a single misleading claim."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Causal claims like 'token-conditional LoRA enables effective introspection' are supported by controlled ablation studies (Tables 3-4) that isolate individual components through single-variable manipulation."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The Limitations section (Section 9) explicitly bounds scope: 'Extending IntroLM to other task families—such as creative generation, multi-turn dialogue, code generation, or domain-specific applications—may require additional task-specific adaptation.' Also notes only two backbone sizes tested."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "No discussion of alternative explanations. The 8B IntroLM backbone has ~18x the parameters of DeBERTa-v3-Large (435M). The performance advantage could partly be due to raw capacity rather than the introspection mechanism. This confound is not addressed."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper measures ROC-AUC for predicting binary success/failure labels and reports exactly that. Routing metrics (reliability, call rate, latency) are clearly defined in Section 5.3. Claims match the granularity of measurements."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Specific model versions are given: Qwen3-8B, Qwen3-1.7B, Qwen3-32B, DeBERTa-v3-Base, DeBERTa-v3-Large, Llama-3.1-8B-Instruct, Qwen2.5-32B-Instruct. For open-source models, the name+size combination identifies a unique checkpoint."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Judge prompts for correctness evaluation and chat quality evaluation are provided in full (Figures 6 and 7). IntroLM itself uses no prompts — it appends [CPX] tokens with a classification head, so no prompt text is needed."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Appendix C provides detailed hyperparameters: context window 2048, batch size 64, cosine scheduling, 10% warmup, LoRA rank 32, α=64, gradient norm 0.3, weight decay 0.002, learning rates 4-8×10⁻⁵. DeBERTa baselines similarly detailed."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. IntroLM is a direct classification mechanism applied during prefilling."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Appendix B describes dataset construction in detail: sources combined, filtering of LMSYS-Chat-1M (extracting individual turns, removing trivial/context-dependent inputs), labeling thresholds (score <8 = unsuccessful), and class distributions (21% complex for QA, 14% for HotpotQA, ~50% for Chat)."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 9 'Limitations' provides substantive discussion covering task scope, training cost, backbone sizes tested, and supervision requirements."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Limitations are specific to this study: tested only on QA/chat tasks not creative generation or code; higher training cost than BERT baselines; only two backbone sizes evaluated; reliance on supervised labeled data."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section 9 explicitly states what was NOT tested: 'creative generation, multi-turn dialogue, code generation, or domain-specific applications' and notes 'a systematic evaluation of IntroLM on substantially larger backbones remains an important direction.'"
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The processed training data (benchmark questions with model-generated labels) is not released. While source benchmarks are public, the specific labeled datasets constructed for IntroLM training are not available."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Appendix B describes data collection in detail: source benchmarks, sample sizes, labeling procedure (LLM judges), judge prompts, threshold decisions, and class distributions."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. All data comes from standard public benchmarks (MMLU, GSM8K, HotpotQA, LMSYS-Chat-1M)."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Pipeline is documented: benchmark questions → model generates answers → LLM judge evaluates correctness (Figures 6-7) → binary labels assigned → 80/10/10 split. Total counts and label distributions provided."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "Section 8 'Acknowledgments' only mentions use of AI tools for proofreading. No funding sources, grants, or financial support are disclosed."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations are clearly listed: University of Melbourne (School of Computing and Information Systems) and Monash University (Department of Data Science & AI). No product being evaluated is affiliated with the authors."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No funding is disclosed at all. Without a funding statement, independence cannot be assessed."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests statement or financial disclosure is present in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "The training data cutoff for Qwen3-8B is not stated. The paper uses this model on public benchmarks without specifying when its training data was collected."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No discussion of whether Qwen3-8B may have seen MMLU, GSM8K, or HotpotQA examples during pre-training. These are widely-used public benchmarks likely present in web-scraped training data."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "MMLU (2021), GSM8K (2021), and HotpotQA (2018) were all publicly available well before Qwen3's training. No contamination analysis is performed despite high contamination risk."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Inference latency is a central evaluation metric. Section 5.3 defines TTFT and TPOT metrics. Section 6.2.2 reports end-to-end latency measured with vLLM on H100 GPUs, with latency reduction quantified (up to 34%)."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Inference hardware is mentioned (two H100 GPUs) but total training compute is not stated — no GPU hours, training time, or total compute budget for training IntroLM or the baselines."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No mention of multiple random seeds. All results appear to be from single training runs."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The number of experimental runs is never explicitly stated. Results are presented as single values without indicating how many runs produced them."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "Hyperparameters are listed (Appendix C) but no search budget is stated — the number of configurations tried, search method, or compute spent on tuning is not reported."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The final configuration (LoRA rank 32, specific learning rates, target modules) is reported but the selection process is not justified. Ablation tables show alternatives but don't explain how the final config was chosen on validation data."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors compare their IntroLM system against DeBERTa baselines without acknowledging the inherent bias of evaluating their own system. No discussion of author-evaluation bias or independent evaluation."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "IntroLM uses a Qwen3-8B backbone (~8B parameters) while baselines use DeBERTa-v3-Large (~435M parameters), an ~18x capacity difference. The paper does not discuss performance at matched compute budgets. The Limitations section mentions higher training cost but does not quantify or compare fairly."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "No discussion of whether MMLU, GSM8K, HotpotQA, or LMSYS-Chat-1M actually measure the 'prompt complexity' that IntroLM claims to evaluate. The relationship between benchmark correctness and prompt complexity is assumed without validation."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding is involved in the evaluation. IntroLM is a direct classification mechanism."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of temporal leakage. MMLU (2021), GSM8K (2021), and HotpotQA (2018) were all publicly available before Qwen3's likely training period. Solutions may be in the training data."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of feature leakage. The evaluation setup where the model generates answers to create training labels is not analyzed for potential information leakage."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "The paper does not discuss whether train and test splits share structural similarities. The 80/10/10 split is constructed at the prompt level (Appendix B.2), but potential non-independence (e.g., similar questions across splits) is not addressed."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention method is used. No canary strings, membership inference, n-gram overlap, or decontamination analysis."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "IntroLM applied to Qwen3-8B achieves ROC-AUC of 89.1% on General QA, 86.3% on HotpotQA, and 90.1% on Chat, outperforming DeBERTa-v3-Large by 13.3, 14.5, and 3.8 points respectively.",
    370       "evidence": "Table 1 reports ROC-AUC and PR-AUC across three benchmarks with direct comparison to DeBERTa baselines.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "IntroLM-based routing reduces large-model usage by up to 50% and end-to-end latency by up to 34% at matched reliability.",
    375       "evidence": "Figure 5 shows routing trade-off curves. Section 6.2.2 reports 'up to 50% (30% on average)' for call rate reduction and 'up to 34% (15% on average)' for latency reduction on General QA. Latency computed analytically using formulas in Section 5.3 with vLLM measurements on H100 GPUs.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Token-conditional LoRA is essential for effective introspective complexity prediction, with substantial degradation when removed.",
    380       "evidence": "Table 4 shows No LoRA achieves 85.7 vs 89.1 ROC-AUC; Table 3 shows backbone-only achieves 81.0 vs 86.3 on HotpotQA. The gap is especially large on PR-AUC for the minority class.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Increasing model capacity improves the quality of introspective signals.",
    385       "evidence": "Table 2 compares Qwen3-8B (89.1 ROC-AUC) vs Qwen3-1.7B (84.24 ROC-AUC). Both outperform DeBERTa, but the larger backbone shows bigger gains.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "IntroLM preserves the original backbone's generation behavior unchanged.",
    390       "evidence": "Section 4 describes the mechanism: [CPX] tokens are excluded from the KV cache, and decoding starts from the original prompt's last hidden state. Token-conditional LoRA masks ensure non-[CPX] tokens are unaffected. This is an architectural design argument; it is not empirically verified.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Meaningful introspective signals emerge well before the final layer.",
    395       "evidence": "Table 5 shows layer-wise introspection results: first 24/36 layers of Qwen3-8B achieve 87.9 ROC-AUC vs 89.1 for the full model.",
    396       "supported": "strong"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "Abstract cherry-picks best-case numbers from different benchmarks",
    402       "detail": "The abstract claims 'ROC-AUC of 90%' (from the Chat benchmark: 90.1%) and 'outperforming by 14%' (from the General QA benchmark: 89.1 - 75.8 = 13.3) in the same sentence, creating a misleadingly strong impression by combining best-case figures from different datasets. On the Chat benchmark itself, the gap over DeBERTa-v3-Large is only 3.8 points."
    403     },
    404     {
    405       "flag": "No error bars or variance on any result",
    406       "detail": "All results in Tables 1-5 are single point estimates with no confidence intervals, standard deviations, or repeated runs. For a method involving LoRA training and classification thresholds, single-run results are insufficient to assess stability."
    407     },
    408     {
    409       "flag": "Unfair capacity comparison without acknowledgment",
    410       "detail": "IntroLM uses a Qwen3-8B backbone (~8B parameters) compared against DeBERTa-v3-Large (~435M parameters), an ~18x capacity difference. The performance advantage could partly be attributable to raw model capacity rather than the introspection mechanism. This confound is not discussed."
    411     },
    412     {
    413       "flag": "No contamination analysis despite high-risk benchmarks",
    414       "detail": "MMLU, GSM8K, and HotpotQA are all public benchmarks from 2018-2021, widely known to be present in LLM training corpora. Qwen3-8B almost certainly saw these during training, affecting the label distribution. No contamination analysis is performed."
    415     },
    416     {
    417       "flag": "Routing latency analysis is analytical, not measured end-to-end",
    418       "detail": "Latency comparisons use analytical formulas (Section 5.3) with separately measured TTFT and TPOT, not actual end-to-end routing system measurements. Real-world routing overhead (network, decision logic, model switching) is not captured."
    419     }
    420   ],
    421   "cited_papers": [
    422     {
    423       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    424       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    425       "year": 2023,
    426       "relevance": "Foundational work on LLM cost optimization through sequential cascade routing, directly compared against in the routing literature."
    427     },
    428     {
    429       "title": "AutoMix: Automatically mixing language models",
    430       "authors": ["Pranjal Aggarwal", "Aman Madaan"],
    431       "year": 2023,
    432       "relevance": "Post-execution routing approach that uses the same LLM to verify its output, representing an alternative routing paradigm to prefilling-time prediction."
    433     },
    434     {
    435       "title": "RouteLLM: Learning to route LLMs from preference data",
    436       "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu"],
    437       "year": 2025,
    438       "relevance": "Major LLM routing framework exploring multiple strategies including BERT classifiers and causal LLM-based classifiers."
    439     },
    440     {
    441       "title": "Hybrid LLM: Cost-efficient and quality-aware query routing",
    442       "authors": ["Dujian Ding", "Ankur Mallick", "Chi Wang"],
    443       "year": 2024,
    444       "relevance": "Pre-execution routing using BERT-style classifiers to select between LLMs, representative of the BERT-based routing paradigm IntroLM aims to replace."
    445     },
    446     {
    447       "title": "BEST-Route: Adaptive LLM routing with test-time optimal compute",
    448       "authors": ["Dujian Ding", "Ankur Mallick", "Shaokun Zhang"],
    449       "year": 2025,
    450       "relevance": "Adaptive LLM routing with test-time optimization, extending BERT-based routing with compute-aware strategies."
    451     },
    452     {
    453       "title": "Learning to route LLMs with confidence tokens",
    454       "authors": ["Yu-Neng Chuang", "Prathusha Kameswara Sarma"],
    455       "year": 2025,
    456       "relevance": "Most closely related work: introduces confidence tokens generated at end of decoding for routing decisions, contrasting with IntroLM's prefilling-time approach."
    457     },
    458     {
    459       "title": "LoRA: Low-rank adaptation of large language models",
    460       "authors": ["Edward J. Hu", "Phillip Wallis", "Zeyuan Allen-Zhu"],
    461       "year": 2022,
    462       "relevance": "Foundational parameter-efficient fine-tuning method that IntroLM extends with token-conditional masking."
    463     },
    464     {
    465       "title": "Routing to the expert: Efficient reward-guided ensemble of large language models",
    466       "authors": ["Keming Lu", "Hongyi Yuan", "Runji Lin"],
    467       "year": 2024,
    468       "relevance": "Pre-generation routing using distilled reward-model signals into lightweight encoder routers."
    469     },
    470     {
    471       "title": "DynamoLLM: Designing LLM inference clusters for performance and energy efficiency",
    472       "authors": ["Jovan Stojkovic", "Chaojie Zhang"],
    473       "year": 2025,
    474       "relevance": "LLM inference cluster design focusing on latency metrics (TTFT, TPOT) used in IntroLM's analytical latency model."
    475     }
    476   ]
    477 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs