ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28895B)


      1 {
      2   "scan_version": 2,
      3   "active_modules": ["experimental_rigor", "data_leakage"],
      4   "paper": {
      5     "title": "Interfaze: The Future of AI is built on Task-Specific Small Models",
      6     "authors": [
      7       "Harsha Vardhan Khurdula",
      8       "Vineet Agarwal",
      9       "Yoeven D Khemlani"
     10     ],
     11     "year": 2026,
     12     "venue": "IEEE Conference on Artificial Intelligence (CAI) 2026",
     13     "arxiv_id": "2602.04101"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "No source code repository or URL is provided anywhere in the paper. The system architecture is described but no code is released."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "All evaluation benchmarks are publicly available standard datasets: MMLU-Pro, MMLU, GPQA-Diamond, AIME-2025, LiveCodeBench v5, MMMU, AI2D, ChartQA, Common Voice v16. References with URLs are provided for each."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No environment specifications, dependency lists, or hardware details are provided. The paper mentions 'separate GPU pools with batching and caching' (Section III-B) but gives no specifics."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No reproduction instructions are included. The paper describes architecture conceptually but provides no runnable scripts, commands, or step-by-step instructions to replicate results."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "Table II reports only point estimates (e.g., '83.6', '91.38') with no confidence intervals, error bars, or uncertainty measures on any result."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper claims improvements like '+3.0 (MMLU-Pro)', '+55.3 (AIME-2025)' over GPT-4.1 based solely on comparing two numbers without any statistical significance tests."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section V reports point differences with baseline context (e.g., '+3.0 (MMLU-Pro), +1.18 (MMLU), +55.3 (AIME-2025)') and Table II shows both system scores and baseline scores, providing enough context to assess magnitude."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No discussion of sample sizes for any benchmark evaluation. The number of test examples is not stated for any benchmark, nor is there any power analysis."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No variance, standard deviation, or multiple-run results reported. All results appear to be single-run point estimates."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Table II compares against seven baselines: GPT-4.1, GPT-5 (Minimal Reasoning), Claude Sonnet 4, Gemini 2.5 Flash, Claude Sonnet 4 (Thinking), Claude Opus 4 (Thinking), and Gemini 2.5 Pro."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Baselines include very recent models: GPT-5, Claude Opus 4, Gemini 2.5 Pro, Claude Sonnet 4 — all contemporary frontier models at time of writing."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Section V mentions ablation observations: 'removing OCR/diagram/chart parsers drops AI2D/ChartQA by 4–7 points; disabling context compilation costs ≈2 points on GPQA-Diamond; turning off the optional short reasoning head hurts AIME and MMLU-Pro in high-difficulty slices.' However, no ablation table is provided — these are mentioned only in passing."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Nine benchmarks are used spanning knowledge, reasoning, code, multimodal, and speech: MMLU-Pro, MMLU, GPQA-Diamond, AIME-2025, LiveCodeBench v5, MMMU, AI2D, ChartQA, Common Voice v16."
     85       },
     86       "human_evaluation": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No human evaluation of system outputs is performed. All evaluation is automated benchmark scoring."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Results are reported on standard benchmark test sets (MMLU, GPQA-Diamond, AIME-2025, LiveCodeBench v5, MMMU validation split, AI2D, ChartQA, Common Voice v16) which have established train/test splits."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Table II provides per-benchmark breakdown across nine different benchmarks. Section V-A provides per-domain discussion (knowledge, science, math, coding, multimodal, speech)."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "No specific failure cases or error analyses are shown. Section VI discusses engineering limitations (delay, over-building) but does not show examples where the system fails or produces incorrect outputs."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper acknowledges that LiveCodeBench v5 (57.77) trails Sonnet 4 Thinking by 7.73 and Gemini 2.5 Pro by 18.13, and explicitly attributes this to 'a deliberate choice to favor lightweight sandbox checks over longer agentic debugging loops.'"
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "All specific benchmark numbers claimed in the abstract (83.6% MMLU-Pro, 91.4% MMLU, 81.3% GPQA-Diamond, 57.8% LiveCodeBench v5, 90.0% AIME-2025, 77.3% MMMU, 91.5% AI2D, 90.9% ChartQA, 90.8% Common Voice) match Table II."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper claims 'most of the improvement comes from the small-model and tool stack and the way it compiles context' (Section V). While ablations are mentioned in passing, they are not presented in a controlled table and the comparison is between a tooled system and non-tooled baselines, making it impossible to isolate the causal contribution of each component rigorously."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The title claims 'The Future of AI is built on Task-Specific Small Models' — an extremely broad generalization from benchmark evaluations on nine specific tasks. The paper does not bound its claims to the tested benchmarks or acknowledge that results may not generalize to other domains."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "No alternative explanations are discussed. The paper does not consider whether gains come from tool access (search, code sandbox) rather than the small-model architecture, or whether baselines with equivalent tool access would close the gap."
    132       },
    133       "proxy_outcome_distinction": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper measures benchmark accuracy scores and frames them as evidence that 'the future of AI is built on task-specific small models' — a claim about production deployment economics and architecture. The gap between benchmark performance and the claimed practical superiority of the architecture is not acknowledged."
    137       }
    138     },
    139     "setup_transparency": {
    140       "model_versions_specified": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "Baselines are listed as 'GPT-4.1', 'GPT-5 (Minimal Reasoning)', 'Claude Sonnet 4', etc. — marketing names without snapshot dates or API versions. The paper's own small models are described only at the architecture level ('a stack of convolutional and self-attention blocks') with no model names or versions."
    144       },
    145       "prompts_provided": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "No prompts or system instructions are provided for any benchmark evaluation. The paper describes that prompts are compiled by the context layer but does not show actual prompt text."
    149       },
    150       "hyperparameters_reported": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "No hyperparameters are reported — no temperature, top-p, max tokens, learning rates, or sampling settings for any model in the system."
    154       },
    155       "scaffolding_described": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "The agentic scaffolding is described in substantial detail across Sections III-IV: the ingress stage, small-model perception stack, context construction layer with schema (observations, entities, relations, provenance), action layer with controller and tool chains, and fallback mechanisms."
    159       },
    160       "data_preprocessing_documented": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "While the perception pipeline is described architecturally (OCR, ASR, context compilation), no details are provided about how benchmark inputs were specifically prepared, preprocessed, or formatted for evaluation."
    164       }
    165     },
    166     "limitations_and_scope": {
    167       "limitations_section_present": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section VI 'Limitations and Future Work' discusses delay from SLM fan-out and cold starts, and over-building of context when the controller invokes more tools than needed."
    171       },
    172       "threats_to_validity_specific": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section VI identifies specific threats: 'Delay stems from context fan out from SLMs (OCR/ASR, scraping, retrieval) and the bounded agentic loop, plus cold starts for small models' and 'Over-building happens when the controller invokes more tools or retrieval passes than are needed for a given query, inflating cost without clear quality gains.'"
    176       },
    177       "scope_boundaries_stated": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "The paper does not state specific scope boundaries. The title claims 'The Future of AI' but the limitations section discusses only engineering improvements (delay, over-building), not what domains, modalities, or use cases the results do NOT apply to."
    181       }
    182     },
    183     "data_integrity": {
    184       "raw_data_available": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "No raw evaluation data (per-example predictions, model outputs, logs) is available. Only aggregate accuracy numbers are reported in Table II."
    188       },
    189       "data_collection_described": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The paper states which benchmarks were used but does not describe how evaluations were conducted — which splits were used, how inputs were formatted, or how scoring was performed."
    193       },
    194       "recruitment_methods_described": {
    195         "applies": false,
    196         "answer": false,
    197         "justification": "No human participants; all evaluation uses standard public benchmarks."
    198       },
    199       "data_pipeline_documented": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "The architecture pipeline is described but the evaluation data pipeline is not. No details on how benchmark inputs flow through the system during evaluation, or how scoring is performed."
    203       }
    204     },
    205     "conflicts_of_interest": {
    206       "funding_disclosed": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "No funding source is disclosed. All three authors are affiliated with JigsawStack, Inc., but no funding statement is provided."
    210       },
    211       "affiliations_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "All authors list 'JigsawStack, Inc.' as their affiliation with email addresses at jigsawstack.com. The affiliation with the company behind the product is clear."
    215       },
    216       "funder_independent_of_outcome": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "All authors are employees of JigsawStack, Inc., the company behind the Interfaze product. The employer has a direct commercial interest in positive benchmark results for their system."
    220       },
    221       "financial_interests_declared": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No competing interests or financial interests statement is included, despite all authors being employees of the company whose product is being evaluated."
    225       }
    226     },
    227     "contamination": {
    228       "training_cutoff_stated": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No training cutoff dates are stated for any model — not for the proprietary small models trained in-house, nor for the final LLM, nor for the baseline models."
    232       },
    233       "train_test_overlap_discussed": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No discussion of potential train/test overlap for any model or benchmark. The small models are 'trained on a mixture of public and proprietary data' (Section III-B) with no overlap analysis."
    237       },
    238       "benchmark_contamination_addressed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No contamination analysis despite using well-known public benchmarks (MMLU published 2020, GPQA 2023, etc.) that could appear in training data of both the small models and the final LLM."
    242       }
    243     },
    244     "human_studies": {
    245       "pre_registered": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants in this study."
    249       },
    250       "irb_or_ethics_approval": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "demographics_reported": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "inclusion_exclusion_criteria": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "randomization_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "blinding_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "attrition_reported": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       }
    280     },
    281     "cost_and_practicality": {
    282       "inference_cost_reported": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "The paper argues for cost efficiency ('shifting the bulk of computation away from the most expensive and monolithic models') but reports no actual costs, latencies, token counts, or per-example timing."
    286       },
    287       "compute_budget_stated": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "No total computational budget is stated. No GPU hours, API costs, hardware specs, or training time for the small models."
    291       }
    292     },
    293     "experimental_rigor": {
    294       "seed_sensitivity_reported": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "No multiple seed experiments. All results appear to be from single runs with no sensitivity analysis."
    298       },
    299       "number_of_runs_stated": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "The number of experimental runs is never stated. It is unclear whether results are from single runs or averaged."
    303       },
    304       "hyperparameter_search_budget": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "No hyperparameter search budget is reported despite the system having many tunable components (controller thresholds, context budgets, confidence thresholds for OCR fallback, etc.)."
    308       },
    309       "best_config_selection_justified": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "No discussion of how the system configuration was selected. The controller is 'trained on offline tuples of requests, tool chains, and pass/fail labels' (Section III-E) with no detail on configuration selection."
    313       },
    314       "multiple_comparison_correction": {
    315         "applies": false,
    316         "answer": false,
    317         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    318       },
    319       "self_comparison_bias_addressed": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "JigsawStack employees evaluate their own Interfaze product against third-party models without acknowledging author-evaluation bias. No independent evaluation or bias mitigation."
    323       },
    324       "compute_budget_vs_performance": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The paper claims cost efficiency but does not report performance as a function of compute. No comparison at matched compute budgets with baselines."
    328       },
    329       "benchmark_construct_validity": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "No discussion of whether benchmarks measure what the paper claims. The paper uses AIME-2025 and MMLU to support claims about 'the future of AI' without discussing construct validity."
    333       },
    334       "scaffold_confound_addressed": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "Interfaze-Beta is compared against standalone LLMs (GPT-4.1, Claude, Gemini) without equivalent tool access. The comparison conflates the tool stack contribution with the architecture. The paper partially acknowledges this ('most of the improvement comes from the small-model and tool stack') but does not test baselines with equivalent tool augmentation."
    338       }
    339     },
    340     "data_leakage": {
    341       "temporal_leakage_addressed": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "No discussion of temporal leakage. The small models are 'trained on a mixture of public and proprietary data' with no temporal analysis of whether benchmark data appears in training sets."
    345       },
    346       "feature_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "Not addressed. The tool stack includes web search and retrieval, which could leak benchmark answers during evaluation. No discussion of whether the search/retrieval tools could access benchmark solutions."
    350       },
    351       "non_independence_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of train/test independence for any model in the system."
    355       },
    356       "leakage_detection_method": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, decontamination, or temporal splits."
    360       }
    361     }
    362   },
    363   "claims": [
    364     {
    365       "claim": "Interfaze-Beta achieves 83.6% on MMLU-Pro, 91.4% on MMLU, 81.3% on GPQA-Diamond, 57.8% on LiveCodeBench v5, 90.0% on AIME-2025, 77.3% on MMMU, 91.5% on AI2D, 90.9% on ChartQA, and 90.8% on Common Voice v16.",
    366       "evidence": "Table II provides head-to-head benchmark results with seven baselines. Numbers match abstract claims.",
    367       "supported": "moderate"
    368     },
    369     {
    370       "claim": "Interfaze-Beta improves over GPT-4.1 by a macro-average of +13.53 points (median +5.61) on shared benchmarks.",
    371       "evidence": "Section V computes differences from Table II across seven shared benchmarks. However, the comparison is between a tool-augmented system and a standalone LLM, making the comparison inherently unfair.",
    372       "supported": "weak"
    373     },
    374     {
    375       "claim": "Most queries are handled primarily by the small-model and tool stack, with the large LLM operating only on distilled context.",
    376       "evidence": "Described architecturally in Sections III-IV. No quantitative data is given on what fraction of compute or queries is handled by small models vs. the final LLM. The claim is an architectural assertion rather than an empirical finding.",
    377       "supported": "weak"
    378     },
    379     {
    380       "claim": "Removing OCR/diagram/chart parsers drops AI2D/ChartQA by 4-7 points; disabling context compilation costs ~2 points on GPQA-Diamond.",
    381       "evidence": "Mentioned in passing in Section V with no ablation table, no details on experimental setup, and no error bars. These numbers are stated without supporting methodology.",
    382       "supported": "weak"
    383     },
    384     {
    385       "claim": "The system achieves competitive accuracy while shifting the bulk of computation away from the most expensive models.",
    386       "evidence": "No cost data, latency measurements, or compute breakdowns are provided. The claim about compute shifting is entirely unsupported by empirical evidence.",
    387       "supported": "unsupported"
    388     }
    389   ],
    390   "methodology_tags": ["benchmark-eval"],
    391   "key_findings": "Interfaze-Beta, a tool-augmented system combining small task-specific DNNs/SLMs for perception (OCR, ASR, object detection) with context construction and a final LLM, reports competitive scores across nine benchmarks including 90.0% on AIME-2025 and 91.4% on MMLU. The paper's central architectural claim — that small models building structured context for a final LLM can match or beat standalone frontier models — is supported by benchmark scores but undermined by the unfair comparison (tooled system vs. non-tooled baselines), absence of cost data, and lack of statistical rigor. Ablation results are mentioned in passing but not presented in a rigorous table.",
    392   "red_flags": [
    393     {
    394       "flag": "Company evaluating own product",
    395       "detail": "All three authors are JigsawStack employees evaluating Interfaze, a JigsawStack product. No independent evaluation, no competing interests statement, and no acknowledgment of this conflict."
    396     },
    397     {
    398       "flag": "Unfair baseline comparison",
    399       "detail": "Interfaze-Beta (with web search, code sandbox, OCR, ASR, retrieval tools) is compared against standalone LLMs without equivalent tool access. The improvement is attributed to the architecture, but adding equivalent tools to baselines might close the gap. The paper partially acknowledges this but does not test tool-augmented baselines."
    400     },
    401     {
    402       "flag": "No error bars or uncertainty quantification",
    403       "detail": "All nine benchmark results are single point estimates with no confidence intervals, standard deviations, or multi-run results. The AIME-2025 score of 90.0% in particular, measured on a small test set, could have high variance."
    404     },
    405     {
    406       "flag": "Claims far exceed evidence",
    407       "detail": "The title claims 'The Future of AI is built on Task-Specific Small Models' based on benchmark results from one system on nine tasks. No production deployment data, cost analysis, or generalizability evidence supports this broad claim."
    408     },
    409     {
    410       "flag": "Incomplete baseline table",
    411       "detail": "Table II has many blank cells (dashes) — 5 of the 7 baselines are missing ChartQA, AI2D, and Common Voice results. The paper may appear stronger in these areas simply because baseline scores are not reported."
    412     },
    413     {
    414       "flag": "Ablation without rigor",
    415       "detail": "Ablation results are mentioned in a single paragraph without a table, without error bars, and without describing the ablation methodology. These unverifiable numbers could be cherry-picked."
    416     },
    417     {
    418       "flag": "Search/retrieval tool may leak benchmark answers",
    419       "detail": "The system includes web search and retrieval tools. On benchmarks like MMLU and GPQA, the system could retrieve answers from the web during evaluation. This is not discussed or controlled for."
    420     },
    421     {
    422       "flag": "Proprietary training data with no details",
    423       "detail": "Small models are 'trained on a mixture of public and proprietary data' (Section III-B) with no further detail. The proprietary data could contain benchmark-related content, and there is no contamination analysis."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    429       "authors": ["L. Chen", "M. Zaharia", "J. Zou"],
    430       "year": 2023,
    431       "arxiv_id": "2305.05176",
    432       "relevance": "Cost-aware LLM cascading — directly relevant to model routing and cost-efficiency methodology."
    433     },
    434     {
    435       "title": "Hybrid LLM: Cost-Efficient and Quality-Aware Query Routing",
    436       "authors": ["D. Ding", "A. Mallick", "C. Wang"],
    437       "year": 2024,
    438       "relevance": "LLM routing between small local and large cloud models — core topic of the survey's routing coverage."
    439     },
    440     {
    441       "title": "Universal Model Routing for Efficient LLM Inference",
    442       "authors": ["W. Jitkrittum"],
    443       "year": 2025,
    444       "arxiv_id": "2502.08773",
    445       "relevance": "Generalized LLM routing across expert pools — relevant to efficient inference methodology."
    446     },
    447     {
    448       "title": "Toolformer: Language Models Can Teach Themselves to Use Tools",
    449       "authors": ["T. Schick", "J. Dwivedi-Yu", "R. Dessì"],
    450       "year": 2023,
    451       "arxiv_id": "2302.04761",
    452       "relevance": "Foundational work on tool-augmented LLMs, directly relevant to agentic AI capabilities."
    453     },
    454     {
    455       "title": "HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face",
    456       "authors": ["Y. Shen", "K. Song", "X. Tan"],
    457       "year": 2023,
    458       "arxiv_id": "2303.17580",
    459       "relevance": "LLM as planner over specialist model registry — key precedent for multi-model orchestration systems."
    460     },
    461     {
    462       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    463       "authors": ["S. Yao", "J. Zhao", "D. Yu"],
    464       "year": 2022,
    465       "arxiv_id": "2210.03629",
    466       "relevance": "Foundational agentic prompting pattern interleaving reasoning and tool use."
    467     },
    468     {
    469       "title": "Chameleon: Plug-and-Play Compositional Reasoning with Large Language Models",
    470       "authors": ["P. Lu", "B. Peng", "H. Cheng"],
    471       "year": 2023,
    472       "arxiv_id": "2304.09842",
    473       "relevance": "Composable model+tool pipelines — relevant to evaluation of multi-component LLM systems."
    474     },
    475     {
    476       "title": "MMLU-Pro: A More Robust and Challenging Multi-Task Language Understanding Benchmark",
    477       "authors": ["Y. Wang", "X. Ma", "G. Zhang"],
    478       "year": 2024,
    479       "arxiv_id": "2406.01574",
    480       "relevance": "Key benchmark for evaluating LLM knowledge and reasoning capability."
    481     },
    482     {
    483       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    484       "authors": ["N. Jain", "K. Han", "A. Gu"],
    485       "year": 2024,
    486       "arxiv_id": "2403.07974",
    487       "relevance": "Code generation benchmark designed to address contamination — directly relevant to code evaluation methodology."
    488     },
    489     {
    490       "title": "LLM With Tools: A Survey",
    491       "authors": ["Z. Shen"],
    492       "year": 2024,
    493       "arxiv_id": "2409.18807",
    494       "relevance": "Survey of tool-augmented LLM design patterns and failure modes."
    495     },
    496     {
    497       "title": "Small Language Models are the Future of Agentic AI",
    498       "authors": ["P. Belcak", "G. Heinrich", "S. Diao"],
    499       "year": 2025,
    500       "arxiv_id": "2506.02153",
    501       "relevance": "Position paper on small models for agentic workflows — directly relevant to the small model vs. large model debate."
    502     },
    503     {
    504       "title": "Small Models are Valuable Plug-ins for Large Language Models",
    505       "authors": ["C. Xu", "Y. Xu", "S. Wang"],
    506       "year": 2024,
    507       "relevance": "Small models as specialist components in LLM systems — relevant to model composition methodology."
    508     },
    509     {
    510       "title": "Unified Scaling Laws for Routed Language Models",
    511       "authors": ["A. Clark", "D. de las Casas", "A. Guy"],
    512       "year": 2022,
    513       "relevance": "Scaling laws for mixture-of-experts models — relevant to routing and model efficiency research."
    514     }
    515   ]
    516 }

Impressum · Datenschutz