scan-v5.json (28685B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Interfaze: The Future of AI is built on Task-Specific Small Models", 6 "authors": [ 7 "Harsha Vardhan Khurdula", 8 "Vineet Agarwal", 9 "Yoeven D Khemlani" 10 ], 11 "year": 2026, 12 "venue": "IEEE Conference on Artificial Intelligence (CAI)", 13 "arxiv_id": "2602.04101", 14 "doi": null 15 }, 16 "checklist": { 17 "claims_and_evidence": { 18 "abstract_claims_supported": { 19 "applies": true, 20 "answer": false, 21 "justification": "Architecture and benchmark results are supported, but the claim that 'most queries are handled primarily by the small-model and tool stack' is asserted for MMLU only ('In practice, most items resolve on SLM+tool routes') without systematic quantitative evidence across all benchmarks.", 22 "source": "haiku" 23 }, 24 "causal_claims_justified": { 25 "applies": true, 26 "answer": true, 27 "justification": "Ablation studies support causal claims: 'removing OCR/diagram/chart parsers drops AI2D/ChartQA by 4–7 points; disabling context compilation costs ≈2 points on GPQA-Diamond; turning off the optional short reasoning head hurts AIME and MMLU-Pro.' Ablations are appropriate for system papers.", 28 "source": "haiku" 29 }, 30 "generalization_bounded": { 31 "applies": true, 32 "answer": false, 33 "justification": "Title 'The Future of AI is built on Task-Specific Small Models' makes sweeping generalization claims unsupported by evidence limited to 8 standard benchmarks. Actual findings apply to specific benchmark tasks, not 'the future of AI.'", 34 "source": "haiku" 35 }, 36 "alternative_explanations_discussed": { 37 "applies": true, 38 "answer": false, 39 "justification": "No discussion of alternative explanations: Why do improvements occur? Could better prompting explain gains? Could benchmark-specific biases favor this architecture? Only ablations show what components matter, not why.", 40 "source": "haiku" 41 }, 42 "proxy_outcome_distinction": { 43 "applies": true, 44 "answer": false, 45 "justification": "Paper conflates benchmark scores with 'the future of AI' and claims about real-world system effectiveness. No distinction between measured outcomes (benchmark accuracy) and claimed broader impact.", 46 "source": "haiku" 47 } 48 }, 49 "limitations_and_scope": { 50 "limitations_section_present": { 51 "applies": true, 52 "answer": true, 53 "justification": "Section VI 'LIMITATIONS AND FUTURE WORK' present. Identifies two specific pain points: 'delay stems from context fan out from SLMs...plus cold starts' and 'over-building happens when the controller invokes more tools or retrieval passes than are needed.'", 54 "source": "haiku" 55 }, 56 "threats_to_validity_specific": { 57 "applies": true, 58 "answer": true, 59 "justification": "Specific threats identified: latency from context fan-out and cold starts, over-building of context. However, major threats unaddressed: no failure case analysis, no discussion of which components benefit which benchmarks, potential benchmark contamination not discussed.", 60 "source": "haiku" 61 }, 62 "scope_boundaries_stated": { 63 "applies": true, 64 "answer": false, 65 "justification": "Paper states 'All results for Interfaze-Beta use the same tool-orchestrated stack with the OCR/ASR, retrieval, chart/diagram, and sandbox tools enabled' and evaluates on specific benchmarks, but title 'The Future of AI' vastly exceeds the actual scope of evidence.", 66 "source": "haiku" 67 } 68 }, 69 "conflicts_of_interest": { 70 "funding_disclosed": { 71 "applies": true, 72 "answer": false, 73 "justification": "No funding source disclosed. Work is conducted by JigsawStack, Inc., suggesting internal funding, but this is not explicitly stated.", 74 "source": "haiku" 75 }, 76 "affiliations_disclosed": { 77 "applies": true, 78 "answer": true, 79 "justification": "All three authors listed as 'JigsawStack, Inc.' with specific locations disclosed. Affiliation with the company creating the evaluated product is stated.", 80 "source": "haiku" 81 }, 82 "funder_independent_of_outcome": { 83 "applies": true, 84 "answer": false, 85 "justification": "Authors are employees of JigsawStack evaluating JigsawStack's product (Interfaze-Beta). Funder is entirely dependent on positive outcomes for the product.", 86 "source": "haiku" 87 }, 88 "financial_interests_declared": { 89 "applies": true, 90 "answer": false, 91 "justification": "No competing interests statement provided. Company publication of company product is itself a financial interest, but not formally declared or discussed.", 92 "source": "haiku" 93 } 94 }, 95 "scope_and_framing": { 96 "key_terms_defined": { 97 "applies": true, 98 "answer": true, 99 "justification": "Key terms defined: 'context' as structured merged state (Section III-C), 'tool chain' as sequence of tool invocations, 'small models' for perception/classification, 'controller' as selection mechanism. Definitions are reasonably precise.", 100 "source": "haiku" 101 }, 102 "intended_contribution_clear": { 103 "applies": true, 104 "answer": true, 105 "justification": "Paper lists four clear contributions: (1) context-centric system architecture, (2) concrete instantiation Interfaze-Beta, (3) empirical study on benchmarks, (4) analysis of limitations. Contribution types explicitly stated.", 106 "source": "haiku" 107 }, 108 "engagement_with_prior_work": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section II surveys context importance, routing, tools, and small models, positioning Interfaze as treating small DNNs as first-class rather than opaque tools. Differentiates from routing-only and tool-calling paradigms.", 112 "source": "haiku" 113 } 114 } 115 }, 116 "type_checklist": { 117 "empirical": { 118 "artifacts": { 119 "code_released": { 120 "applies": true, 121 "answer": false, 122 "justification": "Interfaze-Beta is proprietary to JigsawStack with no source code released. No GitHub, no 'available upon request' statement; completely closed.", 123 "source": "haiku" 124 }, 125 "data_released": { 126 "applies": true, 127 "answer": true, 128 "justification": "Evaluations use standard public benchmarks (MMLU, GPQA-Diamond, AIME, LiveCodeBench, MMMU, AI2D, ChartQA, Common Voice). These are publicly available.", 129 "source": "haiku" 130 }, 131 "environment_specified": { 132 "applies": true, 133 "answer": false, 134 "justification": "Architecture described at conceptual level but no reproducibility details: no Dockerfile, requirements.txt, hardware specs, or inference framework specified. 'Task-specific models trained in-house' without versions or specifications.", 135 "source": "haiku" 136 }, 137 "reproduction_instructions": { 138 "applies": true, 139 "answer": false, 140 "justification": "No step-by-step reproduction instructions. This is a proprietary system from a company; no path to reproduce results is provided.", 141 "source": "haiku" 142 } 143 }, 144 "statistical_methodology": { 145 "confidence_intervals_or_error_bars": { 146 "applies": true, 147 "answer": false, 148 "justification": "Table II reports only point estimates (e.g., '83.6', '91.4') with no error bars, confidence intervals, or variance measures across any benchmarks.", 149 "source": "haiku" 150 }, 151 "significance_tests": { 152 "applies": true, 153 "answer": false, 154 "justification": "No statistical significance testing reported. Improvements reported as point differences (e.g., '+3.0 on MMLU-Pro') without p-values or significance thresholds.", 155 "source": "haiku" 156 }, 157 "effect_sizes_reported": { 158 "applies": true, 159 "answer": false, 160 "justification": "Absolute differences reported (e.g., '+3.0 on MMLU-Pro'), allowing manual effect size calculation, but effect sizes not formally reported with baseline context or normalized metrics.", 161 "source": "haiku" 162 }, 163 "sample_size_justified": { 164 "applies": true, 165 "answer": false, 166 "justification": "Benchmarks use standard sample sizes (MMLU ~1K items) but no justification provided for whether these are sufficient for the claims, and no power analysis mentioned.", 167 "source": "haiku" 168 }, 169 "variance_reported": { 170 "applies": true, 171 "answer": false, 172 "justification": "No variance, standard deviation, or multiple runs reported. Each benchmark reports a single accuracy number with no spread.", 173 "source": "haiku" 174 } 175 }, 176 "evaluation_design": { 177 "baselines_included": { 178 "applies": true, 179 "answer": true, 180 "justification": "Table II compares Interfaze-Beta against 7 baseline models: GPT-4.1, GPT-5, Claude Sonnet 4, Gemini 2.5 Flash, Claude Sonnet 4 (Thinking), Claude Opus 4 (Thinking), Gemini 2.5 Pro.", 181 "source": "haiku" 182 }, 183 "baselines_contemporary": { 184 "applies": true, 185 "answer": true, 186 "justification": "Baselines are from late 2024 - early 2026 (GPT-4.1, Gemini 2.5, Claude Opus 4, GPT-5). Contemporary with the paper's 2026 publication. Some results missing ('—') but selection is current.", 187 "source": "haiku" 188 }, 189 "ablation_study": { 190 "applies": true, 191 "answer": true, 192 "justification": "Ablations mentioned but not systematically presented: 'removing OCR/diagram/chart parsers drops AI2D/ChartQA by 4–7 points; disabling context compilation costs ≈2 points; turning off the optional short reasoning head hurts AIME and MMLU-Pro.' Presence but minimal detail.", 193 "source": "haiku" 194 }, 195 "multiple_metrics": { 196 "applies": true, 197 "answer": false, 198 "justification": "Eight different benchmarks evaluated (MMLU-Pro, MMLU, GPQA, AIME, LiveCodeBench, MMMU, AI2D, ChartQA, Common Voice), but each reports only a single accuracy metric with no per-benchmark sub-metrics.", 199 "source": "haiku" 200 }, 201 "human_evaluation": { 202 "applies": false, 203 "answer": false, 204 "justification": "N/A: System evaluation on automated benchmarks, not generating text for human evaluation.", 205 "source": "haiku" 206 }, 207 "held_out_test_set": { 208 "applies": true, 209 "answer": true, 210 "justification": "Standard benchmarks include official train/test splits. Results reported on standard test sets (MMLU test, GPQA diamond-hard, etc.).", 211 "source": "haiku" 212 }, 213 "per_category_breakdown": { 214 "applies": true, 215 "answer": false, 216 "justification": "Table II reports aggregate scores only. Section V includes high-level per-domain notes ('On knowledge and general reasoning...') but no systematic per-category breakdowns or confusion matrices.", 217 "source": "haiku" 218 }, 219 "failure_cases_discussed": { 220 "applies": true, 221 "answer": false, 222 "justification": "No systematic discussion of failure modes. Paper identifies weaker performance on LiveCodeBench (57.77) but doesn't analyze why or show example failures.", 223 "source": "haiku" 224 }, 225 "negative_results_reported": { 226 "applies": true, 227 "answer": false, 228 "justification": "Weaker results mentioned in passing ('trails...by 7.73', 'within 3.09 of Gemini') but not presented as systematic negative results or analyzed in depth.", 229 "source": "haiku" 230 } 231 }, 232 "setup_transparency": { 233 "model_versions_specified": { 234 "applies": true, 235 "answer": false, 236 "justification": "Baseline models specified (GPT-4.1, Gemini 2.5 Pro, etc.) but critically, the final LLM used by Interfaze-Beta is never identified—only 'user-selected' and 'fixed by deployment configuration.' Small model versions and training dates completely unspecified.", 237 "source": "haiku" 238 }, 239 "prompts_provided": { 240 "applies": true, 241 "answer": false, 242 "justification": "No actual prompts, system instructions, or prompt templates provided. Architecture described conceptually but not the exact text given to models.", 243 "source": "haiku" 244 }, 245 "hyperparameters_reported": { 246 "applies": true, 247 "answer": false, 248 "justification": "No hyperparameters reported: temperature, top-p, beam size, quality thresholds, cost proxies, or any LLM inference settings mentioned.", 249 "source": "haiku" 250 }, 251 "scaffolding_described": { 252 "applies": true, 253 "answer": true, 254 "justification": "Agentic scaffolding detailed: controller trained on offline tuples, selects tool chains, minimizes cost/latency proxy, uses quality thresholds, includes fallback mechanism. Architecture-level detail provided.", 255 "source": "haiku" 256 }, 257 "data_preprocessing_documented": { 258 "applies": true, 259 "answer": true, 260 "justification": "Preprocessing documented: OCR pipeline includes STFT, mel-filterbank, detector/recognizer cascade, reading-order graph construction. ASR includes voice activity detection, diarization. Retrieval includes crawling, indexing, parsing.", 261 "source": "haiku" 262 } 263 }, 264 "data_integrity": { 265 "raw_data_available": { 266 "applies": true, 267 "answer": true, 268 "justification": "Standard public benchmarks used (MMLU, GPQA, AIME, LiveCodeBench, MMMU, AI2D, ChartQA, Common Voice) are publicly available for independent verification.", 269 "source": "haiku" 270 }, 271 "data_collection_described": { 272 "applies": true, 273 "answer": true, 274 "justification": "No new data collected; evaluation uses published benchmarks with documented collection procedures (MMLU, GPQA, etc.). Standard benchmark protocols followed.", 275 "source": "haiku" 276 }, 277 "recruitment_methods_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "N/A: Evaluation on standard benchmarks without human recruitment or participant collection.", 281 "source": "haiku" 282 }, 283 "data_pipeline_documented": { 284 "applies": true, 285 "answer": true, 286 "justification": "Pipeline for small models documented: OCR (detection→recognition→reading order), ASR (STFT→encoder→decoder→diarization), object detection. Benchmark evaluation uses standard protocols.", 287 "source": "haiku" 288 } 289 }, 290 "contamination": { 291 "training_cutoff_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "No training cutoff stated for any model. Small models trained 'in-house on a mixture of public and proprietary data' without dates. Final LLM not even identified. Critical omission for 2026 evaluation.", 295 "source": "haiku" 296 }, 297 "train_test_overlap_discussed": { 298 "applies": true, 299 "answer": false, 300 "justification": "No discussion of train/test overlap risk. Evaluating on well-known benchmarks (MMLU 2021, GPQA 2023, AIME established) with unspecified model training dates is a contamination risk not addressed.", 301 "source": "haiku" 302 }, 303 "benchmark_contamination_addressed": { 304 "applies": true, 305 "answer": false, 306 "justification": "Benchmark contamination not addressed. Models likely trained on internet text including these benchmarks, but no analysis, decontamination, or discussion of this risk provided.", 307 "source": "haiku" 308 } 309 }, 310 "human_studies": { 311 "pre_registered": { 312 "applies": false, 313 "answer": false, 314 "justification": "N/A: No human participants.", 315 "source": "haiku" 316 }, 317 "irb_or_ethics_approval": { 318 "applies": false, 319 "answer": false, 320 "justification": "N/A: No human participants.", 321 "source": "haiku" 322 }, 323 "demographics_reported": { 324 "applies": false, 325 "answer": false, 326 "justification": "N/A: No human participants.", 327 "source": "haiku" 328 }, 329 "inclusion_exclusion_criteria": { 330 "applies": false, 331 "answer": false, 332 "justification": "N/A: No human participants.", 333 "source": "haiku" 334 }, 335 "randomization_described": { 336 "applies": false, 337 "answer": false, 338 "justification": "N/A: No human participants.", 339 "source": "haiku" 340 }, 341 "blinding_described": { 342 "applies": false, 343 "answer": false, 344 "justification": "N/A: No human participants.", 345 "source": "haiku" 346 }, 347 "attrition_reported": { 348 "applies": false, 349 "answer": false, 350 "justification": "N/A: No human participants.", 351 "source": "haiku" 352 } 353 }, 354 "cost_and_practicality": { 355 "inference_cost_reported": { 356 "applies": true, 357 "answer": false, 358 "justification": "Cost discussed conceptually ('most compute is spent in small models') but no numbers provided: no cost per query, latency measurements, or computational budget quantified.", 359 "source": "haiku" 360 }, 361 "compute_budget_stated": { 362 "applies": true, 363 "answer": false, 364 "justification": "No compute budget for training or evaluation provided. Cost modeling mentioned ('approximately minimizing a proxy for small-model cost') but no actual costs reported.", 365 "source": "haiku" 366 } 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "Treating LLM applications as context-construction and action problems rather than monolithic model selection yields competitive or superior benchmark performance", 373 "evidence": "Table II shows Interfaze-Beta competitive across 8 benchmarks with macro-average +13.53pp gain over GPT-4.1 (median +5.61pp); particularly strong on visual-numerical tasks (AI2D 91.51%, ChartQA 90.88%, AIME-2025 90.0%)", 374 "supported": "strong" 375 }, 376 { 377 "claim": "Most queries can be resolved by the small-model and tool stack with large LLMs operating only on distilled context", 378 "evidence": "Asserted in Section V.A: 'In practice, most items resolve on SLM+tool routes' for MMLU specifically, but quantitative evidence across all benchmarks not provided", 379 "supported": "moderate" 380 }, 381 { 382 "claim": "Structured context compilation from specialized perception models (OCR, ASR, object detection) reduces hallucination on visual and numerical reasoning tasks", 383 "evidence": "Strong performance on visual-numerical benchmarks; Section V states 'Structured OCR text, bounding boxes, chart axes, and object relations merged into compact prompts that reduce hallucination,' but no direct evidence of hallucination reduction shown", 384 "supported": "moderate" 385 }, 386 { 387 "claim": "OCR/diagram/chart parsing, context compilation, and reasoning scaffolding are the primary performance drivers", 388 "evidence": "Ablation studies: removing OCR/diagram/chart drops AI2D/ChartQA by 4–7pp; disabling context compilation costs 2pp on GPQA-Diamond; disabling reasoning head hurts AIME/MMLU-Pro", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "Specialized perception modules can handle perception and classification work across modalities, leaving only high-level reasoning to large LLMs", 393 "evidence": "Architecture description in Sections III-V detailing OCR, ASR, object detection, diarization; but no systematic analysis of which modalities benefit vs. hurt from this decomposition", 394 "supported": "moderate" 395 }, 396 { 397 "claim": "Multilingual ASR with diarization yields strong multilingual speech understanding performance", 398 "evidence": "Common Voice v16 score of 90.8, but no per-language breakdown, error analysis, or comparison to baselines on same benchmark", 399 "supported": "moderate" 400 } 401 ], 402 "methodology_tags": [ 403 "benchmark-eval", 404 "observational" 405 ], 406 "key_findings": "Interfaze-Beta presents a context-centric system architecture decomposing LLM applications into perception (OCR, ASR, object detection), context construction (retrieval, indexing, schema-based compilation), and reasoning layers. Evaluation across 8 standard benchmarks (MMLU, GPQA, AIME, LiveCodeBench, MMMU, AI2D, ChartQA, Common Voice) demonstrates competitive or superior performance compared to frontier models, with particular strength on visual-numerical tasks (91.51% AI2D, 90.88% ChartQA, 90.0% AIME) where structured context compilation from specialized models outperforms end-to-end approaches. Ablation studies confirm OCR/diagram/chart parsing, context compilation, and reasoning scaffolding as primary performance drivers, with claimed evidence that most queries resolve through small-model and tool chains rather than large LLM inference.", 407 "red_flags": [ 408 { 409 "flag": "Unspecified final LLM", 410 "detail": "Critical gap: The final LLM used in Interfaze-Beta evaluation is never identified. Paper states it is 'user-selected' and 'fixed by deployment configuration' but which model (Claude, GPT, Gemini, proprietary) is actually used cannot be determined. This makes results non-reproducible and comparisons non-transparent." 411 }, 412 { 413 "flag": "No statistical variance or error bars", 414 "detail": "All results reported as single point estimates (Table II) with no confidence intervals, standard deviations, or multiple runs. Differences of 1-3% between systems could be noise; statistical significance completely unknown." 415 }, 416 { 417 "flag": "Small-model components trained on proprietary data with no version control", 418 "detail": "OCR, ASR, diagram parsing models are 'trained in-house on a mixture of public and proprietary data' with no model versions, release dates, or reproducibility information. Black-box components undermine claims about system transparency." 419 }, 420 { 421 "flag": "Benchmark contamination not addressed", 422 "detail": "No discussion of model training data cutoff dates relative to benchmark release dates. For a 2026 paper evaluating on 2021-2024 benchmarks with unspecified model training data, contamination risk is severe and unaddressed." 423 }, 424 { 425 "flag": "Overstated title and framing", 426 "detail": "Title 'The Future of AI is built on Task-Specific Small Models' makes sweeping claims far exceeding the evidence, which is limited to 8 standard benchmarks with a single proprietary system." 427 }, 428 { 429 "flag": "Ablations presented informally without error bars", 430 "detail": "Ablation results mentioned in passing ('removing OCR/diagram/chart parsers drops AI2D/ChartQA by 4–7 points') rather than systematically presented in a table. No indication of whether differences are within noise." 431 }, 432 { 433 "flag": "Conflict of interest not disclosed", 434 "detail": "All three authors from JigsawStack evaluating JigsawStack's Interfaze-Beta product. Affiliation disclosed but conflict-of-interest implications not discussed. No independent evaluation." 435 }, 436 { 437 "flag": "No failure case analysis or breakdown", 438 "detail": "No examples of queries/benchmark items where Interfaze fails, no per-question analysis of weak points, no systematic investigation of when the architecture underperforms or struggles." 439 }, 440 { 441 "flag": "Missing hyperparameters and prompt details", 442 "detail": "No temperature, top-p, beam size, quality thresholds, or other inference configuration reported. No actual prompts or system instructions provided. Setup not reproducible at the implementation level." 443 }, 444 { 445 "flag": "Proprietary code and data pipeline", 446 "detail": "All small model components proprietary. No code released. No requirements.txt, no Dockerfile, no environment specification. Stated as system description but not reproducible research." 447 } 448 ], 449 "cited_papers": [ 450 { 451 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 452 "relevance": "Directly related work on cost-aware cascading and routing of LLMs based on query difficulty and cost-benefit analysis" 453 }, 454 { 455 "title": "Hybrid LLM: Cost-Efficient and Quality-Aware Query Routing", 456 "relevance": "Related routing approach selecting between small and large models based on predicted query difficulty" 457 }, 458 { 459 "title": "Unified Scaling Laws for Routed Language Models", 460 "relevance": "Foundational work on routing and mixture-of-experts approaches for language models" 461 }, 462 { 463 "title": "Toolformer: Language Models Can Teach Themselves to Use Tools", 464 "relevance": "Core related work on tool-augmented LLMs and learning when to invoke external functions" 465 }, 466 { 467 "title": "HuggingGPT: Solving AI Tasks with ChatGPT and its Friends on Hugging Face", 468 "relevance": "LLM as orchestrator over specialist models—directly related architecture pattern" 469 }, 470 { 471 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 472 "relevance": "Reasoning and action prompting patterns underlying agentic scaffolding" 473 }, 474 { 475 "title": "Small Models are Valuable Plug-ins for Large Language Models", 476 "relevance": "Core thesis: small models as effective specialists complementing large LLMs" 477 }, 478 { 479 "title": "Small Language Models are the Future of Agentic AI", 480 "relevance": "Directly supports the paper's position on small language models in agentic systems" 481 } 482 ], 483 "engagement_factors": { 484 "practical_relevance": { 485 "score": 1, 486 "justification": "Proprietary JigsawStack system with no released code, weights, or open API access. Architects can learn from the design but practitioners cannot use or reproduce it." 487 }, 488 "surprise_contrarian": { 489 "score": 2, 490 "justification": "Contradicts 'bigger monolithic LLM is better' narrative, but aligns with known practice that production systems decompose into perception, retrieval, and reasoning. Insight is architectural rather than fundamental." 491 }, 492 "fear_safety": { 493 "score": 0, 494 "justification": "No discussion of AI safety, alignment, adversarial robustness, interpretability, or risk concerns. Purely focused on benchmark performance." 495 }, 496 "drama_conflict": { 497 "score": 1, 498 "justification": "Company evaluates its own product (conflict of interest potential), but presented straightforwardly without sensationalism or explicit controversy." 499 }, 500 "demo_ability": { 501 "score": 0, 502 "justification": "Proprietary system with no public demo, no code release, no public API access. Readers cannot try the system despite the architecture being web/API based." 503 }, 504 "brand_recognition": { 505 "score": 0, 506 "justification": "JigsawStack is a startup without major brand recognition. Authors not prominent researchers in AI. Limited institutional prestige." 507 } 508 }, 509 "hn_data": { 510 "threads": [ 511 { 512 "hn_id": "46925536", 513 "title": "Learning to Reason in 13 Parameters", 514 "points": 3, 515 "comments": 0, 516 "url": "https://news.ycombinator.com/item?id=46925536", 517 "created_at": "2026-02-07T17:16:58Z" 518 }, 519 { 520 "hn_id": "47002162", 521 "title": "Learning to Reason in 13 Parameters", 522 "points": 2, 523 "comments": 0, 524 "url": "https://news.ycombinator.com/item?id=47002162", 525 "created_at": "2026-02-13T12:52:41Z" 526 }, 527 { 528 "hn_id": "47027127", 529 "title": "Multi-Agent Teams Hold Experts Back", 530 "points": 1, 531 "comments": 0, 532 "url": "https://news.ycombinator.com/item?id=47027127", 533 "created_at": "2026-02-15T20:18:24Z" 534 } 535 ], 536 "top_points": 3, 537 "total_points": 6, 538 "total_comments": 0 539 } 540 }