scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28685B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Interfaze: The Future of AI is built on Task-Specific Small Models",
      6     "authors": [
      7       "Harsha Vardhan Khurdula",
      8       "Vineet Agarwal",
      9       "Yoeven D Khemlani"
     10     ],
     11     "year": 2026,
     12     "venue": "IEEE Conference on Artificial Intelligence (CAI)",
     13     "arxiv_id": "2602.04101",
     14     "doi": null
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": false,
     21         "justification": "Architecture and benchmark results are supported, but the claim that 'most queries are handled primarily by the small-model and tool stack' is asserted for MMLU only ('In practice, most items resolve on SLM+tool routes') without systematic quantitative evidence across all benchmarks.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Ablation studies support causal claims: 'removing OCR/diagram/chart parsers drops AI2D/ChartQA by 4–7 points; disabling context compilation costs ≈2 points on GPQA-Diamond; turning off the optional short reasoning head hurts AIME and MMLU-Pro.' Ablations are appropriate for system papers.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Title 'The Future of AI is built on Task-Specific Small Models' makes sweeping generalization claims unsupported by evidence limited to 8 standard benchmarks. Actual findings apply to specific benchmark tasks, not 'the future of AI.'",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No discussion of alternative explanations: Why do improvements occur? Could better prompting explain gains? Could benchmark-specific biases favor this architecture? Only ablations show what components matter, not why.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Paper conflates benchmark scores with 'the future of AI' and claims about real-world system effectiveness. No distinction between measured outcomes (benchmark accuracy) and claimed broader impact.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section VI 'LIMITATIONS AND FUTURE WORK' present. Identifies two specific pain points: 'delay stems from context fan out from SLMs...plus cold starts' and 'over-building happens when the controller invokes more tools or retrieval passes than are needed.'",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Specific threats identified: latency from context fan-out and cold starts, over-building of context. However, major threats unaddressed: no failure case analysis, no discussion of which components benefit which benchmarks, potential benchmark contamination not discussed.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Paper states 'All results for Interfaze-Beta use the same tool-orchestrated stack with the OCR/ASR, retrieval, chart/diagram, and sandbox tools enabled' and evaluates on specific benchmarks, but title 'The Future of AI' vastly exceeds the actual scope of evidence.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding source disclosed. Work is conducted by JigsawStack, Inc., suggesting internal funding, but this is not explicitly stated.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors listed as 'JigsawStack, Inc.' with specific locations disclosed. Affiliation with the company creating the evaluated product is stated.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "Authors are employees of JigsawStack evaluating JigsawStack's product (Interfaze-Beta). Funder is entirely dependent on positive outcomes for the product.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement provided. Company publication of company product is itself a financial interest, but not formally declared or discussed.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms defined: 'context' as structured merged state (Section III-C), 'tool chain' as sequence of tool invocations, 'small models' for perception/classification, 'controller' as selection mechanism. Definitions are reasonably precise.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Paper lists four clear contributions: (1) context-centric system architecture, (2) concrete instantiation Interfaze-Beta, (3) empirical study on benchmarks, (4) analysis of limitations. Contribution types explicitly stated.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section II surveys context importance, routing, tools, and small models, positioning Interfaze as treating small DNNs as first-class rather than opaque tools. Differentiates from routing-only and tool-calling paradigms.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": false,
    122           "justification": "Interfaze-Beta is proprietary to JigsawStack with no source code released. No GitHub, no 'available upon request' statement; completely closed.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "Evaluations use standard public benchmarks (MMLU, GPQA-Diamond, AIME, LiveCodeBench, MMMU, AI2D, ChartQA, Common Voice). These are publicly available.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "Architecture described at conceptual level but no reproducibility details: no Dockerfile, requirements.txt, hardware specs, or inference framework specified. 'Task-specific models trained in-house' without versions or specifications.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No step-by-step reproduction instructions. This is a proprietary system from a company; no path to reproduce results is provided.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "Table II reports only point estimates (e.g., '83.6', '91.4') with no error bars, confidence intervals, or variance measures across any benchmarks.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No statistical significance testing reported. Improvements reported as point differences (e.g., '+3.0 on MMLU-Pro') without p-values or significance thresholds.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "Absolute differences reported (e.g., '+3.0 on MMLU-Pro'), allowing manual effect size calculation, but effect sizes not formally reported with baseline context or normalized metrics.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "Benchmarks use standard sample sizes (MMLU ~1K items) but no justification provided for whether these are sufficient for the claims, and no power analysis mentioned.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No variance, standard deviation, or multiple runs reported. Each benchmark reports a single accuracy number with no spread.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Table II compares Interfaze-Beta against 7 baseline models: GPT-4.1, GPT-5, Claude Sonnet 4, Gemini 2.5 Flash, Claude Sonnet 4 (Thinking), Claude Opus 4 (Thinking), Gemini 2.5 Pro.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Baselines are from late 2024 - early 2026 (GPT-4.1, Gemini 2.5, Claude Opus 4, GPT-5). Contemporary with the paper's 2026 publication. Some results missing ('—') but selection is current.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Ablations mentioned but not systematically presented: 'removing OCR/diagram/chart parsers drops AI2D/ChartQA by 4–7 points; disabling context compilation costs ≈2 points; turning off the optional short reasoning head hurts AIME and MMLU-Pro.' Presence but minimal detail.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": false,
    198           "justification": "Nine benchmarks evaluated (MMLU-Pro, MMLU, GPQA, AIME, LiveCodeBench, MMMU, AI2D, ChartQA, Common Voice), but each reports only a single accuracy metric with no per-benchmark sub-metrics.",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": false,
    203           "answer": false,
    204           "justification": "N/A: System evaluation on automated benchmarks, not generating text for human evaluation.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "Standard benchmarks include official train/test splits. Results reported on standard test sets (MMLU test, GPQA-Diamond, etc.).",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": false,
    216           "justification": "Table II reports aggregate scores only. Section V includes high-level per-domain notes ('On knowledge and general reasoning...') but no systematic per-category breakdowns or confusion matrices.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": false,
    222           "justification": "No systematic discussion of failure modes. Paper identifies weaker performance on LiveCodeBench (57.77) but doesn't analyze why or show example failures.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": false,
    228           "justification": "Weaker results mentioned in passing ('trails...by 7.73', 'within 3.09 of Gemini') but not presented as systematic negative results or analyzed in depth.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": false,
    236           "justification": "Baseline models specified (GPT-4.1, Gemini 2.5 Pro, etc.) but critically, the final LLM used by Interfaze-Beta is never identified—only 'user-selected' and 'fixed by deployment configuration.' Small model versions and training dates completely unspecified.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": false,
    242           "justification": "No actual prompts, system instructions, or prompt templates provided. Architecture described conceptually but not the exact text given to models.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "No hyperparameters reported: temperature, top-p, beam size, quality thresholds, cost proxies, or any LLM inference settings mentioned.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "Agentic scaffolding detailed: controller trained on offline tuples, selects tool chains, minimizes cost/latency proxy, uses quality thresholds, includes fallback mechanism. Architecture-level detail provided.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Preprocessing documented: OCR pipeline includes detector/recognizer cascade, reading-order graph construction. ASR includes STFT, mel-filterbank, voice activity detection, diarization. Retrieval includes crawling, indexing, parsing.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "Standard public benchmarks used (MMLU, GPQA, AIME, LiveCodeBench, MMMU, AI2D, ChartQA, Common Voice) are publicly available for independent verification.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "No new data collected; evaluation uses published benchmarks with documented collection procedures (MMLU, GPQA, etc.). Standard benchmark protocols followed.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "N/A: Evaluation on standard benchmarks without human recruitment or participant collection.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "Pipeline for small models documented: OCR (detection→recognition→reading order), ASR (STFT→encoder→decoder→diarization), object detection. Benchmark evaluation uses standard protocols.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "No training cutoff stated for any model. Small models trained 'in-house on a mixture of public and proprietary data' without dates. Final LLM not even identified. Critical omission for 2026 evaluation.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "No discussion of train/test overlap risk. Evaluating on well-known benchmarks (MMLU 2021, GPQA 2023, AIME established) with unspecified model training dates is a contamination risk not addressed.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "Benchmark contamination not addressed. Models likely trained on internet text including these benchmarks, but no analysis, decontamination, or discussion of this risk provided.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "N/A: No human participants.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "N/A: No human participants.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "N/A: No human participants.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "N/A: No human participants.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "N/A: No human participants.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "N/A: No human participants.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "N/A: No human participants.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "Cost discussed conceptually ('most compute is spent in small models') but no numbers provided: no cost per query, latency measurements, or computational budget quantified.",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "No compute budget for training or evaluation provided. Cost modeling mentioned ('approximately minimizing a proxy for small-model cost') but no actual costs reported.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Treating LLM applications as context-construction and action problems rather than monolithic model selection yields competitive or superior benchmark performance",
    373       "evidence": "Table II shows Interfaze-Beta competitive across 8 benchmarks with macro-average +13.53pp gain over GPT-4.1 (median +5.61pp); particularly strong on visual-numerical tasks (AI2D 91.51%, ChartQA 90.88%, AIME-2025 90.0%)",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Most queries can be resolved by the small-model and tool stack with large LLMs operating only on distilled context",
    378       "evidence": "Asserted in Section V.A: 'In practice, most items resolve on SLM+tool routes' for MMLU specifically, but quantitative evidence across all benchmarks not provided",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Structured context compilation from specialized perception models (OCR, ASR, object detection) reduces hallucination on visual and numerical reasoning tasks",
    383       "evidence": "Strong performance on visual-numerical benchmarks; Section V states 'Structured OCR text, bounding boxes, chart axes, and object relations merged into compact prompts that reduce hallucination,' but no direct evidence of hallucination reduction shown",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "OCR/diagram/chart parsing, context compilation, and reasoning scaffolding are the primary performance drivers",
    388       "evidence": "Ablation studies: removing OCR/diagram/chart drops AI2D/ChartQA by 4–7pp; disabling context compilation costs 2pp on GPQA-Diamond; disabling reasoning head hurts AIME/MMLU-Pro",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Specialized perception modules can handle perception and classification work across modalities, leaving only high-level reasoning to large LLMs",
    393       "evidence": "Architecture description in Sections III-V detailing OCR, ASR, object detection, diarization; but no systematic analysis of which modalities benefit vs. hurt from this decomposition",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "Multilingual ASR with diarization yields strong multilingual speech understanding performance",
    398       "evidence": "Common Voice v16 score of 90.8, but no per-language breakdown, error analysis, or comparison to baselines on same benchmark",
    399       "supported": "moderate"
    400     }
    401   ],
    402   "methodology_tags": [
    403     "benchmark-eval",
    404     "observational"
    405   ],
    406   "key_findings": "Interfaze-Beta presents a context-centric system architecture decomposing LLM applications into perception (OCR, ASR, object detection), context construction (retrieval, indexing, schema-based compilation), and reasoning layers. Evaluation across 8 standard benchmarks (MMLU, GPQA, AIME, LiveCodeBench, MMMU, AI2D, ChartQA, Common Voice) demonstrates competitive or superior performance compared to frontier models, with particular strength on visual-numerical tasks (91.51% AI2D, 90.88% ChartQA, 90.0% AIME) where structured context compilation from specialized models outperforms end-to-end approaches. Ablation studies confirm OCR/diagram/chart parsing, context compilation, and reasoning scaffolding as primary performance drivers, with claimed evidence that most queries resolve through small-model and tool chains rather than large LLM inference.",
    407   "red_flags": [
    408     {
    409       "flag": "Unspecified final LLM",
    410       "detail": "Critical gap: The final LLM used in Interfaze-Beta evaluation is never identified. Paper states it is 'user-selected' and 'fixed by deployment configuration' but which model (Claude, GPT, Gemini, proprietary) is actually used cannot be determined. This makes results non-reproducible and comparisons non-transparent."
    411     },
    412     {
    413       "flag": "No statistical variance or error bars",
    414       "detail": "All results reported as single point estimates (Table II) with no confidence intervals, standard deviations, or multiple runs. Differences of 1-3% between systems could be noise; statistical significance completely unknown."
    415     },
    416     {
    417       "flag": "Small-model components trained on proprietary data with no version control",
    418       "detail": "OCR, ASR, diagram parsing models are 'trained in-house on a mixture of public and proprietary data' with no model versions, release dates, or reproducibility information. Black-box components undermine claims about system transparency."
    419     },
    420     {
    421       "flag": "Benchmark contamination not addressed",
    422       "detail": "No discussion of model training data cutoff dates relative to benchmark release dates. For a 2026 paper evaluating on 2021-2024 benchmarks with unspecified model training data, contamination risk is severe and unaddressed."
    423     },
    424     {
    425       "flag": "Overstated title and framing",
    426       "detail": "Title 'The Future of AI is built on Task-Specific Small Models' makes sweeping claims far exceeding the evidence, which is limited to 8 standard benchmarks with a single proprietary system."
    427     },
    428     {
    429       "flag": "Ablations presented informally without error bars",
    430       "detail": "Ablation results mentioned in passing ('removing OCR/diagram/chart parsers drops AI2D/ChartQA by 4–7 points') rather than systematically presented in a table. No indication of whether differences are within noise."
    431     },
    432     {
    433       "flag": "Conflict of interest not disclosed",
    434       "detail": "All three authors from JigsawStack evaluating JigsawStack's Interfaze-Beta product. Affiliation disclosed but conflict-of-interest implications not discussed. No independent evaluation."
    435     },
    436     {
    437       "flag": "No failure case analysis or breakdown",
    438       "detail": "No examples of queries/benchmark items where Interfaze fails, no per-question analysis of weak points, no systematic investigation of when the architecture underperforms or struggles."
    439     },
    440     {
    441       "flag": "Missing hyperparameters and prompt details",
    442       "detail": "No temperature, top-p, beam size, quality thresholds, or other inference configuration reported. No actual prompts or system instructions provided. Setup not reproducible at the implementation level."
    443     },
    444     {
    445       "flag": "Proprietary code and data pipeline",
    446       "detail": "All small model components proprietary. No code released. No requirements.txt, no Dockerfile, no environment specification. Presented as a system description rather than as reproducible research."
    447     }
    448   ],
    449   "cited_papers": [
    450     {
    451       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    452       "relevance": "Directly related work on cost-aware cascading and routing of LLMs based on query difficulty and cost-benefit analysis"
    453     },
    454     {
    455       "title": "Hybrid LLM: Cost-Efficient and Quality-Aware Query Routing",
    456       "relevance": "Related routing approach selecting between small and large models based on predicted query difficulty"
    457     },
    458     {
    459       "title": "Unified Scaling Laws for Routed Language Models",
    460       "relevance": "Foundational work on routing and mixture-of-experts approaches for language models"
    461     },
    462     {
    463       "title": "Toolformer: Language Models Can Teach Themselves to Use Tools",
    464       "relevance": "Core related work on tool-augmented LLMs and learning when to invoke external functions"
    465     },
    466     {
    467       "title": "HuggingGPT: Solving AI Tasks with ChatGPT and its Friends on Hugging Face",
    468       "relevance": "LLM as orchestrator over specialist models—directly related architecture pattern"
    469     },
    470     {
    471       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    472       "relevance": "Reasoning and action prompting patterns underlying agentic scaffolding"
    473     },
    474     {
    475       "title": "Small Models are Valuable Plug-ins for Large Language Models",
    476       "relevance": "Core thesis: small models as effective specialists complementing large LLMs"
    477     },
    478     {
    479       "title": "Small Language Models are the Future of Agentic AI",
    480       "relevance": "Directly supports the paper's position on small language models in agentic systems"
    481     }
    482   ],
    483   "engagement_factors": {
    484     "practical_relevance": {
    485       "score": 1,
    486       "justification": "Proprietary JigsawStack system with no released code, weights, or open API access. Architects can learn from the design but practitioners cannot use or reproduce it."
    487     },
    488     "surprise_contrarian": {
    489       "score": 2,
    490       "justification": "Contradicts 'bigger monolithic LLM is better' narrative, but aligns with known practice that production systems decompose into perception, retrieval, and reasoning. Insight is architectural rather than fundamental."
    491     },
    492     "fear_safety": {
    493       "score": 0,
    494       "justification": "No discussion of AI safety, alignment, adversarial robustness, interpretability, or risk concerns. Purely focused on benchmark performance."
    495     },
    496     "drama_conflict": {
    497       "score": 1,
    498       "justification": "Company evaluates its own product (conflict of interest potential), but presented straightforwardly without sensationalism or explicit controversy."
    499     },
    500     "demo_ability": {
    501       "score": 0,
    502       "justification": "Proprietary system with no public demo, no code release, no public API access. Readers cannot try the system despite the architecture being web/API based."
    503     },
    504     "brand_recognition": {
    505       "score": 0,
    506       "justification": "JigsawStack is a startup without major brand recognition. Authors not prominent researchers in AI. Limited institutional prestige."
    507     }
    508   },
    509   "hn_data": {
    510     "threads": [
    511       {
    512         "hn_id": "46925536",
    513         "title": "Learning to Reason in 13 Parameters",
    514         "points": 3,
    515         "comments": 0,
    516         "url": "https://news.ycombinator.com/item?id=46925536",
    517         "created_at": "2026-02-07T17:16:58Z"
    518       },
    519       {
    520         "hn_id": "47002162",
    521         "title": "Learning to Reason in 13 Parameters",
    522         "points": 2,
    523         "comments": 0,
    524         "url": "https://news.ycombinator.com/item?id=47002162",
    525         "created_at": "2026-02-13T12:52:41Z"
    526       },
    527       {
    528         "hn_id": "47027127",
    529         "title": "Multi-Agent Teams Hold Experts Back",
    530         "points": 1,
    531         "comments": 0,
    532         "url": "https://news.ycombinator.com/item?id=47027127",
    533         "created_at": "2026-02-15T20:18:24Z"
    534       }
    535     ],
    536     "top_points": 3,
    537     "total_points": 6,
    538     "total_comments": 0
    539   }
    540 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs