scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27727B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Intelligence per Watt: Measuring Intelligence Efficiency of Local AI",
      6     "authors": [
      7       "Jon Saad-Falcon",
      8       "Avanika Narayan",
      9       "Hakki Orhun Akengin",
     10       "J. Wes Griffin",
     11       "Herumb Shandilya",
     12       "Adrian Gamarra Lafuente",
     13       "Medhya Goel",
     14       "Rebecca Joseph",
     15       "Shlok Natarajan",
     16       "Etash Kumar Guha",
     17       "Shang Zhu",
     18       "Ben Athiwaratkun",
     19       "John Hennessy",
     20       "Azalia Mirhoseini",
     21       "Christopher Ré"
     22     ],
     23     "year": 2025,
     24     "venue": "arXiv.org",
     25     "arxiv_id": "2511.07885",
     26     "doi": "10.48550/arXiv.2511.07885"
     27   },
     28   "checklist": {
     29     "claims_and_evidence": {
     30       "abstract_claims_supported": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Key abstract claims—88.7% query coverage, 5.3× IPW improvement, 1.4× local/cloud efficiency gap—are directly supported by Table 2, Figure 2, and Table 3 respectively.",
     34         "source": "haiku"
     35       },
     36       "causal_claims_justified": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Model vs. accelerator contributions are decomposed by holding one factor fixed (e.g., model at GPT-OSS-120B while varying accelerator), a standard controlled comparison appropriate for engineering benchmarking.",
     40         "source": "haiku"
     41       },
     42       "generalization_bounded": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The 88.7% coverage figure supports broad claims about redistributing LLM inference demand, but the study is limited to single-turn interactions; multi-turn, agentic, and long-context workloads are excluded without systematic qualification in conclusions.",
     46         "source": "haiku"
     47       },
     48       "alternative_explanations_discussed": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper does not discuss whether WILDCHAT performance gains could reflect bias from using QWEN3-235B as reference (potentially favoring QWEN-family local models), nor whether benchmark saturation or contamination explains 2025 improvements.",
     52         "source": "haiku"
     53       },
     54       "proxy_outcome_distinction": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "WILDCHAT accuracy is measured as win-rate against QWEN3-235B outputs yet the paper claims local models 'successfully answer' queries; this conflates beating a reference LM with actual correctness without substantive discussion of the distinction.",
     58         "source": "haiku"
     59       }
     60     },
     61     "limitations_and_scope": {
     62       "limitations_section_present": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No dedicated limitations or threats-to-validity section exists; the single-turn focus and 10-15% software power measurement inaccuracy are mentioned briefly in passing, not systematically treated.",
     66         "source": "haiku"
     67       },
     68       "threats_to_validity_specific": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "The only specific threat acknowledged is 10-15% software power measurement error; no discussion of LLM-judge bias, reference model selection effects, benchmark contamination, or sampling representativeness.",
     72         "source": "haiku"
     73       },
     74       "scope_boundaries_stated": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "Single-turn focus is noted as a design choice but the paper does not explicitly state what the findings do NOT generalize to (e.g., multi-turn conversations, agentic workflows, real-time streaming).",
     78         "source": "haiku"
     79       }
     80     },
     81     "conflicts_of_interest": {
     82       "funding_disclosed": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Extensive acknowledgments cover NIH, NSF, DARPA (ARL), ONR, Stanford HAI, Google DeepMind, IBM, Microsoft, Anthropic, Together AI, and others.",
     86         "source": "haiku"
     87       },
     88       "affiliations_disclosed": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Authors are affiliated with Stanford University and Together AI, both disclosed in the paper header.",
     92         "source": "haiku"
     93       },
     94       "funder_independent_of_outcome": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "Together AI is a co-author institution providing cloud infrastructure evaluated in the study; corporate funders IBM, Anthropic, Google, and OpenAI have products (GRANITE, Claude Sonnet 4.5, Gemini 2.5 Pro, GPT-5) directly compared in experiments.",
     98         "source": "haiku"
     99       },
    100       "financial_interests_declared": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No competing interests statement is included; funding sources are disclosed but individual financial interests (equity, patents, consulting) are not declared.",
    104         "source": "haiku"
    105       }
    106     },
    107     "scope_and_framing": {
    108       "key_terms_defined": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Intelligence per watt, local LMs (≤20B active parameters), local vs. cloud accelerators, and the routing function are all formally defined in Section 3 with mathematical notation.",
    112         "source": "haiku"
    113       },
    114       "intended_contribution_clear": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Three explicit contributions are stated: the IPW metric, empirical demonstration of 88.7% local query coverage, and hybrid routing efficiency gains.",
    118         "source": "haiku"
    119       },
    120       "engagement_with_prior_work": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "A main related work section and extended appendix explicitly position contributions against routing benchmarks (RouterBench, RouteLLM, RouterEval), local-cloud systems (Minions, SLED, HAT), and Green AI literature.",
    124         "source": "haiku"
    125       }
    126     }
    127   },
    128   "type_checklist": {
    129     "empirical": {
    130       "artifacts": {
    131         "code_released": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "The IPW profiling harness is released at https://github.com/HazyResearch/intelligence-per-watt, explicitly mentioned in abstract and conclusion.",
    135           "source": "haiku"
    136         },
    137         "data_released": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "Source benchmarks (WILDCHAT, NATURALREASONING) are public, but the filtered/sampled subsets used, model outputs, and telemetry data from the 1M+ query study are not explicitly released.",
    141           "source": "haiku"
    142         },
    143         "environment_specified": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "Hardware specs are detailed in Table 9 and decoding parameters are listed, but no requirements.txt, Dockerfile, or software dependency specification is provided.",
    147           "source": "haiku"
    148         },
    149         "reproduction_instructions": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "The profiling harness is released but the paper contains no step-by-step instructions for reproducing the full study across 20+ models and 8 accelerators.",
    153           "source": "haiku"
    154         }
    155       },
    156       "statistical_methodology": {
    157         "confidence_intervals_or_error_bars": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "± values are reported for IPW and accuracy metrics in Tables 2, 3, and 4, derived from 10-run averaging per query to reduce measurement noise.",
    161           "source": "haiku"
    162         },
    163         "significance_tests": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "Comparative claims (1.40× efficiency gap, 5.3× improvement) are made without formal statistical significance tests; only ± intervals are provided.",
    167           "source": "haiku"
    168         },
    169         "effect_sizes_reported": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "Multiplier improvements (5.3× overall, 3.1× model, 1.7× hardware, 1.40× local/cloud gap) are reported with baseline values throughout.",
    173           "source": "haiku"
    174         },
    175         "sample_size_justified": {
    176           "applies": true,
    177           "answer": false,
    178           "justification": "The 500K sample from each dataset is not justified through power analysis; the choice appears driven by dataset availability rather than analytical requirements.",
    179           "source": "haiku"
    180         },
    181         "variance_reported": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Standard error intervals (±) are reported for main efficiency metrics; each query is run 10 times and power is averaged across runs.",
    185           "source": "haiku"
    186         }
    187       },
    188       "evaluation_design": {
    189         "baselines_included": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Cloud frontier models (QWEN3-235B, Claude Sonnet 4.5, GPT-5, Gemini 2.5 Pro) serve as baselines for coverage and cloud-only routing comparisons.",
    193           "source": "haiku"
    194         },
    195         "baselines_contemporary": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Baselines include GPT-5-2025-08-07, Gemini 2.5 Pro, and Claude Sonnet 4.5 (September 2025)—all contemporary with the October 2025 study timeframe.",
    199           "source": "haiku"
    200         },
    201         "ablation_study": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "Table 2 decomposes IPW gains by holding accelerator fixed (measuring model contribution: 3.1×) and model fixed (measuring accelerator contribution: 1.7×), effectively ablating each factor.",
    205           "source": "haiku"
    206         },
    207         "multiple_metrics": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "The study reports accuracy per watt, accuracy per joule, perplexity per watt, perplexity per joule, latency, throughput, energy, cost, and compute across model-accelerator pairs.",
    211           "source": "haiku"
    212         },
    213         "human_evaluation": {
    214           "applies": true,
    215           "answer": false,
    216           "justification": "LLM-as-judge (GPT-4O) is used for WILDCHAT evaluation rather than human raters; no human evaluation of system outputs is conducted despite claims about measuring 'intelligence' quality.",
    217           "source": "haiku"
    218         },
    219         "held_out_test_set": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Results are reported on standard held-out benchmark test sets (MMLU PRO, SUPERGPQA) and a held-out 500K sample of WILDCHAT queries not used in any training.",
    223           "source": "haiku"
    224         },
    225         "per_category_breakdown": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Table 7 and Figure 7 provide per-category performance breakdowns across 22 Anthropic Economic Index domains for both WILDCHAT and reasoning tasks.",
    229           "source": "haiku"
    230         },
    231         "failure_cases_discussed": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "The paper explicitly analyzes failure cases: Level 5 reasoning queries remain 95% unsolvable, technical domains like Architecture & Engineering show only 68% coverage, and local models show a 24pp gap on reasoning vs. chat.",
    235           "source": "haiku"
    236         },
    237         "negative_results_reported": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "Local accelerators are shown to be 1.4-7.4× less energy-efficient than cloud accelerators; reasoning queries are substantially harder for local models; these findings are reported without minimization.",
    241           "source": "haiku"
    242         }
    243       },
    244       "setup_transparency": {
    245         "model_versions_specified": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Specific model variants are named (QWEN3-235B-A22B, GPT-5-2025-08-07, GRANITE-4.0-H-TINY) with references to technical reports and release dates.",
    249           "source": "haiku"
    250         },
    251         "prompts_provided": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "Full LLM-as-judge prompts for WILDCHAT and NATURALREASONING evaluation, and the query categorization prompt with all options, are included verbatim in the appendix.",
    255           "source": "haiku"
    256         },
    257         "hyperparameters_reported": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Decoding parameters are fully specified: temperature=0.6, top-p=0.95, top-k=20, min-p=0.0, 32768-token output limit, repetition penalty=1.1 for QWEN models.",
    261           "source": "haiku"
    262         },
    263         "scaffolding_described": {
    264           "applies": false,
    265           "answer": false,
    266           "justification": "No agentic scaffolding is used; the study evaluates direct LM inference on single-turn queries.",
    267           "source": "haiku"
    268         },
    269         "data_preprocessing_documented": {
    270           "applies": true,
    271           "answer": true,
    272           "justification": "Appendix B.1 documents filtering steps: removal of non-English entries, malformed/nonsensical queries (via GPT-4O-MINI judge), duplicates, and queries exceeding 32K characters.",
    273           "source": "haiku"
    274         }
    275       },
    276       "data_integrity": {
    277         "raw_data_available": {
    278           "applies": true,
    279           "answer": false,
    280           "justification": "Raw telemetry, model outputs, and the processed query subsets from the 1M+ query evaluation are not released; only the profiling harness is made available.",
    281           "source": "haiku"
    282         },
    283         "data_collection_described": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "Section 4.2 and Appendix B.1 describe telemetry collection via NVML, powermetrics, and ROCm SMI at 50ms sampling intervals, with 10 runs per query for noise reduction and nanosecond-synchronized timestamps.",
    287           "source": "haiku"
    288         },
    289         "recruitment_methods_described": {
    290           "applies": false,
    291           "answer": false,
    292           "justification": "No human participants; data derives from existing public query datasets.",
    293           "source": "haiku"
    294         },
    295         "data_pipeline_documented": {
    296           "applies": true,
    297           "answer": true,
    298           "justification": "The full pipeline from dataset curation, filtering, query categorization, model inference, LLM evaluation, to telemetry aggregation is described across Section 4 and Appendix B.",
    299           "source": "haiku"
    300         }
    301       },
    302       "contamination": {
    303         "training_cutoff_stated": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "Models are described as October 2025 releases but training data cutoffs are not stated for any evaluated model, making contamination assessment impossible.",
    307           "source": "haiku"
    308         },
    309         "train_test_overlap_discussed": {
    310           "applies": true,
    311           "answer": false,
    312           "justification": "No discussion of whether QWEN3, GPT-OSS, or other 2025 models may have been trained on MMLU PRO or SUPERGPQA examples.",
    313           "source": "haiku"
    314         },
    315         "benchmark_contamination_addressed": {
    316           "applies": true,
    317           "answer": false,
    318           "justification": "MMLU PRO has been publicly available since 2024, predating all evaluated 2025 models; contamination risk is neither acknowledged nor mitigated.",
    319           "source": "haiku"
    320         }
    321       },
    322       "human_studies": {
    323         "pre_registered": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants.",
    327           "source": "haiku"
    328         },
    329         "irb_or_ethics_approval": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants.",
    333           "source": "haiku"
    334         },
    335         "demographics_reported": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants.",
    339           "source": "haiku"
    340         },
    341         "inclusion_exclusion_criteria": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants.",
    345           "source": "haiku"
    346         },
    347         "randomization_described": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants.",
    351           "source": "haiku"
    352         },
    353         "blinding_described": {
    354           "applies": false,
    355           "answer": false,
    356           "justification": "No human participants.",
    357           "source": "haiku"
    358         },
    359         "attrition_reported": {
    360           "applies": false,
    361           "answer": false,
    362           "justification": "No human participants.",
    363           "source": "haiku"
    364         }
    365       },
    366       "cost_and_practicality": {
    367         "inference_cost_reported": {
    368           "applies": true,
    369           "answer": true,
    370           "justification": "Token pricing from OpenRouter is reported in Table 12, and cost savings from routing (59.0% with an 80%-accurate router, 73.8% with oracle routing) are explicitly quantified in Figure 6 and Section 5.3.",
    371           "source": "haiku"
    372         },
    373         "compute_budget_stated": {
    374           "applies": true,
    375           "answer": false,
    376           "justification": "Per-query FLOPs and energy are reported but the total computational cost of running the full study (20+ models × 8 accelerators × 1M+ queries × 10 runs each) is not stated.",
    377           "source": "haiku"
    378         }
    379       }
    380     }
    381   },
    382   "claims": [
    383     {
    384       "claim": "Local LMs can successfully handle 88.7% of single-turn chat and reasoning queries using best-of-local routing across multiple local model families.",
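    385       "evidence": "Figure 2 shows routing across QWEN3-4B/8B/14B and GPT-OSS-20B achieves 97.8%, 88.3%, 77.0%, and 92.4% on the four benchmarks respectively; 88.7% is the reported overall figure (the unweighted mean of these four values is 88.9%, so the aggregate is presumably weighted by query count).",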
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Intelligence per watt improved 5.3× from 2023-2025, decomposable into 3.1× from model advances and 1.7× from hardware advances.",
    390       "evidence": "Table 2 shows IPW rising from 7.92×10⁻⁴ (MIXTRAL on RTX 6000) to 4.18×10⁻³ (GPT-OSS-120B on M4 Max); decomposition via controlled holdout of each factor.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Cloud accelerators (NVIDIA B200) achieve 1.40× higher intelligence per watt than local accelerators (Apple M4 Max) running identical models.",
    395       "evidence": "Table 3 shows B200 IPW values consistently 1.40× above M4 Max across all QWEN3 model sizes from 4B to 32B.",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "Oracle routing reduces energy by 80.4%, compute by 77.3%, and cost by 73.8% versus cloud-only deployment.",
    400       "evidence": "Figure 6 and Section 5.3 based on simulation of 80.2M queries over 24 hours routing between local QWEN3 models on M4 Max and QWEN3-235B on H200.",
    401       "supported": "moderate"
    402     },
    403     {
    404       "claim": "A routing system with 80% accuracy captures ~80% of oracle gains, achieving 64.3% energy reduction, 61.8% compute reduction, and 59.0% cost reduction.",
    405       "evidence": "Figure 6 simulation with 80% accurate router on 80.2M daily queries; assumes misrouted queries fall back to frontier model maintaining answer quality.",
    406       "supported": "moderate"
    407     },
    408     {
    409       "claim": "Locally-serviceable query coverage increased from 23.2% (2023) to 71.3% (2025) for single-turn interactions.",
    410       "evidence": "Table 2 year-over-year tracking: MIXTRAL-8X7B 23.2% → LLAMA-3.1-8B 48.7% → GPT-OSS-120B 71.3% on combined chat and reasoning queries.",
    411       "supported": "strong"
    412     },
    413     {
    414       "claim": "Chat queries are substantially more amenable to local processing than reasoning queries, with a 24pp gap for the best individual local model (88.9% vs 64.9%).",
    415       "evidence": "Section 5.1 directly compares WILDCHAT vs. NATURALREASONING coverage for best individual local LM; consistent with domain breakdowns in Table 6.",
    416       "supported": "strong"
    417     }
    418   ],
    419   "methodology_tags": [
    420     "benchmark-eval",
    421     "observational"
    422   ],
    423   "key_findings": "Local LMs with ≤20B active parameters can handle 88.7% of single-turn chat and reasoning queries when using best-of-local routing across model families, with coverage exceeding 90% for creative tasks but dropping to 68% for specialized technical fields like Architecture & Engineering. Intelligence per watt improved 5.3× from 2023-2025 through compounding model (3.1×) and hardware (1.7×) advances, while locally-serviceable query coverage grew from 23.2% to 71.3%. Despite local models' growing capability, cloud accelerators (NVIDIA B200, SambaNova SN40L) remain 1.4-7.4× more energy-efficient than the Apple M4 Max for identical workloads, with the gap widening substantially on per-joule metrics due to faster completion times. Hybrid local-cloud routing with 80% routing accuracy achieves 59-64% reductions in energy, compute, and cost relative to cloud-only deployment while maintaining answer quality.",
    424   "red_flags": [
    425     {
    426       "flag": "WILDCHAT reference model bias",
    427       "detail": "WILDCHAT accuracy is measured as win-rate against QWEN3-235B outputs rather than ground truth; GPT-OSS-120B achieves 89.2% by beating another LM, not by comparison to human-verified answers. This may systematically favor QWEN-family local models since the reference is from the same developer."
    428     },
    429     {
    430       "flag": "Single-turn only with broad infrastructure claims",
    431       "detail": "The 88.7% coverage figure is used to support broad claims about redistributing inference demand, but all results exclude multi-turn, agentic, long-context, and streaming use cases that constitute substantial real-world LLM traffic."
    432     },
    433     {
    434       "flag": "No limitations section",
    435       "detail": "No dedicated limitations or threats-to-validity discussion despite multiple methodological choices (LLM-as-judge, reference model selection, single-turn scope, software power measurement) that warrant scrutiny."
    436     },
    437     {
    438       "flag": "Benchmark contamination unaddressed",
    439       "detail": "MMLU PRO has been publicly available since 2024, predating all evaluated 2025 models; the possibility that QWEN3, GPT-OSS, and others were trained on benchmark examples is not acknowledged or mitigated."
    440     },
    441     {
    442       "flag": "Funder-outcome conflicts",
    443       "detail": "Together AI is a co-author institution providing cloud infrastructure evaluated in the study; IBM, Anthropic, Google, and OpenAI are listed funders whose products (GRANITE 4.0, Claude Sonnet 4.5, Gemini 2.5 Pro, GPT-5) are directly compared in experiments."
    444     },
    445     {
    446       "flag": "Oracle routing headlined over realistic figures",
    447       "detail": "The 80.4% energy reduction is a theoretical oracle bound; the more realistic 80%-accurate-router result (64.3%) is less prominent despite being more actionable, and the 80% routing accuracy assumption is itself unvalidated in this study."
    448     }
    449   ],
    450   "cited_papers": [
    451     {
    452       "title": "Minions: Cost-efficient collaboration between on-device and cloud language models",
    453       "relevance": "Directly related prior work on local-cloud LM collaboration protocols; co-authored by same Stanford group"
    454     },
    455     {
    456       "title": "MMLU-Pro: A More Robust and Challenging Multi-Task Language Understanding Benchmark",
    457       "relevance": "Primary standardized knowledge benchmark used in evaluation"
    458     },
    459     {
    460       "title": "SuperGPQA: Scaling LLM Evaluation Across 285 Graduate Disciplines",
    461       "relevance": "Primary graduate-level reasoning benchmark used in evaluation"
    462     },
    463     {
    464       "title": "WildVis: Open Source Visualizer for Million-Scale Chat Logs in the Wild",
    465       "relevance": "Source of WILDCHAT—1M real-world ChatGPT queries used as naturalistic workload"
    466     },
    467     {
    468       "title": "RouteLLM: Learning to Route LLMs with Preference Data",
    469       "relevance": "Key prior work on LLM routing that this study builds upon and extends to local-cloud setting"
    470     },
    471     {
    472       "title": "From Words to Watts: Benchmarking the Energy Costs of Large Language Model Inference",
    473       "relevance": "Prior work establishing methodology for LLM energy measurement that this study's telemetry collection follows"
    474     },
    475     {
    476       "title": "Green AI",
    477       "relevance": "Foundational work calling for energy as a first-class metric alongside accuracy; motivates the IPW metric"
    478     },
    479     {
    480       "title": "NaturalReasoning: Reasoning in the Wild with 2.8M Challenging Questions",
    481       "relevance": "Primary reasoning dataset (500K sample) used for evaluation"
    482     },
    483     {
    484       "title": "RouterBench: A Benchmark for Multi-LLM Routing System",
    485       "relevance": "Prior routing benchmark that this work explicitly positions against and extends"
    486     }
    487   ],
    488   "engagement_factors": {
    489     "practical_relevance": {
    490       "score": 3,
    491       "justification": "Directly addresses infrastructure cost and energy for LLM deployment with actionable routing recommendations and a released profiling harness."
    492     },
    493     "surprise_contrarian": {
    494       "score": 2,
    495       "justification": "The finding that small local models can handle 88.7% of queries challenges common assumptions that frontier cloud models are necessary for most tasks."
    496     },
    497     "fear_safety": {
    498       "score": 1,
    499       "justification": "Raises energy and infrastructure concerns about AI scaling but frames them constructively as a solved or solvable problem rather than a risk."
    500     },
    501     "drama_conflict": {
    502       "score": 1,
    503       "justification": "Mild local-vs-cloud framing, but positioned as complementary infrastructure rather than adversarial competition."
    504     },
    505     "demo_ability": {
    506       "score": 2,
    507       "justification": "Profiling harness is released on GitHub and can be used, but full reproduction requires access to significant hardware (NVIDIA B200, Apple M4 Max, etc.)."
    508     },
    509     "brand_recognition": {
    510       "score": 3,
    511       "justification": "Stanford, Together AI, and evaluations of GPT-5, Gemini 2.5 Pro, Claude Sonnet 4.5, QWEN3, and NVIDIA hardware ensure high brand recognition across multiple major AI organizations."
    512     }
    513   },
    514   "hn_data": {
    515     "threads": [
    516       {
    517         "hn_id": "45905451",
    518         "title": "LLM Output Drift in Financial Workflows: Validation and Mitigation (arXiv)",
    519         "points": 24,
    520         "comments": 26,
    521         "url": "https://news.ycombinator.com/item?id=45905451",
    522         "created_at": "2025-11-12T19:53:25Z"
    523       },
    524       {
    525         "hn_id": "46118400",
    526         "title": "Intelligence per Watt: Measuring Intelligence Efficiency of Local AI",
    527         "points": 3,
    528         "comments": 0,
    529         "url": "https://news.ycombinator.com/item?id=46118400",
    530         "created_at": "2025-12-02T06:55:58Z"
    531       },
    532       {
    533         "hn_id": "29775107",
    534         "title": "Program Synthesis Performance Of GitHub Copilot vs. Genetic Programming",
    535         "points": 3,
    536         "comments": 0,
    537         "url": "https://news.ycombinator.com/item?id=29775107",
    538         "created_at": "2022-01-02T23:06:39Z"
    539       },
    540       {
    541         "hn_id": "38285475",
    542         "title": "Bring Your Own KG: Self-Supervised Program Synthesis for Zero-Shot KGQA",
    543         "points": 2,
    544         "comments": 0,
    545         "url": "https://news.ycombinator.com/item?id=38285475",
    546         "created_at": "2023-11-16T03:11:23Z"
    547       }
    548     ],
    549     "top_points": 24,
    550     "total_points": 32,
    551     "total_comments": 26
    552   }
    553 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs