scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27881B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Large Language Models as Generalist Policies for Network Optimization",
      6     "authors": [
      7       "Duo Wu",
      8       "Linjia Kang",
      9       "Zhimin Wang",
     10       "Fangxin Wang",
     11       "Wei Zhang",
     12       "Xuefeng Tao",
     13       "Wei Yang",
     14       "Le Zhang",
     15       "Peng Cui",
     16       "Zhi Wang"
     17     ],
     18     "year": 2025,
     19     "venue": "arXiv.org",
     20     "arxiv_id": "2512.11839",
     21     "doi": "10.48550/arXiv.2512.11839"
     22   },
     23   "checklist": {
     24     "claims_and_evidence": {
     25       "abstract_claims_supported": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Core abstract claims — outperforms specialist policies on ABR/CJS and outperforms VICC on Douyin — are backed by Fig. 3 simulation results and 3-week online A/B test data; the framing of 'minimal adaptation' is qualitative but supported by cross-task evaluation.",
     29         "source": "haiku"
     30       },
     31       "causal_claims_justified": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Ablation studies in Fig. 4a systematically remove pretrained weights (from-scratch training) and freeze LLM backbone to test causal roles of pretraining and domain knowledge; the design adequately supports the specific causal claims made.",
     35         "source": "haiku"
     36       },
     37       "generalization_bounded": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper makes broad claims about a 'generalist-driven paradigm' for network optimization, but evidence is from only 3 tasks — all in communications (video streaming, cluster scheduling, congestion control) — without acknowledging this scope limitation in conclusions.",
     41         "source": "haiku"
     42       },
     43       "alternative_explanations_discussed": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The Discussion attributes generalization to pretrained knowledge + domain alignment but does not consider alternatives such as larger parameter counts vs. baselines, the quality of the experience dataset itself, or architecture advantages unrelated to linguistic pretraining.",
     47         "source": "haiku"
     48       },
     49       "proxy_outcome_distinction": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "QoE, JCT, MAPE, and stall rate are defined precisely with formulas (Eqs. 5–9) and explicitly connected to user experience and service quality, clearly distinguishing what is measured from broader service claims.",
     53         "source": "haiku"
     54       }
     55     },
     56     "limitations_and_scope": {
     57       "limitations_section_present": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "There is no dedicated limitations or threats-to-validity section; the Discussion mentions interpretability as a future challenge in one sentence embedded in a broadly positive conclusion.",
     61         "source": "haiku"
     62       },
     63       "threats_to_validity_specific": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No specific threats are discussed — the paper does not address concerns such as the limited number and type of tasks tested, potential overfitting to simulated environments, or evaluation bias from internal Bytedance authorship.",
     67         "source": "haiku"
     68       },
     69       "scope_boundaries_stated": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "The paper does not explicitly state what the results do not show; the 'generalist' framing is presented without bounds on which networking tasks the approach may not work for.",
     73         "source": "haiku"
     74       }
     75     },
     76     "conflicts_of_interest": {
     77       "funding_disclosed": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "No funding acknowledgment or grant information is present anywhere in the paper text.",
     81         "source": "haiku"
     82       },
     83       "affiliations_disclosed": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Author affiliations (Tsinghua University, CUHK Shenzhen, and Bytedance) are clearly disclosed in the author block with specific campus/division detail.",
     87         "source": "haiku"
     88       },
     89       "funder_independent_of_outcome": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "Multiple authors are Bytedance employees; the key real-world evaluation compares Trailblazer against VICC — Douyin's own existing system — conducted and reported by the same organization, creating a clear conflict of interest.",
     93         "source": "haiku"
     94       },
     95       "financial_interests_declared": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No competing interests, financial interests, patent, or equity declarations are present in the paper.",
     99         "source": "haiku"
    100       }
    101     },
    102     "scope_and_framing": {
    103       "key_terms_defined": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Specialist vs. generalist policies, cross-task generalization, cross-environment generalization, ABR, CJS, CC, QoE, JCT, MAPE, NIOKA, and APC are all defined in context with sufficient precision.",
    107         "source": "haiku"
    108       },
    109       "intended_contribution_clear": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper explicitly frames its contribution as 'the first systematic framework to realize a generalist policy for networking' with two specific technical modules (NIOKA and APC) plus real-world validation on Douyin.",
    113         "source": "haiku"
    114       },
    115       "engagement_with_prior_work": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Section 4.3 engages with LLM-based networking and small/large model collaboration literature, explaining how Trailblazer differs (systematic framework, real-world A/B test, early saturation and selective invocation insights).",
    119         "source": "haiku"
    120       }
    121     }
    122   },
    123   "type_checklist": {
    124     "empirical": {
    125       "artifacts": {
    126         "code_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Section 4.6 confirms code is released at https://github.com/duowuyms/Trailblazer covering dataset processing, Trailblazer implementation, training/inference recipes, and simulator integration.",
    130           "source": "haiku"
    131         },
    132         "data_released": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "FCC and TPC-H data are publicly available (links provided), but the CC experience dataset (30,000+ sessions, 10M+ samples from Douyin's internal platform with proprietary network emulation) is not publicly released.",
    136           "source": "haiku"
    137         },
    138         "environment_specified": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Hardware (GPU server, ~4.5 GB GPU memory) and model names are noted, but no requirements.txt, Dockerfile, or dependency specification is provided in the paper.",
    142           "source": "haiku"
    143         },
    144         "reproduction_instructions": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "The paper references code on GitHub with training/inference recipes but provides no step-by-step reproduction instructions in the paper itself; the adequacy of the repo cannot be verified from the paper text.",
    148           "source": "haiku"
    149         }
    150       },
    151       "statistical_methodology": {
    152         "confidence_intervals_or_error_bars": {
    153           "applies": true,
    154           "answer": true,
    155           "justification": "Fig. 3a reports mean and standard deviation over 3 random seeds; Fig. 3b shows full performance distributions via scatter/box plots for cross-environment generalization.",
    156           "source": "haiku"
    157         },
    158         "significance_tests": {
    159           "applies": true,
    160           "answer": false,
    161           "justification": "No statistical significance tests (t-tests, ANOVA, permutation tests) are applied to comparative results; differences are reported as means without formal hypothesis testing.",
    162           "source": "haiku"
    163         },
    164         "effect_sizes_reported": {
    165           "applies": true,
    166           "answer": true,
    167           "justification": "Percentage improvements are consistently reported: 14.5%–36.6% higher QoE, 6.8%–41.3% JCT reduction, 3.9%–24.8% mean QoE improvement in OOD settings, giving interpretable effect sizes in context.",
    168           "source": "haiku"
    169         },
    170         "sample_size_justified": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "The choice of 100 test traces, 3 random seeds, and specific OOD environment counts is not justified through power analysis or sensitivity analysis.",
    174           "source": "haiku"
    175         },
    176         "variance_reported": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "Standard deviation is reported for simulation results ('averaged over three random seeds, with the mean and standard deviation reported'); distribution shapes shown in Fig. 3b.",
    180           "source": "haiku"
    181         }
    182       },
    183       "evaluation_design": {
    184         "baselines_included": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "GENET, BBA, MPC for ABR; Decima, FIFO, Fair for CJS; VICC (production system) for real-world CC — all compared systematically against Trailblazer.",
    188           "source": "haiku"
    189         },
    190         "baselines_contemporary": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "GENET (SIGCOMM 2022) is contemporary; BBA and MPC are older but standard rule-based references; VICC is Douyin's actively maintained production system iterated over several years.",
    194           "source": "haiku"
    195         },
    196         "ablation_study": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Fig. 4a ablates pretrained knowledge (from-scratch training) and domain knowledge (frozen backbone); Section 2.3.2 ablates the APC scheduler; Extended Data Fig. 2 ablates LLM model scale.",
    200           "source": "haiku"
    201         },
    202         "multiple_metrics": {
    203           "applies": true,
    204           "answer": true,
    205           "justification": "QoE (ABR), JCT (CJS), MAPE and processing delay (CC offline), stall rate at three thresholds (100ms/200ms/500ms) plus OS breakdown and geographic breakdown (CC online).",
    206           "source": "haiku"
    207         },
    208         "human_evaluation": {
    209           "applies": false,
    210           "answer": false,
    211           "justification": "This is a network control paper; user experience is measured through objective system metrics (stall rate, QoE), not human judgments of system outputs.",
    212           "source": "haiku"
    213         },
    214         "held_out_test_set": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "For ABR, 100 test traces are held out from 485 total (235 train, 150 validation, 100 test); for CC, a 5% test subset is separate from the 95% training data.",
    218           "source": "haiku"
    219         },
    220         "per_category_breakdown": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Real-world results include breakdown by OS type (Android, iOS, HarmonyOS), geographic distance from server, and stall duration threshold; simulation results break down by OOD environment (Extended Data Tables 1–2).",
    224           "source": "haiku"
    225         },
    226         "failure_cases_discussed": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "OPT-0.35B failure (performs worse than baselines) is explicitly noted; the scheduler ablation demonstrates severe processing delay degradation under high load without APC.",
    230           "source": "haiku"
    231         },
    232         "negative_results_reported": {
    233           "applies": true,
    234           "answer": true,
    235           "justification": "Early saturation (performance plateaus beyond ~1B parameters, unlike NLP scaling laws) and from-scratch training failures are explicitly reported as negative or counterintuitive results.",
    236           "source": "haiku"
    237         }
    238       },
    239       "setup_transparency": {
    240         "model_versions_specified": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Llama2-7B, Qwen2.5-0.5B, OPT (0.35B to 6.7B), Mistral-7B, and LLaVA-7B are all specified with model family and parameter count.",
    244           "source": "haiku"
    245         },
    246         "prompts_provided": {
    247           "applies": false,
    248           "answer": false,
    249           "justification": "LLMs are used as neural architectures for offline RL/imitation learning receiving encoded network state vectors, not as prompted text generators; traditional prompt disclosure is not applicable.",
    250           "source": "haiku"
    251         },
    252         "hyperparameters_reported": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "Context window sizes (w=10 ABR, w=20 CJS), batch size (64), QoE coefficients (λ1=4.3, λ2=1), scheduler thresholds (α1=50ms, α2=0.05, α3=0.95), and inference latency targets are reported.",
    256           "source": "haiku"
    257         },
    258         "scaffolding_described": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "The NIOKA scheme (state encoder, action decoder, offline reinforcement fine-tuning) and APC mechanism (scheduler logic, collaboration design, batched inference) are described in detail in Sections 2.1 and 4.1.",
    262           "source": "haiku"
    263         },
    264         "data_preprocessing_documented": {
    265           "applies": true,
    266           "answer": true,
    267           "justification": "Experience dataset construction is described for all three tasks: GENET rollouts for ABR, Decima interactions for CJS, and 6-device/9-content-type/HoloWAN emulation sessions with 4 rule-based policies for CC.",
    268           "source": "haiku"
    269         }
    270       },
    271       "data_integrity": {
    272         "raw_data_available": {
    273           "applies": true,
    274           "answer": false,
    275           "justification": "FCC and TPC-H benchmarks are publicly linked, but the key CC experience dataset (30,000+ sessions from Douyin's internal platform) is not publicly available and cannot be independently verified.",
    276           "source": "haiku"
    277         },
    278         "data_collection_described": {
    279           "applies": true,
    280           "answer": true,
    281           "justification": "Section 4.2.3 describes CC data collection: 6 mobile devices, 9 media content types, HoloWAN network emulator, 4 randomly selected rule-based policies, yielding 30,000+ sessions with 10M+ samples and a 95/5 train/test split.",
    282           "source": "haiku"
    283         },
    284         "recruitment_methods_described": {
    285           "applies": false,
    286           "answer": false,
    287           "justification": "No human participants were recruited; data comes from network simulators, public benchmarks, and Douyin's internal system logs.",
    288           "source": "haiku"
    289         },
    290         "data_pipeline_documented": {
    291           "applies": true,
    292           "answer": true,
    293           "justification": "The end-to-end pipeline is described for each task: simulator → policy rollouts → experience dataset → LLM fine-tuning; the CC pipeline from network emulation to training data is also described with device and emulator specifics.",
    294           "source": "haiku"
    295         }
    296       },
    297       "contamination": {
    298         "training_cutoff_stated": {
    299           "applies": false,
    300           "answer": false,
    301           "justification": "The evaluation measures real-time control performance (QoE, JCT, stall rate), not LLM knowledge recall; benchmark contamination in the NLP sense is not applicable to this control task.",
    302           "source": "haiku"
    303         },
    304         "train_test_overlap_discussed": {
    305           "applies": false,
    306           "answer": false,
    307           "justification": "NA — the test sets are network trace datasets and scheduling workloads used for control evaluation; potential exposure in LLM pretraining is irrelevant to the evaluated capability.",
    308           "source": "haiku"
    309         },
    310         "benchmark_contamination_addressed": {
    311           "applies": false,
    312           "answer": false,
    313           "justification": "NA — evaluation benchmarks measure control policy performance through simulation, not model knowledge recall; traditional contamination concerns do not apply.",
    314           "source": "haiku"
    315         }
    316       },
    317       "human_studies": {
    318         "pre_registered": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants; the study uses network simulators, public benchmark datasets, and real-world system logs from Douyin.",
    322           "source": "haiku"
    323         },
    324         "irb_or_ethics_approval": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants. Section 4.4 mentions compliance with Douyin's data security policy for anonymization, but this is not IRB approval for human subjects research.",
    328           "source": "haiku"
    329         },
    330         "demographics_reported": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants.",
    334           "source": "haiku"
    335         },
    336         "inclusion_exclusion_criteria": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants.",
    340           "source": "haiku"
    341         },
    342         "randomization_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants.",
    346           "source": "haiku"
    347         },
    348         "blinding_described": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants.",
    352           "source": "haiku"
    353         },
    354         "attrition_reported": {
    355           "applies": false,
    356           "answer": false,
    357           "justification": "No human participants.",
    358           "source": "haiku"
    359         }
    360       },
    361       "cost_and_practicality": {
    362         "inference_cost_reported": {
    363           "applies": true,
    364           "answer": true,
    365           "justification": "Inference latency (37.1ms average per batch of 64), GPU memory (~4.5 GB), and end-to-end request processing delay are all reported with specific numbers in Section 4.1.3.",
    366           "source": "haiku"
    367         },
    368         "compute_budget_stated": {
    369           "applies": true,
    370           "answer": false,
    371           "justification": "Training compute budget (GPU hours, number of training steps, wall-clock training time) is not reported anywhere in the paper.",
    372           "source": "haiku"
    373         }
    374       }
    375     }
    376   },
    377   "claims": [
    378     {
    379       "claim": "A single LLM-based Trailblazer achieves 14.5%–36.6% higher QoE than specialist policies on ABR and reduces JCT by 6.8%–41.3% on CJS.",
    380       "evidence": "Fig. 3a simulation results averaged over 3 random seeds comparing against GENET, BBA, MPC (ABR) and Decima, FIFO, Fair (CJS).",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Trailblazer achieves stronger cross-environment generalization in OOD settings (3.9%–24.8% mean QoE improvement, 2.5%–41.6% JCT reduction) compared to all specialist baselines.",
    385       "evidence": "Fig. 3b distribution plots across 3 OOD environments per task, with triangle markers for means.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Both pretrained LLM knowledge and domain-specific fine-tuning are individually necessary — neither alone achieves full generalization.",
    390       "evidence": "Fig. 4a ablation comparing Trailblazer against (1) LLM trained from scratch and (2) frozen LLM backbone with only adapter layers fine-tuned.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "LLM performance saturates early in network optimization beyond ~1B parameters, contrasting with NLP scaling laws.",
    395       "evidence": "Extended Data Fig. 4c using OPT family (0.35B to 6.7B) on ABR; performance plateaus past 1B.",
    396       "supported": "weak"
    397     },
    398     {
    399       "claim": "Trailblazer outperforms Douyin's production VICC policy in a 3-week real-world A/B test (150,000+ users, 100+ cities), reducing stall rates by 0.76%–1.28% and by 24.45% on HarmonyOS.",
    400       "evidence": "Fig. 6 online A/B test results reported as relative reductions; absolute stall rates not disclosed per Douyin policy.",
    401       "supported": "moderate"
    402     },
    403     {
    404       "claim": "Selective invocation (APC scheduler) reduces request processing delay from 345ms to 61ms under 2,000 peak requests at p=20% with only 2.66% MAPE penalty.",
    405       "evidence": "Fig. 5b ablation comparing Trailblazer with and without scheduler under varying peak loads.",
    406       "supported": "strong"
    407     }
    408   ],
    409   "methodology_tags": [
    410     "benchmark-eval",
    411     "case-study"
    412   ],
    413   "key_findings": "Trailblazer grounds LLMs as generalist network policies via a network alignment scheme (NIOKA) and adaptive policy collaboration (APC), achieving 14.5%–36.6% higher QoE on ABR and 6.8%–41.3% lower JCT on CJS than specialist baselines in simulation. In a 3-week live deployment on Douyin serving 150,000+ users, it reduces video stall rates by 0.76%–1.28% relative to the production system VICC, with larger gains (24.45%) on emerging HarmonyOS platforms. Two counterintuitive insights are surfaced: LLM performance saturates early beyond ~1B parameters in networking (unlike NLP), and selectively invoking the LLM only for difficult cases achieves a 5.6× latency reduction while incurring minimal performance loss.",
    414   "red_flags": [
    415     {
    416       "flag": "Internal self-evaluation",
    417       "detail": "Multiple authors are Bytedance employees; the key real-world evaluation compares Trailblazer against Douyin's own VICC system, conducted internally by the same organization. All results are reported as relative reductions with no absolute metrics disclosed."
    418     },
    419     {
    420       "flag": "Simulation data reused from prior work",
    421       "detail": "Section 4.4 explicitly states 'The experimental data in Section 2.2 is reused from our prior work [53] (NetLLM, SIGCOMM 2024).' The simulation experiments are not new contributions to this paper."
    422     },
    423     {
    424       "flag": "Three-task generalization scope",
    425       "detail": "Broad 'generalist paradigm' claims are based on only 3 tasks — all within the same networking/communications domain — without acknowledging that cross-domain generalization is untested."
    426     },
    427     {
    428       "flag": "No absolute metrics in real-world evaluation",
    429       "detail": "All online A/B test results are reported as relative reductions vs. VICC only, with no absolute stall rates disclosed due to 'Douyin's data security policy,' preventing independent assessment of the magnitude of gains."
    430     },
    431     {
    432       "flag": "No funding disclosure",
    433       "detail": "No funding source or acknowledgments section is present, despite the work involving a major industry partner (Bytedance) and multiple university affiliations."
    434     },
    435     {
    436       "flag": "Only 3 random seeds, no significance testing",
    437       "detail": "Simulation results are averaged over 3 random seeds without statistical significance tests, which may be insufficient to establish the reliability of reported performance differences."
    438     }
    439   ],
    440   "cited_papers": [
    441     {
    442       "title": "NetLLM: Adapting Large Language Models for Networking",
    443       "relevance": "Prior work by same authors (SIGCOMM 2024) that Trailblazer directly extends; simulation experimental data is explicitly reused from this paper."
    444     },
    445     {
    446       "title": "Neural Adaptive Video Streaming with Pensieve",
    447       "relevance": "Foundational learning-based ABR policy that defines the QoE metric and ABR simulator used throughout the paper."
    448     },
    449     {
    450       "title": "GENET: Automatic Curriculum Generation for Learning Adaptation in Networking",
    451       "relevance": "State-of-the-art ABR baseline and source of experience data used for LLM fine-tuning."
    452     },
    453     {
    454       "title": "Learning Scheduling Algorithms for Data Processing Clusters (Decima)",
    455       "relevance": "State-of-the-art CJS baseline using GNN; experience data source for CJS fine-tuning in Trailblazer."
    456     },
    457     {
    458       "title": "Decision Transformer: Reinforcement Learning via Sequence Modeling",
    459       "relevance": "Core offline RL algorithm adapted for LLM fine-tuning on ABR and CJS tasks in the NIOKA scheme."
    460     },
    461     {
    462       "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models",
    463       "relevance": "Primary backbone LLM used for simulation experiments (Llama2-7B default)."
    464     },
    465     {
    466       "title": "Qwen2.5 Technical Report",
    467       "relevance": "Backbone LLM used for real-world Douyin CC deployment (Qwen2.5-0.5B chosen based on early saturation finding)."
    468     },
    469     {
    470       "title": "OPT: Open Pre-Trained Transformer Language Models",
    471       "relevance": "Model family used to investigate LLM scaling behavior in networking, supporting the 'early saturation' finding."
    472     }
    473   ],
    474   "engagement_factors": {
    475     "practical_relevance": {
    476       "score": 2,
    477       "justification": "Validated in a live A/B test on a real production system (Douyin/TikTok) covering 150,000+ users; code and simulator released; clear path for practitioners working on network control."
    478     },
    479     "surprise_contrarian": {
    480       "score": 2,
    481       "justification": "'Early saturation' directly contradicts NLP scaling law assumptions, and using LLMs as real-time control policies (not text generators) challenges conventional LLM application thinking."
    482     },
    483     "fear_safety": {
    484       "score": 0,
    485       "justification": "No AI safety concerns raised; paper addresses network performance optimization, not misuse or risk scenarios."
    486     },
    487     "drama_conflict": {
    488       "score": 1,
    489       "justification": "Industry-academia collaboration with TikTok's parent company adds commercial credibility, but gains are modest and the paper is non-controversial."
    490     },
    491     "demo_ability": {
    492       "score": 2,
    493       "justification": "Code and ABR/CJS simulators released on GitHub; practitioners can reproduce simulation experiments; real-world Douyin deployment is not reproducible externally."
    494     },
    495     "brand_recognition": {
    496       "score": 1,
    497       "justification": "Bytedance/Douyin (TikTok) and Tsinghua University affiliation add recognition, but this is not from a flagship AI lab (OpenAI, Google DeepMind, Meta FAIR)."
    498     }
    499   },
    500   "hn_data": {
    501     "threads": [
    502       {
    503         "hn_id": "43182325",
    504         "title": "The FFT Strikes Back: An Efficient Alternative to Self-Attention",
    505         "points": 456,
    506         "comments": 168,
    507         "url": "https://news.ycombinator.com/item?id=43182325",
    508         "created_at": "2025-02-26T09:57:23Z"
    509       },
    510       {
    511         "hn_id": "29883393",
    512         "title": "Lifting C Semantics for Dataflow Optimization",
    513         "points": 3,
    514         "comments": 0,
    515         "url": "https://news.ycombinator.com/item?id=29883393",
    516         "created_at": "2022-01-10T22:04:35Z"
    517       },
    518       {
    519         "hn_id": "46057417",
    520         "title": "Forecasting Ability of LLMs Depends on What We're Asking",
    521         "points": 1,
    522         "comments": 1,
    523         "url": "https://news.ycombinator.com/item?id=46057417",
    524         "created_at": "2025-11-26T13:55:05Z"
    525       },
    526       {
    527         "hn_id": "43150514",
    528         "title": "Intuitive physics understanding emerges from self-supervised pretraining",
    529         "points": 1,
    530         "comments": 1,
    531         "url": "https://news.ycombinator.com/item?id=43150514",
    532         "created_at": "2025-02-23T16:27:24Z"
    533       },
    534       {
    535         "hn_id": "25543684",
    536         "title": "Simdram: A Framework for Bit-Serial SIMD Processing Using DRAM",
    537         "points": 1,
    538         "comments": 0,
    539         "url": "https://news.ycombinator.com/item?id=25543684",
    540         "created_at": "2020-12-26T14:32:11Z"
    541       }
    542     ],
    543     "top_points": 456,
    544     "total_points": 462,
    545     "total_comments": 170
    546   }
    547 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs