scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27579B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Harnessing Language for Coordination: A Framework and Benchmark for LLM-Driven Multiagent Control",
      6     "authors": [
      7       "Timothée Anne",
      8       "Noah Syrkis",
      9       "Meriem Elhosni",
     10       "Florian Turati",
     11       "Franck Legendre"
     12     ],
     13     "year": 2024,
     14     "venue": "IEEE Transactions on Games",
     15     "arxiv_id": "2412.11761",
     16     "doi": "10.1109/TG.2025.3564042"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims of 'promising results' and specific demonstrated capabilities (coordination, terrain exploitation, strategic points) are all backed by Table I and Figures 5–6; stated limitations (visual reasoning, long-term planning) are confirmed by Section V-C and the Discussion.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims such as 'human help improves performance' and 'text descriptions outperform visual inputs' are supported by controlled ablations (Sections V-B and V-C) within the simulation environment, which is adequate for within-system causal inference.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The abstract and introduction invoke disaster response and urban planning as motivating applications, but all results are confined to one custom strategy game; the paper does not explicitly scope conclusions to this setting.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper attributes Sonnet's superiority to model quality and Exploit Weakness failures to LLM numerical reasoning deficits, but does not consider alternative explanations (e.g., prompt length, context window handling, tokenization of coordinates).",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Win/loss/tie outcomes and % enemies eliminated are defined objectives directly matching stated ability test goals; the paper does not conflate game performance with broader real-world coordination capability beyond aspirational framing.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section VI (Discussion) dedicates substantial space to specific limitations including prompt sensitivity, spatial reasoning failures, inference latency, non-determinism, and inability to re-plan mid-game.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The paper identifies concrete issues: inference time of 4–12 s is problematic for 10–20 s average game duration; LLMs cannot reliably extract positions from raw unit coordinate lists; Llama3-8B fails due to limited structured-output capability—these are specific, not boilerplate.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state that results do not generalize beyond the custom game or that the benchmark does not test real-time replanning, continuous observation, or non-scripted enemies; scope is implicit rather than declared.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Acknowledgments explicitly state: 'Funded by the armasuisse S+T project F00-007.'",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are disclosed on the title page: IT University of Copenhagen and armasuisse Science+Technology.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Four co-authors (Elhosni, Turati, Legendre, Jaquier) are armasuisse Science+Technology employees; armasuisse is both the funder and a direct stakeholder in a multi-agent coordination system with military/defense applications.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement, no patent or equity disclosures appear in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "HIVE is defined as a system enabling natural language control of unit swarms via an LLM; 'behavior tree' is formally defined with node types; unit types and game mechanics are specified with numerical parameters in Table II.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The introduction explicitly states two contributions: '(1) a real-time strategy game benchmark designed to evaluate these abilities and (2) a novel framework we term HIVE.'",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section II engages with directly related work (Cicero, SwarmBrain, SMAC, JaxMARL, AgentCoord) and explains how HIVE differs—unlike SMAC it focuses on strategic planning via LLMs; unlike SwarmBrain it uses fully LLM-generated plans rather than RL for low-level execution.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "The paper only points to hive.syrkis.com for demo videos; no repository link or explicit code release statement is provided.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "The custom benchmark scenarios and game engine are not released; the 50 prompts are reproduced in the appendix but the simulation code and maps are not publicly available.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper mentions JAX and Python but provides no requirements file, Dockerfile, or version pinning; Llama3-8B is noted as run on an M3 chip but without reproducible environment specification.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step instructions for reproducing experiments are provided; the appendix gives prompts and system instructions but not the game setup, API configurations, or execution pipeline.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": true,
    150           "justification": "Figure 5 uses box plots showing median and quartiles across 10 prompt variations; inference time in Section V-C is reported as 'median [first quartile, third quartile]'.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are used for any comparative claims; model performance differences (e.g., Sonnet 19/50 vs. 4o 12/50) are presented without testing whether they are statistically meaningful.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Absolute success rates (X/50), continuous performance percentages (% enemies eliminated, % distance covered), and win count differences between conditions are reported, constituting practical effect sizes.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The choice of 10 prompt variations per ability test is not justified by power analysis or prior work; 10 samples per condition is insufficient to detect small performance differences.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Box plots in Figure 5 show spread across prompt variations; the temperature ablation (Figure 11) shows distributions across 10 runs per temperature setting.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Llama3-8B serves as a weak baseline; the HIVE-alone condition (Section V-B) serves as an ablation baseline for the human-collaboration hypothesis.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "The nine tested models include the most capable models available at submission time (GPT-4o, Claude 3.5 Sonnet 20241022, Gemini 2.0 Flash); a preliminary Sonnet 3.7 comparison is noted in the Discussion.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Section V includes three ablations: unit count scaling (V-A), human vs. no-human collaboration (V-B), and text vs. image map input (V-C), plus a temperature ablation in the appendix.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Evaluation uses strict win/loss/tie/early-completion counts (Table I), continuous performance metrics (% enemies eliminated, % distance covered in Fig 5), outcome category breakdowns (Fig 6), and inference latency (Fig 10).",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "The evaluation is fully automated via simulation; human judges are not used to assess output quality.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": false,
    211           "answer": false,
    212           "justification": "This is not a prediction/training task; the benchmark scenarios are fixed evaluation environments, not a held-out split from a training distribution.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down per ability test (5 categories), per LLM (9 models), and per outcome type (won/tie/early completion/lost/invalid/no plan) in Table I and Figure 6.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Specific failure modes are analyzed: Llama3-8B returning invalid plans, LLMs failing Exploit Weakness due to numerical reasoning deficits, vision failures in Section V-C, and prompt-wording sensitivity causing 'drastic changes in the plan.'",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Multiple LLMs achieve 0/10 on several tasks (e.g., all models on Coordinate); visual map inputs consistently underperform textual inputs; HIVE without human help wins zero tasks on Coordinate in all conditions.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Exact version identifiers are provided for all nine models: e.g., gpt-4o-2024-11-20, claude-3-5-sonnet-20241022, o1-mini-2024-09-12, models/gemini-2.0-flash-exp.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "The full system prompt (Appendix E1) and all 50 evaluation prompts with variations (Appendix I) are reproduced verbatim, including the structured output format and planning examples.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Temperature is the primary reported hyperparameter: set to 0 for all models except o1-mini (fixed at 1 by API); the temperature ablation confirms this choice.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The HIVE scaffolding is described in detail: the plan structure, behavior tree grammar (Appendix G), available behavior trees (Appendix H), plan parsing and validation, objective checking logic, and unit assignment pipeline.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "The conversion of game state to LLM input is documented: unit positions/health are passed as coordinate lists; map terrain is converted to precomputed textual descriptions; examples shown in Appendix E2 and Figures 12–13.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Per-run outcomes for each of the 450 LLM queries (9 models × 50 prompts) are not provided as downloadable data; only aggregate figures and summary tables are in the paper.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "The data collection procedure is described: HIVE sends each of 10 prompt variations per ability test to each LLM, parses the returned plan, executes it in simulation with a fixed random seed, and records outcomes.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants are recruited; the 'player' role is filled by the researchers using pre-defined prompts.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The full pipeline is described: game state serialization → LLM API query → plan parsing/validation → behavior tree assignment → simulation execution → objective checking → outcome recording (Sections III-C through III-E).",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "Training data cutoffs are not stated for any of the nine evaluated models, despite evaluating on a benchmark that could have been partially available to some models.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "The paper does not discuss whether the game mechanics, map descriptions, or prompt formats used in the benchmark could have appeared in LLM training corpora.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "The benchmark is new and custom-built, making contamination unlikely but not impossible (e.g., similar JAX-based game environments exist); the paper does not address this.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Inference wall-time is reported for all 9 models (Figure 10), with per-model quartile ranges; e.g., Sonnet 12.6 s [9.9 s, 14.2 s], Gemini 2 3.9 s [3.0 s, 6.2 s].",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No total API cost or GPU-hours budget is stated; only per-query latency is reported.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "HIVE using Claude Sonnet 3.5 successfully solves all five ability tests (at least one prompt variation each), demonstrating it possesses all proposed coordination capabilities.",
    375       "evidence": "Table I: Sonnet achieves 19/50 total wins including at least 1 win per ability; Figure 4 shows successful plans for each test type.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Human-machine collaboration improves HIVE's performance compared to LLM-only control.",
    380       "evidence": "Figure 8: Without human prompts, both 4o and Sonnet win zero Coordinate and Exploit Terrain tasks vs. several wins with human help; median continuous performance drops across all scenarios.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Textual map descriptions produce better performance than visual (image-based) map inputs for all tested LLMs.",
    385       "evidence": "Figure 9: Switching from text to raw/grid/scaffolding images reduces win rates in Exploit Terrain and Strategize Points for both Sonnet and 4o.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Current LLMs are sensitive to small variations in prompt wording, producing 'drastic changes in the plan and execution.'",
    390       "evidence": "10 prompt variations per ability test show high variance in outcomes (Figure 6); the authors explicitly state this finding as a key takeaway in Section V.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Scaling the number of units from 200 to 4,000 does not significantly degrade Sonnet's plan success rate, though 4o produces more invalid plans at higher unit counts.",
    395       "evidence": "Figure 7 shows no clear trend for Sonnet but increasing invalid/empty plans for 4o; authors acknowledge 'too much noise' to draw strong conclusions.",
    396       "supported": "weak"
    397     },
    398     {
    399       "claim": "None of the tested LLMs exceed 50% success on the harder ability tests (Coordinate, Exploit Weakness, Exploit Terrain, Strategize Points).",
    400       "evidence": "Table I: Best scores on these tests are Sonnet 4/10 (Exploit Terrain), Gemini 2 3/10 (Exploit Weakness), o1-mini 4/10 (Strategize Points).",
    401       "supported": "strong"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "case-study"
    407   ],
    408   "key_findings": "HIVE enables a single human operator to coordinate swarms of up to 2,000 agents in a real-time strategy game via natural language LLM interaction, with Claude Sonnet 3.5 achieving the best performance (19/50 task wins) across five ability tests. Human-AI collaboration substantially outperforms LLM-only control, particularly for tasks requiring terrain exploitation and long-term planning. Current LLMs exhibit significant sensitivity to prompt wording variations and struggle with visual spatial reasoning—textual map descriptions consistently outperform image-based inputs. The benchmark discriminates meaningfully between models: Llama3-8B fails entirely, while frontier models succeed on simpler tasks but plateau below 50% on complex coordination scenarios.",
    409   "red_flags": [
    410     {
    411       "flag": "Underpowered comparison",
    412       "detail": "Only 10 prompt variations per ability test per model; model differences (e.g., Sonnet 19/50 vs 4o 12/50) are presented as meaningful without significance testing, making it impossible to distinguish real capability gaps from sampling noise."
    413     },
    414     {
    415       "flag": "No code or benchmark release",
    416       "detail": "The game engine, scenario maps, and evaluation harness are not released; only demo videos and prompt text are available, making independent replication infeasible."
    417     },
    418     {
    419       "flag": "Funder-author overlap",
    420       "detail": "armasuisse (Swiss defense agency) funds the work and has four co-authors on the paper, evaluating a system with direct military coordination applications without independent outcome review."
    421     },
    422     {
    423       "flag": "Single narrow environment",
    424       "detail": "All quantitative results come from one custom strategy game with three unit types and four hand-crafted scenarios; generalization claims to disaster response or urban planning are unsupported."
    425     },
    426     {
    427       "flag": "Ephemeral model versions",
    428       "detail": "Two of the tested models are experimental/preview snapshots (gemini-2.0-flash-exp, o1-mini preview) that may not be reproducible; API behavior of closed-source models can change silently."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "Human-level play in the game of Diplomacy by combining language models with strategic reasoning",
    434       "relevance": "Key prior work (Cicero) demonstrating LLMs can achieve human-level performance in a complex strategic game through natural language negotiation."
    435     },
    436     {
    437       "title": "SwarmBrain: Embodied agent for real-time strategy game StarCraft II via large language models",
    438       "relevance": "Most directly related prior work combining LLM strategic planning with non-LLM tactical execution in an RTS game; HIVE differentiates by using fully LLM-generated structured plans."
    439     },
    440     {
    441       "title": "The StarCraft Multi-Agent Challenge",
    442       "relevance": "Established benchmark for multi-agent control that HIVE's work extends and differentiates from by adding LLM-based strategic planning."
    443     },
    444     {
    445       "title": "JaxMARL: Multi-Agent RL Environments in JAX",
    446       "relevance": "The SMAX environment HIVE builds on; provides the JAX-based parallelization infrastructure enabling large-scale unit simulation."
    447     },
    448     {
    449       "title": "Large Language Model based Multi-Agents: A Survey of Progress and Challenges",
    450       "relevance": "Comprehensive survey covering agent profiling, communication, and environment interaction for LLM multi-agent systems."
    451     },
    452     {
    453       "title": "BALROG: Benchmarking Agentic LLM and VLM Reasoning On Games",
    454       "relevance": "Contemporaneous benchmark showing LLMs struggle in complex game environments; supports HIVE's findings on LLM limitations."
    455     },
    456     {
    457       "title": "Vision language models are blind",
    458       "relevance": "Supports the paper's finding that VLMs struggle with spatial/visual reasoning tasks, motivating HIVE's use of textual map descriptions."
    459     },
    460     {
    461       "title": "Robot behavior-tree-based task generation with large language models",
    462       "relevance": "Prior work using LLMs to generate behavior trees in robotics; HIVE extends this to multi-agent game control."
    463     }
    464   ],
    465   "engagement_factors": {
    466     "practical_relevance": {
    467       "score": 2,
    468       "justification": "The natural-language swarm control paradigm has concrete applications in robotics and human-computer interfaces, though the game-specific implementation limits immediate transferability."
    469     },
    470     "surprise_contrarian": {
    471       "score": 1,
    472       "justification": "Results largely confirm expectations: better models perform better, human guidance helps, visual reasoning is weak; the finding that text beats images for spatial coordination is notable but not surprising."
    473     },
    474     "fear_safety": {
    475       "score": 2,
    476       "justification": "The system is explicitly funded by a defense procurement agency and designed for coordinating thousands of agents in adversarial scenarios, raising autonomous weapons implications that the paper does not address."
    477     },
    478     "drama_conflict": {
    479       "score": 1,
    480       "justification": "No significant controversy; the defense funding angle is a mild concern but not a central conflict in the paper."
    481     },
    482     "demo_ability": {
    483       "score": 3,
    484       "justification": "Demo videos are live at hive.syrkis.com showing the system in action; the system is clearly implemented and runnable, though code is not released for self-hosting."
    485     },
    486     "brand_recognition": {
    487       "score": 1,
    488       "justification": "Authors are from IT University of Copenhagen and armasuisse; no famous AI lab affiliation though the paper tests Claude, GPT-4o, and Gemini models."
    489     }
    490   },
    491   "hn_data": {
    492     "threads": [
    493       {
    494         "hn_id": "42448193",
    495         "title": "No More Adam: Learning Rate Scaling at Initialization Is All You Need",
    496         "points": 91,
    497         "comments": 28,
    498         "url": "https://news.ycombinator.com/item?id=42448193",
    499         "created_at": "2024-12-18T04:49:55Z"
    500       },
    501       {
    502         "hn_id": "43402629",
    503         "title": "Drowning in Documents: Consequences of Scaling Reranker Inference",
    504         "points": 2,
    505         "comments": 0,
    506         "url": "https://news.ycombinator.com/item?id=43402629",
    507         "created_at": "2025-03-18T18:04:19Z"
    508       },
    509       {
    510         "hn_id": "41858887",
    511         "title": "Language Models Encode Numbers Using Digit Representations in Base 10",
    512         "points": 2,
    513         "comments": 0,
    514         "url": "https://news.ycombinator.com/item?id=41858887",
    515         "created_at": "2024-10-16T13:35:00Z"
    516       },
    517       {
    518         "hn_id": "39033716",
    519         "title": "Large Language Models for Generative Information Extraction",
    520         "points": 2,
    521         "comments": 0,
    522         "url": "https://news.ycombinator.com/item?id=39033716",
    523         "created_at": "2024-01-17T21:26:22Z"
    524       }
    525     ],
    526     "top_points": 91,
    527     "total_points": 97,
    528     "total_comments": 28
    529   }
    530 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs