scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27786B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "DoVer: Intervention-Driven Auto Debugging for LLM Multi-Agent Systems",
      6     "authors": [
      7       "Ming-Jie Ma",
      8       "Jue Zhang",
      9       "Fangkai Yang",
     10       "Yu Kang",
     11       "Qingwei Lin",
     12       "Saravan Rajmohan",
     13       "Dongmei Zhang"
     14     ],
     15     "year": 2025,
     16     "venue": "arXiv.org",
     17     "arxiv_id": "2512.06749",
     18     "doi": "10.48550/arXiv.2512.06749"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Abstract claims of 18–28% flip rate and 49% GSMPlus recovery are supported by Tables 2–3; the 30–60% hypothesis validation range matches Table 3 (validated+refuted across datasets).",
     26         "source": "haiku"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Causal claim that DoVer interventions cause failure recovery is supported by comparison against Self-Refine and CRITIC baselines both achieving 0% recovery vs DoVer's 17.6–27.5%, and by ablation studies varying models.",
     32         "source": "haiku"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Section 7 explicitly states results 'should be interpreted as evidence of feasibility rather than universal guarantees' and enumerates specific constraints on covered frameworks, task types, and architectures.",
     38         "source": "haiku"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "Section 5.5 discusses the 29–67% inconclusive cases as arising from sub-agent capability gaps rather than incorrect hypotheses, and Section 3 discusses multiple competing sources of annotation uncertainty.",
     44         "source": "haiku"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper carefully distinguishes Trial Success Rate (task completion), Progress Made (milestone advancement), and hypothesis validation (Validated/Partially Validated/Refuted/Inconclusive), with explicit acknowledgment that LLM-as-a-judge evaluation may introduce biases.",
     50         "source": "haiku"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Section 7 is explicitly titled 'LIMITATIONS AND GENERALIZABILITY' and spans over a full page with specific discussion.",
     58         "source": "haiku"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Specific threats include: restriction to two agent frameworks, requirement for checkpoint/replay interfaces, interventions limited to orchestrator text messages (cannot modify sub-agent code), and LLM-as-a-judge bias in milestone and validation assessments.",
     64         "source": "haiku"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Section 7 explicitly states the work does not cover 'long-running production workloads, domains with strict latency or cost constraints, or settings with safety-critical requirements' and that checkpointing requires 'non-trivial engineering effort.'",
     70         "source": "haiku"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "The acknowledgements section thanks reviewers and collaborators but contains no funding disclosure statement.",
     78         "source": "haiku"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Author affiliations are clearly stated on the title page: Chinese Academy of Sciences and Microsoft; Microsoft employees evaluate primarily Microsoft's Magentic-One and AutoGen2 frameworks.",
     84         "source": "haiku"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "The majority of authors are from Microsoft and the primary evaluation framework (Magentic-One) and secondary framework (AG2/AutoGen2) are both Microsoft products, creating a direct conflict between funder affiliation and outcome.",
     90         "source": "haiku"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests or financial interests statement appears anywhere in the paper.",
     96         "source": "haiku"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "'Failure' is precisely defined (executes without interruption but produces incorrect/unsatisfactory results), 'Trial' is defined as a contiguous planning–execution span, and intervention categories (orchestrator_ledger, orchestrator_instruction, subagent_instruction) are enumerated.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Three explicit contributions are itemized in the introduction: (i) the DoVer framework, (ii) analysis of ground-truth annotation uncertainty, (iii) experimental demonstration of failure recovery.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 2 provides structured related work distinguishing failure-attribution work from debugging/repair work, and Section 5.3 compares against Self-Refine and CRITIC; the paper also explicitly positions against the concurrent Who&When attribution approach.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "The abstract states 'Project website and code will be available at https://aka.ms/DoVer' — a future-release promise, not a current release; the anonymous repository referenced in Appendix C is not a public release.",
    127           "source": "haiku"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "All evaluation datasets (GAIA, AssistantBench, GSMPlus) are publicly available standard benchmarks; the WW dataset from Zhang et al. (2025c) is published.",
    133           "source": "haiku"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "The paper specifies model versions (GPT-4o-20241120, GPT-5-chat-20250807, Qwen3-8B/32B) and mentions 'Azure OpenAI using default parameters,' but provides no requirements file, Dockerfile, or comprehensive dependency specification.",
    139           "source": "haiku"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "No step-by-step reproduction instructions are provided; the code is not yet released and Appendix C describes integration effort at a high level without runnable instructions.",
    145           "source": "haiku"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "Table 5 (reproduction study) reports standard deviations, but Tables 2 and 3 (main DoVer results) report no CIs or error bars despite the paper stating three independent runs were performed per intervention.",
    153           "source": "haiku"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "No statistical significance tests are applied to any comparative claims; performance differences are reported as raw percentages without testing.",
    159           "source": "haiku"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Effect sizes are reported as flip rates (17.6%, 27.5%, 49%) with clear baseline context (0% for Self-Refine/CRITIC), and milestone progress is quantified as percentage gain.",
    165           "source": "haiku"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "Sample sizes are small (26–45 cases per benchmark split) and no power analysis or justification for these sizes is provided.",
    171           "source": "haiku"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "Main results tables (2 and 3) report only point estimates; variance across the three independent runs per intervention is not reported.",
    177           "source": "haiku"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Section 5.3 compares against Self-Refine-style and CRITIC-style baselines, both achieving 0% recovery on WW-GAIA.",
    185           "source": "haiku"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Self-Refine (2023) and CRITIC (2023) are the standard self-improvement paradigm comparators; they are reasonable contemporaries for the self-correction approach, though not specifically designed for multi-agent debugging.",
    191           "source": "haiku"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Table 4 ablates DoVer's underlying model (Qwen3-8B, Qwen3-32B vs GPT-4o) and prompting strategy (0-shot vs 3-shot), demonstrating component contributions.",
    197           "source": "haiku"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Evaluation uses Trial Success Rate, Progress Made (milestone advancement), and a four-category hypothesis validation taxonomy (Validated/Partially Validated/Refuted/Inconclusive).",
    203           "source": "haiku"
    204         },
    205         "human_evaluation": {
    206           "applies": true,
    207           "answer": false,
    208           "justification": "No human evaluation of DoVer's outputs is performed; milestone evaluation and hypothesis validation both use LLM-as-a-judge (GPT-5 specified in Section 5.1).",
    209           "source": "haiku"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "GAIA Level-1 validation set cases not in WW provide a held-out evaluation; independence of the benchmark cases from model training data is assumed in principle but is not verified by the paper.",
    215           "source": "haiku"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Results are broken down by dataset (WW-AB, WW-GAIA, GAIA-Level-1, GSMPlus) and by hypothesis outcome category (Validated/Inconclusive/Partially Validated/Refuted) in Tables 2 and 3.",
    221           "source": "haiku"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Section 5.4 presents qualitative case studies for Refuted and Inconclusive outcomes; Section 5.5 analyzes the 29–67% inconclusive rate and identifies specific sub-agent bottlenecks (missing scroll-to-bottom tool, PDF handling).",
    227           "source": "haiku"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": true,
    232           "justification": "WW-AB Progress Made is reported as '+0%' (interventions may hinder progress), 60–67% inconclusive rate in WW is reported honestly, and Self-Refine/CRITIC 0% recovery is explicitly stated.",
    233           "source": "haiku"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "Exact model versions are specified: 'GPT-4o-20241120' and 'GPT-5-chat-20250807' in Section 3 footnote; Qwen3-8B and Qwen3-32B in Table 4.",
    241           "source": "haiku"
    242         },
    243         "prompts_provided": {
    244           "applies": true,
    245           "answer": true,
    246           "justification": "Appendix B provides all six prompts in full: Trial Segmenter (Fig. 5), Failure Proposer (Figs. 6–7), Intervention Recommender (Fig. 8), Milestone Extractor (Fig. 9), Milestone Evaluator (Fig. 10), and Post-Intervention Classifier (Fig. 11).",
    247           "source": "haiku"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": false,
    252           "justification": "The paper states 'All LLM API calls are made through Azure OpenAI using default parameters' but does not specify what those defaults are (temperature, top-p, max tokens, etc.).",
    253           "source": "haiku"
    254         },
    255         "scaffolding_described": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "Section 4 describes the DoVer pipeline in detail (trial segmentation, failure attribution, intervention generation, execution); Appendix C describes the checkpointing/replay integration for AG2.",
    259           "source": "haiku"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "Section 5.1 describes failure trace collection ('initial run over all cases to identify failure traces'), explains why WW/MAST logs are not directly usable, and documents checkpoint-based re-collection.",
    265           "source": "haiku"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": false,
    272           "justification": "The collected failure traces with checkpoints are not released; code is promised as future release and the anonymous repository is not a public release.",
    273           "source": "haiku"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "Section 5.1 describes the data collection procedure: initial execution runs to identify failures, checkpoint capture at each step, and why existing WW/MAST logs required re-collection.",
    279           "source": "haiku"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "No human participants; all evaluation uses standard benchmarks (GAIA, AssistantBench, GSMPlus).",
    285           "source": "haiku"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": true,
    290           "justification": "The full pipeline from initial run → failure identification → trial segmentation → hypothesis generation → intervention → re-execution → scoring is documented across Sections 4–5 and Appendix C.",
    291           "source": "haiku"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "The training data cutoffs for GPT-4o-20241120 and GPT-5-chat-20250807 are not stated; GAIA is a public benchmark that may be in training data.",
    299           "source": "haiku"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "No discussion of potential overlap between GAIA/AssistantBench benchmark examples and GPT-4o or GPT-5 training data.",
    305           "source": "haiku"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "GAIA and AssistantBench are publicly available benchmarks predating GPT-4o's training cutoff; the paper does not address whether benchmark examples were seen during pretraining.",
    311           "source": "haiku"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants in this study.",
    319           "source": "haiku"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants in this study.",
    325           "source": "haiku"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants in this study.",
    331           "source": "haiku"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants in this study.",
    337           "source": "haiku"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants in this study.",
    343           "source": "haiku"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants in this study.",
    349           "source": "haiku"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participants in this study.",
    355           "source": "haiku"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "No inference cost or API cost estimates are reported despite using GPT-4o and GPT-5 for all runs, including three independent repeats per intervention across hundreds of trials.",
    363           "source": "haiku"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "No total compute budget or wall-clock time is reported.",
    369           "source": "haiku"
    370         }
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "DoVer recovers 18–28% of failed trials on GAIA and AssistantBench under the Magentic-One framework.",
    377       "evidence": "Table 2 reports 17.6% for WW-AB/WW-GAIA combined and 27.5% for GAIA-Level-1.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "DoVer achieves 49% trial success rate on GSMPlus with the AG2/AutoGen2 framework, demonstrating generality.",
    382       "evidence": "Table 2, GSMPlus row: 198 intervened trials, 49.0% success rate.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Log-based failure attribution suffers from substantial ground-truth annotation uncertainty (~48% of examined cases).",
    387       "evidence": "Section 3 reports 14 of 29 GAIA cases in WW exhibit GT uncertainty; annotator initial disagreement of ~20% reported by WW itself.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "Prompt refinements (step indexing + guidance reminders) improve GPT-4o step attribution accuracy from 6% to 24% on WW-HC.",
    392       "evidence": "Table 5: baseline GPT-4o 6.04% step accuracy; +Step Index 20.69%; +Guidance 23.56%.",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "Self-Refine and CRITIC self-improvement baselines achieve 0% failure recovery on WW-GAIA.",
    397       "evidence": "Section 5.3 explicitly states neither baseline flips any failure into success across all 26 WW-GAIA failed cases.",
    398       "supported": "strong"
    399     },
    400     {
    401       "claim": "DoVer validates or refutes 30–60% of failure hypotheses depending on task complexity.",
    402       "evidence": "Table 3: GAIA-Level-1 achieves 34.9%+23.8%=58.7% validated+refuted; WW splits achieve ~30% each.",
    403       "supported": "moderate"
    404     }
    405   ],
    406   "methodology_tags": [
    407     "benchmark-eval",
    408     "case-study"
    409   ],
    410   "key_findings": "DoVer is an intervention-driven debugging framework for LLM multi-agent systems that operationalizes failure diagnosis by applying targeted edits to suspected failure points and re-executing traces, recovering 18–28% of GAIA/AssistantBench failures and 49% of GSMPlus failures versus 0% for self-improvement baselines. The paper also demonstrates that log-based failure attribution is fundamentally limited by annotation uncertainty (~48% of examined GAIA cases have ambiguous ground-truth labels), motivating the outcome-oriented evaluation. A significant limitation is the 29–67% inconclusive rate, primarily because orchestrator-level interventions cannot address sub-agent capability gaps. The work is from predominantly Microsoft-affiliated authors evaluating primarily Microsoft-developed frameworks (Magentic-One, AutoGen2), raising potential affiliation bias.",
    411   "red_flags": [
    412     {
    413       "flag": "Small evaluation samples, no power analysis",
    414       "detail": "Core evaluation uses only 26–45 failed cases per benchmark split; no power analysis or justification for sample size is provided, limiting statistical conclusions."
    415     },
    416     {
    417       "flag": "Microsoft authors evaluating Microsoft frameworks",
    418       "detail": "Majority of authors are Microsoft employees and the primary evaluation frameworks (Magentic-One, AutoGen2/AG2) are Microsoft products; affiliations are listed, but no competing-interests statement acknowledges this conflict."
    419     },
    420     {
    421       "flag": "Main results lack variance despite 3 repeats",
    422       "detail": "Tables 2–3 report only point estimates with no standard deviations or CIs despite running three independent intervention runs per trial, obscuring reliability."
    423     },
    424     {
    425       "flag": "Benchmark contamination not addressed",
    426       "detail": "GAIA and AssistantBench are public benchmarks that were available before GPT-4o/5 training cutoffs; potential contamination is not discussed."
    427     },
    428     {
    429       "flag": "Code not released at submission",
    430       "detail": "Abstract promises future availability ('will be available'); no public code exists to reproduce results."
    431     },
    432     {
    433       "flag": "No funding disclosure",
    434       "detail": "No funding statement appears in the paper despite Microsoft institutional affiliation."
    435     }
    436   ],
    437   "cited_papers": [
    438     {
    439       "title": "Magentic-One: A Generalist Multi-Agent System for Solving Complex Tasks",
    440       "relevance": "Primary agent framework used for evaluation; DoVer is integrated with Magentic-One's checkpointing infrastructure."
    441     },
    442     {
    443       "title": "GAIA: A Benchmark for General AI Assistants",
    444       "relevance": "Core evaluation benchmark; GAIA Level-1/2/3 failure cases form the primary test set."
    445     },
    446     {
    447       "title": "Why Do Multi-Agent LLM Systems Fail? (MAST)",
    448       "relevance": "Provides failure taxonomy for multi-agent systems; supplies the MathChat/GSMPlus experimental setup used in AG2 evaluation."
    449     },
    450     {
    451       "title": "Which Agent Causes Task Failures and When? (Who&When)",
    452       "relevance": "The log-based attribution benchmark and dataset that DoVer analyzes and critiques; provides the WW failure traces and baseline method."
    453     },
    454     {
    455       "title": "TRAIL: Trace Reasoning and Agentic Issue Localization",
    456       "relevance": "Concurrent work on turn-level failure taxonomy and long-context trace debugging; shows strong models still struggle."
    457     },
    458     {
    459       "title": "Interactive Debugging and Steering of Multi-Agent AI Systems (AGDebugger)",
    460       "relevance": "Human-in-the-loop debugging tool that DoVer adapts to enable automated checkpointing and replay."
    461     },
    462     {
    463       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    464       "relevance": "The agent execution pattern (planning–execution cycles) that creates the multi-trial structure DoVer exploits."
    465     },
    466     {
    467       "title": "Self-Refine: Iterative Refinement with Self-Feedback",
    468       "relevance": "Baseline self-improvement method compared against DoVer in the Section 5.3 baseline comparison (alongside CRITIC)."
    469     },
    470     {
    471       "title": "AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?",
    472       "relevance": "One of the two main evaluation benchmarks; provides the WW-AB failure cases."
    473     },
    474     {
    475       "title": "AgentDebug / Where LLM Agents Fail and How They Can Learn from Failures",
    476       "relevance": "Concurrent intervention-driven debugging work similar to DoVer; acknowledged as parallel development."
    477     }
    478   ],
    479   "engagement_factors": {
    480     "practical_relevance": {
    481       "score": 3,
    482       "justification": "Directly addresses multi-agent system debugging, a concrete pain point for any team deploying LLM agents in production."
    483     },
    484     "surprise_contrarian": {
    485       "score": 2,
    486       "justification": "Challenges the prevailing log-based attribution paradigm by showing ~48% of ground-truth annotations are uncertain and that self-improvement baselines achieve 0% recovery."
    487     },
    488     "fear_safety": {
    489       "score": 1,
    490       "justification": "Addresses reliability of agentic systems but does not raise safety or harm concerns."
    491     },
    492     "drama_conflict": {
    493       "score": 1,
    494       "justification": "Mild methodological critique of the Who&When benchmark's annotation quality; not a high-profile controversy."
    495     },
    496     "demo_ability": {
    497       "score": 2,
    498       "justification": "Figure 4 shows a working web-based intervention interface for AG2 MathChat, but code is not yet publicly released."
    499     },
    500     "brand_recognition": {
    501       "score": 2,
    502       "justification": "Microsoft affiliation and use of Magentic-One and AutoGen2 (known Microsoft products) provides moderate brand recognition."
    503     }
    504   },
    505   "hn_data": {
    506     "threads": [
    507       {
    508         "hn_id": "42378335",
    509         "title": "Training LLMs to Reason in a Continuous Latent Space",
    510         "points": 283,
    511         "comments": 114,
    512         "url": "https://news.ycombinator.com/item?id=42378335",
    513         "created_at": "2024-12-10T16:26:17Z"
    514       },
    515       {
    516         "hn_id": "43042753",
    517         "title": "LM2: Large Memory Models",
    518         "points": 110,
    519         "comments": 30,
    520         "url": "https://news.ycombinator.com/item?id=43042753",
    521         "created_at": "2025-02-13T23:21:21Z"
    522       },
    523       {
    524         "hn_id": "29568816",
    525         "title": "Proof of Steak",
    526         "points": 79,
    527         "comments": 28,
    528         "url": "https://news.ycombinator.com/item?id=29568816",
    529         "created_at": "2021-12-15T17:16:25Z"
    530       },
    531       {
    532         "hn_id": "30078848",
    533         "title": "Phishing in organizations: Findings from a large-scale and long-term study",
    534         "points": 30,
    535         "comments": 10,
    536         "url": "https://news.ycombinator.com/item?id=30078848",
    537         "created_at": "2022-01-25T22:11:11Z"
    538       },
    539       {
    540         "hn_id": "42456288",
    541         "title": "Rethinking the Combination of Graph Neural Network and Large Language Model",
    542         "points": 2,
    543         "comments": 0,
    544         "url": "https://news.ycombinator.com/item?id=42456288",
    545         "created_at": "2024-12-18T22:41:39Z"
    546       },
    547       {
    548         "hn_id": "38762672",
    549         "title": "Building Trustworthy NeuroSymbolic AI Systems",
    550         "points": 2,
    551         "comments": 0,
    552         "url": "https://news.ycombinator.com/item?id=38762672",
    553         "created_at": "2023-12-25T14:04:27Z"
    554       },
    555       {
    556         "hn_id": "29485809",
    557         "title": "Deep learning for elliptic and parabolic boundary value problems",
    558         "points": 2,
    559         "comments": 0,
    560         "url": "https://news.ycombinator.com/item?id=29485809",
    561         "created_at": "2021-12-08T15:22:21Z"
    562       },
    563       {
    564         "hn_id": "42470646",
    565         "title": "SpikeFI: A Fault Injection Framework for Spiking Neural Networks",
    566         "points": 1,
    567         "comments": 0,
    568         "url": "https://news.ycombinator.com/item?id=42470646",
    569         "created_at": "2024-12-20T12:47:13Z"
    570       }
    571     ],
    572     "top_points": 283,
    573     "total_points": 509,
    574     "total_comments": 182
    575   }
    576 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs