scan-v5.json (27786B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "DoVer: Intervention-Driven Auto Debugging for LLM Multi-Agent Systems", 6 "authors": [ 7 "Ming-Jie Ma", 8 "Jue Zhang", 9 "Fangkai Yang", 10 "Yu Kang", 11 "Qingwei Lin", 12 "Saravan Rajmohan", 13 "Dongmei Zhang" 14 ], 15 "year": 2025, 16 "venue": "arXiv.org", 17 "arxiv_id": "2512.06749", 18 "doi": "10.48550/arXiv.2512.06749" 19 }, 20 "checklist": { 21 "claims_and_evidence": { 22 "abstract_claims_supported": { 23 "applies": true, 24 "answer": true, 25 "justification": "Abstract claims of 18–28% flip rate and 49% GSMPlus recovery are supported by Tables 2–3; the 30–60% hypothesis validation range matches Table 3 (validated+refuted across datasets).", 26 "source": "haiku" 27 }, 28 "causal_claims_justified": { 29 "applies": true, 30 "answer": true, 31 "justification": "Causal claim that DoVer interventions cause failure recovery is supported by comparison against Self-Refine and CRITIC baselines both achieving 0% recovery vs DoVer's 17.6–27.5%, and by ablation studies varying models.", 32 "source": "haiku" 33 }, 34 "generalization_bounded": { 35 "applies": true, 36 "answer": true, 37 "justification": "Section 7 explicitly states results 'should be interpreted as evidence of feasibility rather than universal guarantees' and enumerates specific constraints on covered frameworks, task types, and architectures.", 38 "source": "haiku" 39 }, 40 "alternative_explanations_discussed": { 41 "applies": true, 42 "answer": true, 43 "justification": "Section 5.5 discusses the 29–67% inconclusive cases as arising from sub-agent capability gaps rather than incorrect hypotheses, and Section 3 discusses multiple competing sources of annotation uncertainty.", 44 "source": "haiku" 45 }, 46 "proxy_outcome_distinction": { 47 "applies": true, 48 "answer": true, 49 "justification": "The paper carefully distinguishes Trial Success Rate (task completion), Progress Made (milestone advancement), and hypothesis 
validation (Validated/Partially Validated/Refuted/Inconclusive), with explicit acknowledgment that LLM-as-a-judge evaluation may introduce biases.", 50 "source": "haiku" 51 } 52 }, 53 "limitations_and_scope": { 54 "limitations_section_present": { 55 "applies": true, 56 "answer": true, 57 "justification": "Section 7 is explicitly titled 'LIMITATIONS AND GENERALIZABILITY' and spans over a full page with specific discussion.", 58 "source": "haiku" 59 }, 60 "threats_to_validity_specific": { 61 "applies": true, 62 "answer": true, 63 "justification": "Specific threats include: restriction to two agent frameworks, requirement for checkpoint/replay interfaces, interventions limited to orchestrator text messages (cannot modify sub-agent code), and LLM-as-a-judge bias in milestone and validation assessments.", 64 "source": "haiku" 65 }, 66 "scope_boundaries_stated": { 67 "applies": true, 68 "answer": true, 69 "justification": "Section 7 explicitly states the work does not cover 'long-running production workloads, domains with strict latency or cost constraints, or settings with safety-critical requirements' and that checkpointing requires 'non-trivial engineering effort.'", 70 "source": "haiku" 71 } 72 }, 73 "conflicts_of_interest": { 74 "funding_disclosed": { 75 "applies": true, 76 "answer": false, 77 "justification": "The acknowledgements section thanks reviewers and collaborators but contains no funding disclosure statement.", 78 "source": "haiku" 79 }, 80 "affiliations_disclosed": { 81 "applies": true, 82 "answer": true, 83 "justification": "Author affiliations are clearly stated on the title page: Chinese Academy of Sciences and Microsoft; Microsoft employees evaluate primarily Microsoft's Magentic-One and AutoGen2 frameworks.", 84 "source": "haiku" 85 }, 86 "funder_independent_of_outcome": { 87 "applies": true, 88 "answer": false, 89 "justification": "The majority of authors are from Microsoft and the primary evaluation framework (Magentic-One) and secondary framework (AG2/AutoGen2) are 
both Microsoft products, creating a direct conflict between funder affiliation and outcome.", 90 "source": "haiku" 91 }, 92 "financial_interests_declared": { 93 "applies": true, 94 "answer": false, 95 "justification": "No competing interests or financial interests statement appears anywhere in the paper.", 96 "source": "haiku" 97 } 98 }, 99 "scope_and_framing": { 100 "key_terms_defined": { 101 "applies": true, 102 "answer": true, 103 "justification": "'Failure' is precisely defined (executes without interruption but produces incorrect/unsatisfactory results), 'Trial' is defined as a contiguous planning–execution span, and intervention categories (orchestrator_ledger, orchestrator_instruction, subagent_instruction) are enumerated.", 104 "source": "haiku" 105 }, 106 "intended_contribution_clear": { 107 "applies": true, 108 "answer": true, 109 "justification": "Three explicit contributions are itemized in the introduction: (i) the DoVer framework, (ii) analysis of ground-truth annotation uncertainty, (iii) experimental demonstration of failure recovery.", 110 "source": "haiku" 111 }, 112 "engagement_with_prior_work": { 113 "applies": true, 114 "answer": true, 115 "justification": "Section 2 provides structured related work distinguishing failure-attribution work from debugging/repair work, and Section 5.3 compares against Self-Refine and CRITIC; the paper also explicitly positions against the concurrent Who&When attribution approach.", 116 "source": "haiku" 117 } 118 } 119 }, 120 "type_checklist": { 121 "empirical": { 122 "artifacts": { 123 "code_released": { 124 "applies": true, 125 "answer": false, 126 "justification": "The abstract states 'Project website and code will be available at https://aka.ms/DoVer' — a future-release promise, not a current release; the anonymous repository referenced in Appendix C is not a public release.", 127 "source": "haiku" 128 }, 129 "data_released": { 130 "applies": true, 131 "answer": true, 132 "justification": "All evaluation 
datasets (GAIA, AssistantBench, GSMPlus) are publicly available standard benchmarks; the WW dataset from Zhang et al. (2025c) is published.", 133 "source": "haiku" 134 }, 135 "environment_specified": { 136 "applies": true, 137 "answer": false, 138 "justification": "The paper specifies model versions (GPT-4o-20241120, GPT-5-chat-20250807, Qwen3-8B/32B) and mentions 'Azure OpenAI using default parameters,' but provides no requirements file, Dockerfile, or comprehensive dependency specification.", 139 "source": "haiku" 140 }, 141 "reproduction_instructions": { 142 "applies": true, 143 "answer": false, 144 "justification": "No step-by-step reproduction instructions are provided; the code is not yet released and Appendix C describes integration effort at a high level without runnable instructions.", 145 "source": "haiku" 146 } 147 }, 148 "statistical_methodology": { 149 "confidence_intervals_or_error_bars": { 150 "applies": true, 151 "answer": false, 152 "justification": "Table 5 (reproduction study) reports standard deviations, but Tables 2 and 3 (main DoVer results) report no CIs or error bars despite the paper stating three independent runs were performed per intervention.", 153 "source": "haiku" 154 }, 155 "significance_tests": { 156 "applies": true, 157 "answer": false, 158 "justification": "No statistical significance tests are applied to any comparative claims; performance differences are reported as raw percentages without testing.", 159 "source": "haiku" 160 }, 161 "effect_sizes_reported": { 162 "applies": true, 163 "answer": true, 164 "justification": "Effect sizes are reported as flip rates (17.6%, 27.5%, 49%) with clear baseline context (0% for Self-Refine/CRITIC), and milestone progress is quantified as percentage gain.", 165 "source": "haiku" 166 }, 167 "sample_size_justified": { 168 "applies": true, 169 "answer": false, 170 "justification": "Sample sizes are small (26–45 cases per benchmark split) and no power analysis or justification for these sizes is 
provided.", 171 "source": "haiku" 172 }, 173 "variance_reported": { 174 "applies": true, 175 "answer": false, 176 "justification": "Main results tables (2 and 3) report only point estimates; variance across the three independent runs per intervention is not reported.", 177 "source": "haiku" 178 } 179 }, 180 "evaluation_design": { 181 "baselines_included": { 182 "applies": true, 183 "answer": true, 184 "justification": "Section 5.3 compares against Self-Refine-style and CRITIC-style baselines, both achieving 0% recovery on WW-GAIA.", 185 "source": "haiku" 186 }, 187 "baselines_contemporary": { 188 "applies": true, 189 "answer": true, 190 "justification": "Self-Refine (2023) and CRITIC (2023) are the standard self-improvement paradigm comparators; they are reasonable contemporaries for the self-correction approach, though not specifically designed for multi-agent debugging.", 191 "source": "haiku" 192 }, 193 "ablation_study": { 194 "applies": true, 195 "answer": true, 196 "justification": "Table 4 ablates DoVer's underlying model (Qwen3-8B, Qwen3-32B vs GPT-4o) and prompting strategy (0-shot vs 3-shot), demonstrating component contributions.", 197 "source": "haiku" 198 }, 199 "multiple_metrics": { 200 "applies": true, 201 "answer": true, 202 "justification": "Evaluation uses Trial Success Rate, Progress Made (milestone advancement), and a four-category hypothesis validation taxonomy (Validated/Partially Validated/Refuted/Inconclusive).", 203 "source": "haiku" 204 }, 205 "human_evaluation": { 206 "applies": true, 207 "answer": false, 208 "justification": "No human evaluation of DoVer's outputs is performed; milestone evaluation and hypothesis validation both use LLM-as-a-judge (GPT-5 specified in Section 5.1).", 209 "source": "haiku" 210 }, 211 "held_out_test_set": { 212 "applies": true, 213 "answer": true, 214 "justification": "GAIA Level-1 validation set cases not in WW provide a held-out evaluation, and all benchmark cases are independent of model training data in 
principle.", 215 "source": "haiku" 216 }, 217 "per_category_breakdown": { 218 "applies": true, 219 "answer": true, 220 "justification": "Results are broken down by dataset (WW-AB, WW-GAIA, GAIA-Level-1, GSMPlus) and by hypothesis outcome category (Validated/Inconclusive/Partially Validated/Refuted) in Tables 2 and 3.", 221 "source": "haiku" 222 }, 223 "failure_cases_discussed": { 224 "applies": true, 225 "answer": true, 226 "justification": "Section 5.4 presents qualitative case studies for Refuted and Inconclusive outcomes; Section 5.5 analyzes the 29–67% inconclusive rate and identifies specific sub-agent bottlenecks (missing scroll-to-bottom tool, PDF handling).", 227 "source": "haiku" 228 }, 229 "negative_results_reported": { 230 "applies": true, 231 "answer": true, 232 "justification": "WW-AB Progress Made is reported as '+0%' (interventions may hinder progress), 60–67% inconclusive rate in WW is reported honestly, and Self-Refine/CRITIC 0% recovery is explicitly stated.", 233 "source": "haiku" 234 } 235 }, 236 "setup_transparency": { 237 "model_versions_specified": { 238 "applies": true, 239 "answer": true, 240 "justification": "Exact model versions are specified: 'GPT-4o-20241120' and 'GPT-5-chat-20250807' in Section 3 footnote; Qwen3-8B and Qwen3-32B in Table 4.", 241 "source": "haiku" 242 }, 243 "prompts_provided": { 244 "applies": true, 245 "answer": true, 246 "justification": "Appendix B provides all six prompts in full: Trial Segmenter (Fig. 5), Failure Proposer (Figs. 6–7), Intervention Recommender (Fig. 8), Milestone Extractor (Fig. 9), Milestone Evaluator (Fig. 10), and Post-Intervention Classifier (Fig. 
11).", 247 "source": "haiku" 248 }, 249 "hyperparameters_reported": { 250 "applies": true, 251 "answer": false, 252 "justification": "The paper states 'All LLM API calls are made through Azure OpenAI using default parameters' but does not specify what those defaults are (temperature, top-p, max tokens, etc.).", 253 "source": "haiku" 254 }, 255 "scaffolding_described": { 256 "applies": true, 257 "answer": true, 258 "justification": "Section 4 describes the DoVer pipeline in detail (trial segmentation, failure attribution, intervention generation, execution); Appendix C describes the checkpointing/replay integration for AG2.", 259 "source": "haiku" 260 }, 261 "data_preprocessing_documented": { 262 "applies": true, 263 "answer": true, 264 "justification": "Section 5.1 describes failure trace collection ('initial run over all cases to identify failure traces'), explains why WW/MAST logs are not directly usable, and documents checkpoint-based re-collection.", 265 "source": "haiku" 266 } 267 }, 268 "data_integrity": { 269 "raw_data_available": { 270 "applies": true, 271 "answer": false, 272 "justification": "The collected failure traces with checkpoints are not released; code is promised as future release and the anonymous repository is not a public release.", 273 "source": "haiku" 274 }, 275 "data_collection_described": { 276 "applies": true, 277 "answer": true, 278 "justification": "Section 5.1 describes the data collection procedure: initial execution runs to identify failures, checkpoint capture at each step, and why existing WW/MAST logs required re-collection.", 279 "source": "haiku" 280 }, 281 "recruitment_methods_described": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants; all evaluation uses standard benchmarks (GAIA, AssistantBench, GSMPlus).", 285 "source": "haiku" 286 }, 287 "data_pipeline_documented": { 288 "applies": true, 289 "answer": true, 290 "justification": "The full pipeline from initial run → failure 
identification → trial segmentation → hypothesis generation → intervention → re-execution → scoring is documented across Sections 4–5 and Appendix C.", 291 "source": "haiku" 292 } 293 }, 294 "contamination": { 295 "training_cutoff_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "The training data cutoffs for GPT-4o-20241120 and GPT-5-chat-20250807 are not stated; GAIA is a public benchmark that may be in training data.", 299 "source": "haiku" 300 }, 301 "train_test_overlap_discussed": { 302 "applies": true, 303 "answer": false, 304 "justification": "No discussion of potential overlap between GAIA/AssistantBench benchmark examples and GPT-4o or GPT-5 training data.", 305 "source": "haiku" 306 }, 307 "benchmark_contamination_addressed": { 308 "applies": true, 309 "answer": false, 310 "justification": "GAIA and AssistantBench are publicly available benchmarks predating GPT-4o's training cutoff; the paper does not address whether benchmark examples were seen during pretraining.", 311 "source": "haiku" 312 } 313 }, 314 "human_studies": { 315 "pre_registered": { 316 "applies": false, 317 "answer": false, 318 "justification": "No human participants in this study.", 319 "source": "haiku" 320 }, 321 "irb_or_ethics_approval": { 322 "applies": false, 323 "answer": false, 324 "justification": "No human participants in this study.", 325 "source": "haiku" 326 }, 327 "demographics_reported": { 328 "applies": false, 329 "answer": false, 330 "justification": "No human participants in this study.", 331 "source": "haiku" 332 }, 333 "inclusion_exclusion_criteria": { 334 "applies": false, 335 "answer": false, 336 "justification": "No human participants in this study.", 337 "source": "haiku" 338 }, 339 "randomization_described": { 340 "applies": false, 341 "answer": false, 342 "justification": "No human participants in this study.", 343 "source": "haiku" 344 }, 345 "blinding_described": { 346 "applies": false, 347 "answer": false, 348 "justification": "No human 
participants in this study.", 349 "source": "haiku" 350 }, 351 "attrition_reported": { 352 "applies": false, 353 "answer": false, 354 "justification": "No human participants in this study.", 355 "source": "haiku" 356 } 357 }, 358 "cost_and_practicality": { 359 "inference_cost_reported": { 360 "applies": true, 361 "answer": false, 362 "justification": "No inference cost or API cost estimates are reported despite using GPT-4o and GPT-5 for all runs, including three independent repeats per intervention across hundreds of trials.", 363 "source": "haiku" 364 }, 365 "compute_budget_stated": { 366 "applies": true, 367 "answer": false, 368 "justification": "No total compute budget or wall-clock time is reported.", 369 "source": "haiku" 370 } 371 } 372 } 373 }, 374 "claims": [ 375 { 376 "claim": "DoVer recovers 18–28% of failed trials on GAIA and AssistantBench under the Magentic-One framework.", 377 "evidence": "Table 2 reports 17.6% for WW-AB/WW-GAIA combined and 27.5% for GAIA-Level-1.", 378 "supported": "moderate" 379 }, 380 { 381 "claim": "DoVer achieves 49% trial success rate on GSMPlus with the AG2/AutoGen2 framework, demonstrating generality.", 382 "evidence": "Table 2, GSMPlus row: 198 intervened trials, 49.0% success rate.", 383 "supported": "moderate" 384 }, 385 { 386 "claim": "Log-based failure attribution suffers from substantial ground-truth annotation uncertainty (~48% of examined cases).", 387 "evidence": "Section 3 reports 14 of 29 GAIA cases in WW exhibit GT uncertainty; annotator initial disagreement of ~20% reported by WW itself.", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "Prompt refinements (step indexing + guidance reminders) improve GPT-4o step attribution accuracy from 6% to 24% on WW-HC.", 392 "evidence": "Table 5: baseline GPT-4o 6.04% step accuracy; +Step Index 20.69%; +Guidance 23.56%.", 393 "supported": "strong" 394 }, 395 { 396 "claim": "Self-Refine and CRITIC self-improvement baselines achieve 0% failure recovery on WW-GAIA.", 397 
"evidence": "Section 5.3 explicitly states neither baseline flips any failure into success across all 26 WW-GAIA failed cases.", 398 "supported": "strong" 399 }, 400 { 401 "claim": "DoVer validates or refutes 30–60% of failure hypotheses depending on task complexity.", 402 "evidence": "Table 3: GAIA-Level-1 achieves 34.9%+23.8%=58.7% validated+refuted; WW splits achieve ~30% each.", 403 "supported": "moderate" 404 } 405 ], 406 "methodology_tags": [ 407 "benchmark-eval", 408 "case-study" 409 ], 410 "key_findings": "DoVer is an intervention-driven debugging framework for LLM multi-agent systems that operationalizes failure diagnosis by applying targeted edits to suspected failure points and re-executing traces, recovering 18–28% of GAIA/AssistantBench failures and 49% of GSMPlus failures versus 0% for self-improvement baselines. The paper also demonstrates that log-based failure attribution is fundamentally limited by annotation uncertainty (48% of examined GAIA cases have ambiguous ground-truth labels), motivating the outcome-oriented evaluation. A significant limitation is the 29–67% inconclusive rate, primarily because orchestrator-level interventions cannot address sub-agent capability gaps. The work is from Microsoft authors evaluating primarily Microsoft-developed frameworks (Magentic-One, AutoGen2), raising potential affiliation bias.", 411 "red_flags": [ 412 { 413 "flag": "Small evaluation samples, no power analysis", 414 "detail": "Core evaluation uses only 26–45 failed cases per benchmark split; no power analysis or justification for sample size is provided, limiting statistical conclusions." 415 }, 416 { 417 "flag": "Microsoft authors evaluating Microsoft frameworks", 418 "detail": "Majority of authors are Microsoft employees and the primary evaluation frameworks (Magentic-One, AutoGen2/AG2) are Microsoft products; affiliations are stated on the title page, but no competing-interests statement acknowledges this conflict." 
419 }, 420 { 421 "flag": "Main results lack variance despite 3 repeats", 422 "detail": "Tables 2–3 report only point estimates with no standard deviations or CIs despite running three independent intervention runs per trial, obscuring reliability." 423 }, 424 { 425 "flag": "Benchmark contamination not addressed", 426 "detail": "GAIA and AssistantBench are public benchmarks that were available before GPT-4o/5 training cutoffs; potential contamination is not discussed." 427 }, 428 { 429 "flag": "Code not released at submission", 430 "detail": "Abstract promises future availability ('will be available'); no public code exists to reproduce results." 431 }, 432 { 433 "flag": "No funding disclosure", 434 "detail": "No funding statement appears in the paper despite Microsoft institutional affiliation." 435 } 436 ], 437 "cited_papers": [ 438 { 439 "title": "Magentic-One: A Generalist Multi-Agent System for Solving Complex Tasks", 440 "relevance": "Primary agent framework used for evaluation; DoVer is integrated with Magentic-One's checkpointing infrastructure." 441 }, 442 { 443 "title": "GAIA: A Benchmark for General AI Assistants", 444 "relevance": "Core evaluation benchmark; GAIA Level-1/2/3 failure cases form the primary test set." 445 }, 446 { 447 "title": "Why Do Multi-Agent LLM Systems Fail? (MAST)", 448 "relevance": "Provides failure taxonomy for multi-agent systems; supplies the MathChat/GSMPlus experimental setup used in AG2 evaluation." 449 }, 450 { 451 "title": "Which Agent Causes Task Failures and When? (Who&When)", 452 "relevance": "The log-based attribution benchmark and dataset that DoVer analyzes and critiques; provides the WW failure traces and baseline method." 453 }, 454 { 455 "title": "TRAIL: Trace Reasoning and Agentic Issue Localization", 456 "relevance": "Concurrent work on turn-level failure taxonomy and long-context trace debugging; shows strong models still struggle." 
457 }, 458 { 459 "title": "Interactive Debugging and Steering of Multi-Agent AI Systems (AGDebugger)", 460 "relevance": "Human-in-the-loop debugging tool that DoVer adapts to enable automated checkpointing and replay." 461 }, 462 { 463 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 464 "relevance": "The agent execution pattern (planning–execution cycles) that creates the multi-trial structure DoVer exploits." 465 }, 466 { 467 "title": "Self-Refine: Iterative Refinement with Self-Feedback", 468 "relevance": "Self-improvement baseline that DoVer is compared against in Section 5.3, where it achieves 0% recovery on WW-GAIA." 469 }, 470 { 471 "title": "AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?", 472 "relevance": "One of the two main evaluation benchmarks; provides the WW-AB failure cases." 473 }, 474 { 475 "title": "AgentDebug / Where LLM Agents Fail and How They Can Learn from Failures", 476 "relevance": "Concurrent intervention-driven debugging work similar to DoVer; acknowledged as parallel development." 477 } 478 ], 479 "engagement_factors": { 480 "practical_relevance": { 481 "score": 3, 482 "justification": "Directly addresses multi-agent system debugging, a concrete pain point for any team deploying LLM agents in production." 483 }, 484 "surprise_contrarian": { 485 "score": 2, 486 "justification": "Challenges the prevailing log-based attribution paradigm by showing ~48% of ground-truth annotations are uncertain and that self-improvement baselines achieve 0% recovery." 487 }, 488 "fear_safety": { 489 "score": 1, 490 "justification": "Addresses reliability of agentic systems but does not raise safety or harm concerns." 491 }, 492 "drama_conflict": { 493 "score": 1, 494 "justification": "Mild methodological critique of the Who&When benchmark's annotation quality; not a high-profile controversy." 
495 }, 496 "demo_ability": { 497 "score": 2, 498 "justification": "Figure 4 shows a working web-based intervention interface for AG2 MathChat, but code is not yet publicly released." 499 }, 500 "brand_recognition": { 501 "score": 2, 502 "justification": "Microsoft affiliation and use of Magentic-One and AutoGen2 (known Microsoft products) provides moderate brand recognition." 503 } 504 }, 505 "hn_data": { 506 "threads": [ 507 { 508 "hn_id": "42378335", 509 "title": "Training LLMs to Reason in a Continuous Latent Space", 510 "points": 283, 511 "comments": 114, 512 "url": "https://news.ycombinator.com/item?id=42378335", 513 "created_at": "2024-12-10T16:26:17Z" 514 }, 515 { 516 "hn_id": "43042753", 517 "title": "LM2: Large Memory Models", 518 "points": 110, 519 "comments": 30, 520 "url": "https://news.ycombinator.com/item?id=43042753", 521 "created_at": "2025-02-13T23:21:21Z" 522 }, 523 { 524 "hn_id": "29568816", 525 "title": "Proof of Steak", 526 "points": 79, 527 "comments": 28, 528 "url": "https://news.ycombinator.com/item?id=29568816", 529 "created_at": "2021-12-15T17:16:25Z" 530 }, 531 { 532 "hn_id": "30078848", 533 "title": "Phishing in organizations: Findings from a large-scale and long-term study", 534 "points": 30, 535 "comments": 10, 536 "url": "https://news.ycombinator.com/item?id=30078848", 537 "created_at": "2022-01-25T22:11:11Z" 538 }, 539 { 540 "hn_id": "42456288", 541 "title": "Rethinking the Combination of Graph Neural Network and Large Language Model", 542 "points": 2, 543 "comments": 0, 544 "url": "https://news.ycombinator.com/item?id=42456288", 545 "created_at": "2024-12-18T22:41:39Z" 546 }, 547 { 548 "hn_id": "38762672", 549 "title": "Building Trustworthy NeuroSymbolic AI Systems", 550 "points": 2, 551 "comments": 0, 552 "url": "https://news.ycombinator.com/item?id=38762672", 553 "created_at": "2023-12-25T14:04:27Z" 554 }, 555 { 556 "hn_id": "29485809", 557 "title": "Deep learning for elliptic and parabolic boundary value problems", 558 "points": 
2, 559 "comments": 0, 560 "url": "https://news.ycombinator.com/item?id=29485809", 561 "created_at": "2021-12-08T15:22:21Z" 562 }, 563 { 564 "hn_id": "42470646", 565 "title": "SpikeFI: A Fault Injection Framework for Spiking Neural Networks", 566 "points": 1, 567 "comments": 0, 568 "url": "https://news.ycombinator.com/item?id=42470646", 569 "created_at": "2024-12-20T12:47:13Z" 570 } 571 ], 572 "top_points": 283, 573 "total_points": 509, 574 "total_comments": 182 575 } 576 }