scan-v5.json (29058B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Enhancing LLM Code Generation: A Systematic Evaluation of Multi-Agent Collaboration and Runtime Debugging for Improved Accuracy, Reliability, and Latency", 6 "authors": [ 7 "Nazmus Ashrafi", 8 "Salah Bouktif", 9 "Mohammed Mediani" 10 ], 11 "year": 2025, 12 "venue": "arXiv.org", 13 "arxiv_id": "2505.02133", 14 "doi": "10.48550/arXiv.2505.02133" 15 }, 16 "checklist": { 17 "claims_and_evidence": { 18 "abstract_claims_supported": { 19 "applies": true, 20 "answer": true, 21 "justification": "Abstract claims about 19 LLMs, two benchmarks, and the combined approach are all confirmed in the paper body with corresponding tables (Table 2) and statistical tests.", 22 "source": "haiku" 23 }, 24 "causal_claims_justified": { 25 "applies": true, 26 "answer": true, 27 "justification": "Within-subject paired t-tests compare each of 19 models under all three conditions (ACT, Debug, ACT+Debug), which is a reasonable design for causal inference in controlled benchmark evaluation.", 28 "source": "haiku" 29 }, 30 "generalization_bounded": { 31 "applies": true, 32 "answer": false, 33 "justification": "The paper evaluates only on HumanEval and HumanEval+ (Python programming tasks from 2021) but makes broad claims about 'organizations seeking robust AI-driven coding solutions' and 'real-world AI applications' without bounding to Python code completion specifically.", 34 "source": "haiku" 35 }, 36 "alternative_explanations_discussed": { 37 "applies": true, 38 "answer": true, 39 "justification": "The paper discusses why debugging outperforms agentic workflows (context-rich execution feedback), why complex agentic interactions hurt (introducing fragility), and why specific models respond differently to combination approaches.", 40 "source": "haiku" 41 }, 42 "proxy_outcome_distinction": { 43 "applies": true, 44 "answer": true, 45 "justification": "Pass@1 is clearly described as measuring functional correctness; code 
rigor is operationalized as the accuracy drop from HumanEval to HumanEval+ (80× more tests); latency is wall-clock time — each claim is matched to its measurement.", 46 "source": "haiku" 47 } 48 }, 49 "limitations_and_scope": { 50 "limitations_section_present": { 51 "applies": true, 52 "answer": false, 53 "justification": "There is no dedicated limitations or threats-to-validity section; the paper goes directly from results to conclusion. Scattered remarks in methodology (e.g., reliance on visible test cases) do not constitute a section.", 54 "source": "haiku" 55 }, 56 "threats_to_validity_specific": { 57 "applies": true, 58 "answer": false, 59 "justification": "No formal threats-to-validity discussion exists. Comments like 'same prompts for all models may not be ideal' and 'LDB does not fully replicate real-world debugging' are isolated remarks, not a systematic treatment.", 60 "source": "haiku" 61 }, 62 "scope_boundaries_stated": { 63 "applies": true, 64 "answer": false, 65 "justification": "The paper does not state what results do NOT show; conclusions are framed broadly without bounding to HumanEval-style Python tasks, specific model families, or the particular iteration limits chosen.", 66 "source": "haiku" 67 } 68 }, 69 "conflicts_of_interest": { 70 "funding_disclosed": { 71 "applies": true, 72 "answer": false, 73 "justification": "No funding source is mentioned anywhere in the paper.", 74 "source": "haiku" 75 }, 76 "affiliations_disclosed": { 77 "applies": true, 78 "answer": true, 79 "justification": "All three authors are identified as being from the Department of Computer Science and Software Engineering, United Arab Emirates University, Al Ain, UAE.", 80 "source": "haiku" 81 }, 82 "funder_independent_of_outcome": { 83 "applies": false, 84 "answer": false, 85 "justification": "No funding is disclosed, so independence of funder cannot be assessed.", 86 "source": "haiku" 87 }, 88 "financial_interests_declared": { 89 "applies": true, 90 "answer": false, 91 
"justification": "No competing interests or financial interests statement is present in the paper.", 92 "source": "haiku" 93 } 94 }, 95 "scope_and_framing": { 96 "key_terms_defined": { 97 "applies": true, 98 "answer": true, 99 "justification": "Multi-agent collaboration is defined through its components (Analyst, Coder, Tester) and workflow; runtime debugging is explained via the LDB-based block-level approach; pass@k is formally described with its formula.", 100 "source": "haiku" 101 }, 102 "intended_contribution_clear": { 103 "applies": true, 104 "answer": true, 105 "justification": "The paper clearly states it empirically evaluates the combination of multi-agent collaboration and runtime debugging across 19 LLMs on two benchmarks, contributing insights into when and how combination strategies are beneficial.", 106 "source": "haiku" 107 }, 108 "engagement_with_prior_work": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section 2 systematically reviews related frameworks (AgentCoder, MapCoder, LDB, CYCLE, self-collaboration, RGD, MGDebugger) and Section 3 explicitly positions the proposed approach as combining and extending these prior methods.", 112 "source": "haiku" 113 } 114 } 115 }, 116 "type_checklist": { 117 "empirical": { 118 "artifacts": { 119 "code_released": { 120 "applies": true, 121 "answer": true, 122 "justification": "The paper links to a GitHub repository (https://github.com/nazmus-ashrafi/multiagent_vs_debugger) explicitly for agent prompts and code.", 123 "source": "haiku" 124 }, 125 "data_released": { 126 "applies": true, 127 "answer": true, 128 "justification": "Both HumanEval and HumanEval+ are publicly available standard benchmarks requiring no separate release.", 129 "source": "haiku" 130 }, 131 "environment_specified": { 132 "applies": true, 133 "answer": false, 134 "justification": "No requirements.txt, Dockerfile, or dependency specification is provided; only the API access month (December 2024) is noted, which is 
insufficient for reproduction.", 135 "source": "haiku" 136 }, 137 "reproduction_instructions": { 138 "applies": true, 139 "answer": false, 140 "justification": "No step-by-step reproduction instructions are provided in the paper; the GitHub reference is specifically for prompts, not a complete reproduction guide.", 141 "source": "haiku" 142 } 143 }, 144 "statistical_methodology": { 145 "confidence_intervals_or_error_bars": { 146 "applies": true, 147 "answer": false, 148 "justification": "Table 2 reports only point estimates (pass@1 percentages); no confidence intervals or error bars are reported for any model-approach combination.", 149 "source": "haiku" 150 }, 151 "significance_tests": { 152 "applies": true, 153 "answer": true, 154 "justification": "One-tailed paired t-tests are conducted comparing ACT+Debug vs ACT alone and ACT+Debug vs Debug alone, with t-statistics, degrees of freedom, and significance levels explicitly reported.", 155 "source": "haiku" 156 }, 157 "effect_sizes_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "Percentage differences are reported throughout (e.g., 0.68% mean accuracy improvement for AC+Debug over Debug alone, 6.7% gap between Debug and ACT on HumanEval) providing practical effect size context.", 161 "source": "haiku" 162 }, 163 "sample_size_justified": { 164 "applies": true, 165 "answer": false, 166 "justification": "The 19 LLMs are chosen for diversity but no power analysis or formal justification for why 19 models is sufficient for the statistical tests performed is provided.", 167 "source": "haiku" 168 }, 169 "variance_reported": { 170 "applies": true, 171 "answer": false, 172 "justification": "Only mean accuracy values are reported; no standard deviation or variance is provided. 
With a single sample per problem (n=1), run-level variance cannot be estimated and inter-run reproducibility is not assessed.", 173 "source": "haiku" 174 } 175 }, 176 "evaluation_design": { 177 "baselines_included": { 178 "applies": true, 179 "answer": true, 180 "justification": "Five baselines are included: Basic (single prompt), AC, ACT, Debugger Only, and AC+Debugger, enabling comprehensive comparison against the proposed ACT+Debugger approach.", 181 "source": "haiku" 182 }, 183 "baselines_contemporary": { 184 "applies": true, 185 "answer": true, 186 "justification": "LDB (2024) and self-collaboration framework (2023) are contemporary; models include GPT-4o, Claude 3.5 Sonnet, DeepSeek-V3 — all state-of-the-art at time of experiments (December 2024).", 187 "source": "haiku" 188 }, 189 "ablation_study": { 190 "applies": true, 191 "answer": true, 192 "justification": "The six approaches (Basic→AC→ACT→Debugger→AC+Debug→ACT+Debug) form a systematic ablation isolating the contribution of analyst, tester, and debugger modules individually and in combination.", 193 "source": "haiku" 194 }, 195 "multiple_metrics": { 196 "applies": true, 197 "answer": true, 198 "justification": "Three metrics are used: functional accuracy (pass@1 on HumanEval), code rigor (accuracy drop on HumanEval+ with 80× more tests), and latency (execution time in minutes per Table 3).", 199 "source": "haiku" 200 }, 201 "human_evaluation": { 202 "applies": false, 203 "answer": false, 204 "justification": "Human evaluation is not applicable for automated code generation evaluated against unit tests; functional correctness is measured programmatically.", 205 "source": "haiku" 206 }, 207 "held_out_test_set": { 208 "applies": true, 209 "answer": true, 210 "justification": "HumanEval's hidden test cases are reserved for final evaluation while visible test cases are used for in-pipeline execution feedback, ensuring final evaluation is on held-out data.", 211 "source": "haiku" 212 }, 213 
"per_category_breakdown": { 214 "applies": true, 215 "answer": true, 216 "justification": "Results are broken down per model (all 19 LLMs in Table 2, Figures 4-5 per provider family) and per approach, with per-model analysis of which configurations help or hurt specific architectures.", 217 "source": "haiku" 218 }, 219 "failure_cases_discussed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Failure cases are explicitly discussed: QwQ-Preview's severe degradation with agentic approaches, GPT-4o underperforming with ACT+Debug on HumanEval+, and Llama/DeepSeek models gaining nothing from ACT.", 223 "source": "haiku" 224 }, 225 "negative_results_reported": { 226 "applies": true, 227 "answer": true, 228 "justification": "The paper explicitly reports that ACT+Debug does NOT significantly improve over Debug alone (H0,2 not rejected), that more complex agentic workflows reduce code rigor, and that adding ACT hurts several models.", 229 "source": "haiku" 230 } 231 }, 232 "setup_transparency": { 233 "model_versions_specified": { 234 "applies": true, 235 "answer": true, 236 "justification": "Table 1 lists specific model names, versions, and API endpoints with the note that 'All APIs were accessed in the month of December 2024.'", 237 "source": "haiku" 238 }, 239 "prompts_provided": { 240 "applies": true, 241 "answer": true, 242 "justification": "The paper explicitly states all agent prompts are available in the GitHub repository (https://github.com/nazmus-ashrafi/multiagent_vs_debugger), covering role-specific instructions for all agents in both phases.", 243 "source": "haiku" 244 }, 245 "hyperparameters_reported": { 246 "applies": true, 247 "answer": false, 248 "justification": "Iteration limits (retriesCT=3, retriesD=4 for combined, retriesD=10 for standalone) are reported, but temperature, top-p, and other LLM sampling hyperparameters are never mentioned.", 249 "source": "haiku" 250 }, 251 "scaffolding_described": { 252 "applies": true, 253 "answer": 
true, 254 "justification": "The multi-agent pipeline (ACT phases, debugging phase, CFG analysis, iteration limits, agent handoff conditions) is described in detail in Section 3 with an architecture diagram in Figure 1.", 255 "source": "haiku" 256 }, 257 "data_preprocessing_documented": { 258 "applies": true, 259 "answer": true, 260 "justification": "The split of HumanEval into task description, visible test cases, and hidden test cases is clearly described; benchmarks are used as-is with the split rationale explained.", 261 "source": "haiku" 262 } 263 }, 264 "data_integrity": { 265 "raw_data_available": { 266 "applies": true, 267 "answer": false, 268 "justification": "Aggregated pass@1 scores are in Table 2 but raw per-problem results (which specific problems each model/approach passed or failed) are not released.", 269 "source": "haiku" 270 }, 271 "data_collection_described": { 272 "applies": true, 273 "answer": true, 274 "justification": "The use of HumanEval and HumanEval+ APIs, the specific API endpoints in Table 1, and the December 2024 access period are documented.", 275 "source": "haiku" 276 }, 277 "recruitment_methods_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants; standard benchmarks are used with no recruitment.", 281 "source": "haiku" 282 }, 283 "data_pipeline_documented": { 284 "applies": true, 285 "answer": true, 286 "justification": "The pipeline from benchmark problem input through agent collaboration and debugging phases to final pass@1 evaluation is described in Section 3 and illustrated in Figure 1.", 287 "source": "haiku" 288 } 289 }, 290 "contamination": { 291 "training_cutoff_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "Training cutoffs are not stated for any of the 19 LLMs; only API access dates (December 2024) are noted, not when training data was collected.", 295 "source": "haiku" 296 }, 297 "train_test_overlap_discussed": { 298 "applies": true, 299 "answer": 
false, 300 "justification": "HumanEval (2021) predates all tested models' training data, making contamination highly likely, yet the paper never discusses potential training data overlap with the benchmark.", 301 "source": "haiku" 302 }, 303 "benchmark_contamination_addressed": { 304 "applies": true, 305 "answer": false, 306 "justification": "HumanEval has been publicly available since 2021 and is almost certainly in the training data of all 19 LLMs tested (some achieving >90% pass@1); this is never acknowledged or addressed.", 307 "source": "haiku" 308 } 309 }, 310 "human_studies": { 311 "pre_registered": { 312 "applies": false, 313 "answer": false, 314 "justification": "No human participants.", 315 "source": "haiku" 316 }, 317 "irb_or_ethics_approval": { 318 "applies": false, 319 "answer": false, 320 "justification": "No human participants.", 321 "source": "haiku" 322 }, 323 "demographics_reported": { 324 "applies": false, 325 "answer": false, 326 "justification": "No human participants.", 327 "source": "haiku" 328 }, 329 "inclusion_exclusion_criteria": { 330 "applies": false, 331 "answer": false, 332 "justification": "No human participants.", 333 "source": "haiku" 334 }, 335 "randomization_described": { 336 "applies": false, 337 "answer": false, 338 "justification": "No human participants.", 339 "source": "haiku" 340 }, 341 "blinding_described": { 342 "applies": false, 343 "answer": false, 344 "justification": "No human participants.", 345 "source": "haiku" 346 }, 347 "attrition_reported": { 348 "applies": false, 349 "answer": false, 350 "justification": "No human participants.", 351 "source": "haiku" 352 } 353 }, 354 "cost_and_practicality": { 355 "inference_cost_reported": { 356 "applies": true, 357 "answer": true, 358 "justification": "Latency per approach is reported in Table 3 and Figure 13 (ranging from 7.68 to 68.42 minutes average); Figure 4 caption qualitatively ranks models by token cost.", 359 "source": "haiku" 360 }, 361 "compute_budget_stated": { 
362 "applies": true, 363 "answer": false, 364 "justification": "Total computational budget (API costs, total tokens consumed across 19 models × 2 datasets × 6 approaches × 164 problems) is not stated.", 365 "source": "haiku" 366 } 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "ACT+Debug significantly outperforms ACT alone at α=0.15 significance level", 373 "evidence": "Paired t-test on 19 LLMs: mean accuracy 64.82% (ACT+Debug) vs 57.16% (ACT) on HumanEval; H0,1 rejected at α=0.15", 374 "supported": "moderate" 375 }, 376 { 377 "claim": "ACT+Debug does NOT significantly outperform Debug alone", 378 "evidence": "Only 0.96% mean accuracy difference (64.82% vs 63.86%); H0,2 not rejected at α=0.15; explicitly stated as non-significant", 379 "supported": "strong" 380 }, 381 { 382 "claim": "AC+Debugger achieves the optimal balance of accuracy, rigor, and latency", 383 "evidence": "AC+Debug yields 0.68% mean accuracy improvement over Debug alone at 38.42 min vs 31.11 min, while ACT+Debug takes 68.42 min with lower HumanEval+ accuracy (-1.22% vs AC+Debug)", 384 "supported": "moderate" 385 }, 386 { 387 "claim": "Debugging-based approaches generally outperform agentic workflows", 388 "evidence": "Debug alone achieves 61.02% mean accuracy across both datasets vs 54.04% for ACT; 6.7% gap on HumanEval and 7.36% on HumanEval+", 389 "supported": "strong" 390 }, 391 { 392 "claim": "Increased agentic complexity reduces code rigor under stringent testing", 393 "evidence": "ACT+Debug shows the largest accuracy drop sum (137.74 across all models) on HumanEval+ vs Basic approach (90.83); AC+Debug drop is 110.41", 394 "supported": "moderate" 395 }, 396 { 397 "claim": "The benefit of combining approaches diminishes when the Debug-ACT performance gap is large", 398 "evidence": "Figures 6-7 show inverse correlation between the Debug-ACT gap and improvement from combining approaches across 38 data points (19 models × 2 datasets)", 399 "supported": "moderate" 400 }, 401 { 402 "claim": 
"OpenAI models consistently benefit from combinatorial approaches while open-source models (Llama, DeepSeek) generally do not", 403 "evidence": "Figure 4: GPT-4o-mini improves from 80.45% to 92.07% with ACT+Debug; Table 2 shows Llama 3.3 70B, DeepSeek-V3, and others gain nothing or regress from adding ACT to debugging", 404 "supported": "moderate" 405 } 406 ], 407 "methodology_tags": [ 408 "benchmark-eval" 409 ], 410 "key_findings": "Across 19 LLMs on HumanEval and HumanEval+, runtime debugging alone outperforms multi-agent agentic workflows by ~7%, while combining a simple Analyst-Coder pipeline with debugging (AC+Debug) yields a modest 0.68% additional accuracy gain with comparable latency; the paper's significance test covers the full ACT+Debug combination, whose 0.96% gain over debugging alone is not statistically significant even at the non-standard α=0.15 threshold. The benefit of combining approaches inversely correlates with the performance gap between the individual techniques: combination helps most when both strategies perform similarly for a given model. Counter-intuitively, more complex agentic configurations (three-agent ACT) reduce code rigor under stringent testing (HumanEval+) and increase latency without improving accuracy, suggesting simpler agentic workflows paired with debugging represent the practical optimum.", 411 "red_flags": [ 412 { 413 "flag": "Non-standard α=0.15 significance threshold", 414 "detail": "The paper uses α=0.15 for all statistical tests, substantially more permissive than conventional α=0.05. The justification ('even marginal improvements matter in production') is post-hoc and not pre-registered. The main positive finding (ACT+Debug > ACT alone) may not hold at standard thresholds." 415 }, 416 { 417 "flag": "HumanEval contamination unaddressed", 418 "detail": "HumanEval (2021) is widely present in LLM training corpora; some tested models achieve >90% pass@1. 
The paper never discusses contamination despite evaluating models trained years after the benchmark was published — results may reflect memorization rather than reasoning." 419 }, 420 { 421 "flag": "No confidence intervals on main results", 422 "detail": "Table 2 reports only point estimates for pass@1 scores across 19 models × 6 approaches × 2 datasets. No CIs or error bars are provided, making it impossible to assess uncertainty in individual model comparisons." 423 }, 424 { 425 "flag": "Single sample per problem leaves run-level variance unmeasured", 426 "detail": "Using n=1 sample per problem means results cannot be verified for reproducibility across runs with different random seeds; LLM outputs are stochastic and single-sample estimates are unreliable for fine-grained comparisons like 0.68% differences." 427 }, 428 { 429 "flag": "No dedicated limitations section", 430 "detail": "The paper lacks any formal limitations or threats-to-validity section. Generalization to non-HumanEval benchmarks, other programming languages, or real-world coding tasks is never addressed." 431 }, 432 { 433 "flag": "Marginal 0.68% improvement framed as optimal", 434 "detail": "The paper's central practical recommendation (AC+Debug as 'optimal') rests on a 0.68% mean accuracy improvement that is itself not statistically significant, with no discussion of minimum practically meaningful differences." 
435 } 436 ], 437 "cited_papers": [ 438 { 439 "title": "AgentCoder: Multi-Agent-based Code Generation with Iterative Testing and Optimisation", 440 "relevance": "Core multi-agent code generation framework this paper builds upon and compares against" 441 }, 442 { 443 "title": "Debug like a Human: A Large Language Model Debugger via Verifying Runtime Execution Step-by-step (LDB)", 444 "relevance": "The debugging component adopted in this paper; authors implement a variant of LDB as the debugging phase" 445 }, 446 { 447 "title": "Self-collaboration Code Generation via ChatGPT", 448 "relevance": "The Analyst-Coder-Tester framework the paper's multi-agent collaboration phase is directly based on" 449 }, 450 { 451 "title": "MapCoder: Multi-Agent Code Generation for Competitive Problem Solving", 452 "relevance": "Related multi-agent code generation approach reviewed in literature" 453 }, 454 { 455 "title": "Evaluating Large Language Models Trained on Code (HumanEval)", 456 "relevance": "Primary evaluation benchmark; defines the pass@k metric used throughout the paper" 457 }, 458 { 459 "title": "Is Your Code Generated by ChatGPT Really Correct? 
Rigorous Evaluation of Large Language Models for Code Generation (HumanEval+)", 460 "relevance": "Secondary benchmark with 80× more tests used to measure code rigor throughout the study" 461 }, 462 { 463 "title": "RGD: Multi-LLM Based Agent Debugger via Refinement and Generation Guidance", 464 "relevance": "Related multi-agent debugging framework combining guide, debug, and feedback agents" 465 }, 466 { 467 "title": "Reflexion: Language Agents with Verbal Reinforcement Learning", 468 "relevance": "State-of-the-art approach combined with LDB achieving 98.2% on HumanEval, motivating the study of LDB integration" 469 }, 470 { 471 "title": "Teaching Large Language Models to Self-Debug", 472 "relevance": "Foundational self-debugging framework using execution feedback for iterative code improvement" 473 }, 474 { 475 "title": "From Code to Correctness: Closing the Last Mile of Code Generation with Hierarchical Debugging (MGDebugger)", 476 "relevance": "Related hierarchical debugging approach for code generation reviewed in literature" 477 } 478 ], 479 "engagement_factors": { 480 "practical_relevance": { 481 "score": 3, 482 "justification": "Provides direct, actionable guidance for organizations choosing between multi-agent and debugging strategies across 19 diverse LLMs with latency and accuracy trade-offs explicitly quantified." 483 }, 484 "surprise_contrarian": { 485 "score": 2, 486 "justification": "Counterintuitive finding that simpler agentic workflows outperform complex ones and that adding a tester agent can reduce code rigor — challenges the 'more agents = better' assumption prevalent in agentic AI research." 487 }, 488 "fear_safety": { 489 "score": 0, 490 "justification": "No AI safety or risk concerns are raised; the paper is purely about code generation accuracy and efficiency." 
491 }, 492 "drama_conflict": { 493 "score": 1, 494 "justification": "Mild tension between prevailing enthusiasm for complex multi-agent systems and the finding that they often underperform simpler debugging approaches, without framing this as a controversy." 495 }, 496 "demo_ability": { 497 "score": 2, 498 "justification": "GitHub repository linked with prompts; readers could implement AC+Debugger with API access to any of the 19 models tested using the described pipeline." 499 }, 500 "brand_recognition": { 501 "score": 1, 502 "justification": "Tests well-known models (GPT-4o, Claude 3.5 Sonnet, DeepSeek-V3, Llama) but authors are from UAE University, not a recognized AI research lab." 503 } 504 }, 505 "hn_data": { 506 "threads": [ 507 { 508 "hn_id": "43390400", 509 "title": "Deep Learning Is Not So Mysterious or Different", 510 "points": 485, 511 "comments": 126, 512 "url": "https://news.ycombinator.com/item?id=43390400", 513 "created_at": "2025-03-17T16:47:02Z" 514 }, 515 { 516 "hn_id": "45291024", 517 "title": "Launch HN: Cactus (YC S25) – AI inference on smartphones", 518 "points": 123, 519 "comments": 63, 520 "url": "https://news.ycombinator.com/item?id=45291024", 521 "created_at": "2025-09-18T15:40:29Z" 522 }, 523 { 524 "hn_id": "44430311", 525 "title": "Small language models are the future of agentic AI", 526 "points": 113, 527 "comments": 45, 528 "url": "https://news.ycombinator.com/item?id=44430311", 529 "created_at": "2025-07-01T03:33:49Z" 530 }, 531 { 532 "hn_id": "44659764", 533 "title": "Mitigating Tool Squatting and Rug Pull Attacks in Model Context Protocol (MCP)", 534 "points": 5, 535 "comments": 0, 536 "url": "https://news.ycombinator.com/item?id=44659764", 537 "created_at": "2025-07-23T14:42:26Z" 538 }, 539 { 540 "hn_id": "44246361", 541 "title": "Small Language Models Are the Future of Agentic AI", 542 "points": 5, 543 "comments": 0, 544 "url": "https://news.ycombinator.com/item?id=44246361", 545 "created_at": "2025-06-11T11:16:33Z" 546 }, 547 { 
548 "hn_id": "44003454", 549 "title": "Twist: Teleoperated Whole-Body Imitation System", 550 "points": 2, 551 "comments": 0, 552 "url": "https://news.ycombinator.com/item?id=44003454", 553 "created_at": "2025-05-16T09:44:32Z" 554 }, 555 { 556 "hn_id": "23087191", 557 "title": "A Survey on Dialog Management: Recent Advances and Challenges", 558 "points": 2, 559 "comments": 0, 560 "url": "https://news.ycombinator.com/item?id=23087191", 561 "created_at": "2020-05-06T01:52:26Z" 562 }, 563 { 564 "hn_id": "45549900", 565 "title": "Agentic web browsing can't scale with cloud LLMs", 566 "points": 1, 567 "comments": 0, 568 "url": "https://news.ycombinator.com/item?id=45549900", 569 "created_at": "2025-10-11T15:29:17Z" 570 }, 571 { 572 "hn_id": "43291939", 573 "title": "Deep Learning Is Not So Mysterious or Different", 574 "points": 1, 575 "comments": 0, 576 "url": "https://news.ycombinator.com/item?id=43291939", 577 "created_at": "2025-03-07T17:11:27Z" 578 } 579 ], 580 "top_points": 485, 581 "total_points": 737, 582 "total_comments": 234 583 } 584 }