scan-v5.json (28334B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Towards Engineering Multi-Agent LLMs: A Protocol-Driven Approach", 6 "authors": [ 7 "Zhenyu Mao", 8 "Jacky Keung", 9 "Fengji Zhang", 10 "Shuo Liu", 11 "Yifei Wang", 12 "Jialong Li" 13 ], 14 "year": 2025, 15 "venue": "Asia-Pacific Software Engineering Conference", 16 "arxiv_id": "2510.12120", 17 "doi": "10.1109/APSEC66846.2025.00100" 18 }, 19 "checklist": { 20 "claims_and_evidence": { 21 "abstract_claims_supported": { 22 "applies": true, 23 "answer": true, 24 "justification": "All numerical claims in the abstract (69.6%, 56.7%, 47.4%, 28.2% failure reductions) are directly supported by Tables I and II. Specific task-model combinations match reported improvements exactly.", 25 "source": "haiku" 26 }, 27 "causal_claims_justified": { 28 "applies": true, 29 "answer": false, 30 "justification": "Paper claims SEMAP 'reduces failures' but lacks ablation studies to isolate which of three principles (contracts, messaging, lifecycle) causes improvement. Framework confound exists: SEMAP on A2A vs baseline on MetaGPT—cannot distinguish methodology benefit from infrastructure benefit.", 31 "source": "haiku" 32 }, 33 "generalization_bounded": { 34 "applies": true, 35 "answer": true, 36 "justification": "Claims are appropriately bounded to software engineering tasks. 
Abstract and conclusion acknowledge that future work includes scaling to larger datasets and comparing against more baselines, showing awareness of current scope limitations.", 37 "source": "haiku" 38 }, 39 "alternative_explanations_discussed": { 40 "applies": true, 41 "answer": false, 42 "justification": "Paper attributes improvements to SEMAP but does not discuss alternative explanations: A2A framework advantages over MetaGPT, prompt engineering differences, or which of the three principles actually drives improvements.", 43 "source": "haiku" 44 }, 45 "proxy_outcome_distinction": { 46 "applies": true, 47 "answer": false, 48 "justification": "Paper measures failure counts in three categories but claims this demonstrates 'system robustness' and 'effectiveness.' Does not discuss whether failure counts are the right proxy, whether some failures are more critical than others, or what failure reduction means for real task success.", 49 "source": "haiku" 50 } 51 }, 52 "limitations_and_scope": { 53 "limitations_section_present": { 54 "applies": true, 55 "answer": false, 56 "justification": "No dedicated Limitations or Threats-to-Validity section. Only a single sentence in conclusion mentions future work: 'To strengthen validity, future experiments will be scaled to larger datasets...'", 57 "source": "haiku" 58 }, 59 "threats_to_validity_specific": { 60 "applies": true, 61 "answer": false, 62 "justification": "No specific validity threats discussed. 
Sample sizes (100 for vulnerability tasks, unspecified for development), single baseline comparison (MetaGPT only), framework confound, LLM-as-Judge evaluation noise, and lack of ablation are unaddressed.", 63 "source": "haiku" 64 }, 65 "scope_boundaries_stated": { 66 "applies": true, 67 "answer": false, 68 "justification": "Paper does not explicitly state what results do NOT show: applicability beyond SE, generalization to other LLM families, applicability with non-A2A frameworks, or minimum task complexity thresholds.", 69 "source": "haiku" 70 } 71 }, 72 "conflicts_of_interest": { 73 "funding_disclosed": { 74 "applies": true, 75 "answer": false, 76 "justification": "No funding source mentioned. No acknowledgments section visible.", 77 "source": "haiku" 78 }, 79 "affiliations_disclosed": { 80 "applies": true, 81 "answer": false, 82 "justification": "Only institutional affiliations listed (City University of Hong Kong, Waseda University). No disclosure of author relationships with Google (A2A), DeepSeek, or OpenAI—all directly evaluated in the paper.", 83 "source": "haiku" 84 }, 85 "funder_independent_of_outcome": { 86 "applies": false, 87 "answer": false, 88 "justification": "No funder identified, so independence cannot be assessed.", 89 "source": "haiku" 90 }, 91 "financial_interests_declared": { 92 "applies": true, 93 "answer": false, 94 "justification": "No competing interests statement. 
No disclosure of patents, equity, or consulting relationships with evaluated frameworks or companies.", 95 "source": "haiku" 96 } 97 }, 98 "scope_and_framing": { 99 "key_terms_defined": { 100 "applies": true, 101 "answer": false, 102 "justification": "Some terms formally defined (behavioral contract, structured messaging, lifecycle as FSM) but key concepts used without clear definition: 'verification' (what constitutes correct verification?), 'robustness,' and how Design by Contract applies to non-deterministic LLM agents.", 103 "source": "haiku" 104 }, 105 "intended_contribution_clear": { 106 "applies": true, 107 "answer": true, 108 "justification": "Contribution explicitly stated: SEMAP is a protocol-layer methodology implementing three SE principles, implemented on A2A infrastructure, evaluated on SE tasks. Clear that paper claims both methodology and empirical evaluation.", 109 "source": "haiku" 110 }, 111 "engagement_with_prior_work": { 112 "applies": true, 113 "answer": false, 114 "justification": "Section II provides background on multi-agent LLMs and protocols, listing MetaGPT, ChatDev, AutoGen. However, paper does not engage deeply with how SEMAP differs from or improves upon existing frameworks' coordination approaches.", 115 "source": "haiku" 116 } 117 } 118 }, 119 "type_checklist": { 120 "empirical": { 121 "artifacts": { 122 "code_released": { 123 "applies": true, 124 "answer": false, 125 "justification": "No code released. Conclusion states 'Future work also includes... releasing artifacts for reproducibility,' indicating non-availability at publication.", 126 "source": "haiku" 127 }, 128 "data_released": { 129 "applies": true, 130 "answer": true, 131 "justification": "Uses public benchmarks unmodified: HumanEval (OpenAI), ProgramDev (reference [19]), devign100 (Devign subset), vudenc100 (CVEFixes). 
All standard public datasets.", 132 "source": "haiku" 133 }, 134 "environment_specified": { 135 "applies": true, 136 "answer": false, 137 "justification": "Model versions with dates provided (DeepSeek-V3-0324, gpt-4.1-nano-2025-04-14), but no requirements.txt, Dockerfile, Python version, dependencies, or hardware specifications.", 138 "source": "haiku" 139 }, 140 "reproduction_instructions": { 141 "applies": true, 142 "answer": false, 143 "justification": "No step-by-step reproduction instructions. Missing: A2A infrastructure setup, SEMAP principle implementation, agent prompting, LLM-as-Judge evaluation setup, and dataset loading procedures.", 144 "source": "haiku" 145 } 146 }, 147 "statistical_methodology": { 148 "confidence_intervals_or_error_bars": { 149 "applies": true, 150 "answer": false, 151 "justification": "Tables I-II and Figure 2 show failure counts and trends with no confidence intervals, standard errors, or error bands. No mention of multiple runs with different random seeds.", 152 "source": "haiku" 153 }, 154 "significance_tests": { 155 "applies": true, 156 "answer": false, 157 "justification": "No p-values, t-tests, or other significance tests reported. Sample sizes small (n=100 for vulnerability detection). No discussion of whether improvements are statistically significant.", 158 "source": "haiku" 159 }, 160 "effect_sizes_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "Percentage reductions reported: 69.6%, 56.7%, 47.4%, 28.2%, etc. These are effect sizes, though context about typical baseline failure rates is missing.", 164 "source": "haiku" 165 }, 166 "sample_size_justified": { 167 "applies": true, 168 "answer": false, 169 "justification": "Vulnerability detection sample sizes (100) are mentioned but not justified as sufficient. Development task sample sizes (HumanEval, ProgramDev) not specified. 
No power analysis.", 170 "source": "haiku" 171 }, 172 "variance_reported": { 173 "applies": true, 174 "answer": false, 175 "justification": "No standard deviations, variances, or confidence bands reported. Figure 2 shows trends without uncertainty quantification. No mention of runs with multiple random seeds or data points.", 176 "source": "haiku" 177 } 178 }, 179 "evaluation_design": { 180 "baselines_included": { 181 "applies": true, 182 "answer": true, 183 "justification": "MetaGPT baseline included in all comparisons. Tables I-II and Figure 2 show side-by-side SEMAP vs baseline failure counts.", 184 "source": "haiku" 185 }, 186 "baselines_contemporary": { 187 "applies": true, 188 "answer": false, 189 "justification": "MetaGPT is recent, but only one baseline compared. Future work mentions 'single-agent LLMs and domain-specific detectors' as missing comparisons.", 190 "source": "haiku" 191 }, 192 "ablation_study": { 193 "applies": true, 194 "answer": false, 195 "justification": "Conclusion explicitly states 'Ablation studies will isolate the impact of contracts, messaging, and lifecycle control'—indicating no ablation study in current work.", 196 "source": "haiku" 197 }, 198 "multiple_metrics": { 199 "applies": true, 200 "answer": false, 201 "justification": "Only failure-based metrics reported: counts by category, by task, by round. 
Missing: task success rate, solution correctness, code quality, latency, resource usage, task completion time.", 202 "source": "haiku" 203 }, 204 "human_evaluation": { 205 "applies": true, 206 "answer": false, 207 "justification": "Uses LLM-as-a-Judge (gpt-4o-2024-08-06 to categorize failures) but no human evaluation of actual agent outputs or correctness.", 208 "source": "haiku" 209 }, 210 "held_out_test_set": { 211 "applies": true, 212 "answer": true, 213 "justification": "Standard public benchmarks used: HumanEval, ProgramDev, devign100, vudenc100 all have standard test splits.", 214 "source": "haiku" 215 }, 216 "per_category_breakdown": { 217 "applies": true, 218 "answer": true, 219 "justification": "Results broken down by: failure category (under-specification, misalignment, verification), task type (4 variants), model (2 models), and collaboration round (Figure 2).", 220 "source": "haiku" 221 }, 222 "failure_cases_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No specific failure cases, examples, or error traces shown. Only aggregate failure counts reported. No qualitative analysis of what failures look like.", 226 "source": "haiku" 227 }, 228 "negative_results_reported": { 229 "applies": true, 230 "answer": false, 231 "justification": "Results show all cases are positive (improvements), but effect sizes vary widely (8.3% to 69.6%). Smaller improvements (e.g., 8.3% on DeepSeek devign100) are reported but not discussed or analyzed.", 232 "source": "haiku" 233 } 234 }, 235 "setup_transparency": { 236 "model_versions_specified": { 237 "applies": true, 238 "answer": true, 239 "justification": "Model versions with snapshot dates specified: DeepSeek-V3-0324, gpt-4.1-nano-2025-04-14, gpt-4o-2024-08-06. Dates enable reproducibility.", 240 "source": "haiku" 241 }, 242 "prompts_provided": { 243 "applies": true, 244 "answer": false, 245 "justification": "No actual prompts, system instructions, or templates provided. 
High-level principles described (contracts, messaging, lifecycle) but not operationalized as concrete prompts.", 246 "source": "haiku" 247 }, 248 "hyperparameters_reported": { 249 "applies": true, 250 "answer": false, 251 "justification": "Only 'up to five collaboration rounds' and 'single round' mentioned. Temperature, top-p, top-k, max_tokens, and other LLM hyperparameters not reported.", 252 "source": "haiku" 253 }, 254 "scaffolding_described": { 255 "applies": true, 256 "answer": false, 257 "justification": "High-level FSM and contract descriptions provided, but implementation details missing: How are contracts implemented in prompts? How does A2A enforce message schemas? What does 'verification' look like in practice?", 258 "source": "haiku" 259 }, 260 "data_preprocessing_documented": { 261 "applies": true, 262 "answer": false, 263 "justification": "Benchmarks used mostly as-is. Vulnerability detection datasets randomly sampled (no seed specified). No documentation of cleaning, filtering, or other preprocessing steps.", 264 "source": "haiku" 265 } 266 }, 267 "data_integrity": { 268 "raw_data_available": { 269 "applies": true, 270 "answer": false, 271 "justification": "Standard benchmarks are public, but SEMAP evaluation outputs (failure categorizations, agent outputs) are not released. Cannot independently verify failure categorizations.", 272 "source": "haiku" 273 }, 274 "data_collection_described": { 275 "applies": true, 276 "answer": false, 277 "justification": "Relies on prior benchmark definitions. 
Vulnerability sampling described as 'randomly selecting' and 'randomly sampling' but no seed, procedure, or stratification details provided.", 278 "source": "haiku" 279 }, 280 "recruitment_methods_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants; benchmarks used instead.", 284 "source": "haiku" 285 }, 286 "data_pipeline_documented": { 287 "applies": true, 288 "answer": false, 289 "justification": "Data pipeline not documented. Only states 'LLM-as-a-Judge pipeline proposed in [19]' without describing how failures are categorized, edge cases handled, or data flows through evaluation.", 290 "source": "haiku" 291 } 292 }, 293 "contamination": { 294 "training_cutoff_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "Training data cutoff dates not explicitly stated. Model names suggest dates (DeepSeek-V3-0324 → March 24, 2025; gpt-4.1-nano-2025-04-14 → April 14, 2025) but not confirmed in paper.", 298 "source": "haiku" 299 }, 300 "train_test_overlap_discussed": { 301 "applies": true, 302 "answer": false, 303 "justification": "No discussion of train/test overlap. Using 2025 evaluation models with 2024-2025 benchmarks creates risk of data contamination. 
ProgramDev (2025) and CVEFixes data (2024) potentially in training sets.", 304 "source": "haiku" 305 }, 306 "benchmark_contamination_addressed": { 307 "applies": true, 308 "answer": false, 309 "justification": "No mention of whether HumanEval, Devign, or CVEFixes examples appeared in training corpora of DeepSeek-V3 or GPT-4.1-nano.", 310 "source": "haiku" 311 } 312 }, 313 "human_studies": { 314 "pre_registered": { 315 "applies": false, 316 "answer": false, 317 "justification": "No human participants.", 318 "source": "haiku" 319 }, 320 "irb_or_ethics_approval": { 321 "applies": false, 322 "answer": false, 323 "justification": "No human participants.", 324 "source": "haiku" 325 }, 326 "demographics_reported": { 327 "applies": false, 328 "answer": false, 329 "justification": "No human participants.", 330 "source": "haiku" 331 }, 332 "inclusion_exclusion_criteria": { 333 "applies": false, 334 "answer": false, 335 "justification": "No human participants.", 336 "source": "haiku" 337 }, 338 "randomization_described": { 339 "applies": false, 340 "answer": false, 341 "justification": "No human participants.", 342 "source": "haiku" 343 }, 344 "blinding_described": { 345 "applies": false, 346 "answer": false, 347 "justification": "No human participants.", 348 "source": "haiku" 349 }, 350 "attrition_reported": { 351 "applies": false, 352 "answer": false, 353 "justification": "No human participants.", 354 "source": "haiku" 355 } 356 }, 357 "cost_and_practicality": { 358 "inference_cost_reported": { 359 "applies": true, 360 "answer": false, 361 "justification": "No API costs, latency, throughput, or wall-clock time reported. No analysis of cost trade-offs between SEMAP overhead and failure reduction benefits.", 362 "source": "haiku" 363 }, 364 "compute_budget_stated": { 365 "applies": true, 366 "answer": false, 367 "justification": "No total computational budget or resource requirements stated. 
No mention of number of API calls, GPU hours, or total cost of evaluation.", 368 "source": "haiku" 369 } 370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "SEMAP reduces total failures by 69.6% on function-level code development with DeepSeek", 376 "evidence": "Table I: DeepSeek on HumanEval shows 112 baseline failures → 34 SEMAP failures", 377 "supported": "strong" 378 }, 379 { 380 "claim": "SEMAP is most effective at reducing under-specification failures", 381 "evidence": "Table I shows 71.5%-73.0% reductions in under-specification on HumanEval; 53.8% on ProgramDev. Largest gains in this category across all tasks.", 382 "supported": "moderate" 383 }, 384 { 385 "claim": "SEMAP shows consistent improvement across different SE tasks (development and vulnerability detection)", 386 "evidence": "All results in Tables I-II are positive, but ranging from 8.3% to 69.6%. Improvements are consistent in direction but highly variable in magnitude.", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "Three SE principles (behavioral contracts, structured messaging, lifecycle verification) address three failure modes", 391 "evidence": "Methodology claims to address under-specification, misalignment, and verification failure respectively. But no ablation study isolates each principle's contribution.", 392 "supported": "weak" 393 }, 394 { 395 "claim": "SEMAP promotes more stable failure reduction across collaboration rounds than baseline", 396 "evidence": "Figure 2 shows SEMAP trends decline more sharply and steadily. But no error bars; visual interpretation only.", 397 "supported": "weak" 398 }, 399 { 400 "claim": "SEMAP reduces vulnerability detection failures by up to 47.4% on Python tasks", 401 "evidence": "Table II: vudenc100 with GPT-4.1-nano shows 38 baseline → 20 SEMAP (47.4% reduction)", 402 "supported": "strong" 403 }, 404 { 405 "claim": "SEMAP can support both centralized and decentralized workflows", 406 "evidence": "Methodology supports FSM-based coordination. 
Development uses centralized CEO style; vulnerability detection uses decentralized voting. Results shown for both, but not separately analyzed.", 407 "supported": "weak" 408 } 409 ], 410 "methodology_tags": [ 411 "benchmark-eval", 412 "observational" 413 ], 414 "key_findings": "SEMAP, a protocol-layer methodology applying three SE principles (behavioral contracts, structured messaging, lifecycle-guided execution), reduces multi-agent LLM failures across SE tasks. Function-level code development shows dramatic improvements (69.6% failure reduction with DeepSeek on HumanEval), with largest gains on under-specification errors. Vulnerability detection improvements are smaller (8.3–47.4%), and improvements vary significantly across model-task combinations. Results lack statistical testing, ablation studies, and implementation details necessary for reproducibility or understanding which principles drive improvements.", 415 "red_flags": [ 416 { 417 "flag": "No ablation study", 418 "detail": "Cannot isolate contribution of behavioral contracts vs structured messaging vs lifecycle verification. Improvements attributed to all three collectively, but each could be individually ineffective." 419 }, 420 { 421 "flag": "Framework confound", 422 "detail": "SEMAP runs on A2A infrastructure; baseline runs on MetaGPT. Cannot distinguish whether improvements come from SEMAP methodology or A2A framework advantages. Different codebase, APIs, capabilities." 423 }, 424 { 425 "flag": "Single baseline", 426 "detail": "Only MetaGPT baseline compared. No comparison to AutoGen, ChatDev, or single-agent LLM approaches. Missing other contemporary multi-agent frameworks." 427 }, 428 { 429 "flag": "No statistical testing", 430 "detail": "All results lack confidence intervals, significance tests, or multiple runs. Improvements could be within noise; no p-values reported." 431 }, 432 { 433 "flag": "Small vulnerability detection samples", 434 "detail": "n=100 each for devign100 and vudenc100. 
Small sample sizes reduce generalizability and statistical power." 435 }, 436 { 437 "flag": "LLM-as-Judge evaluation bias", 438 "detail": "Using gpt-4o to categorize failures of other LLM systems adds evaluation noise and potential bias. No human validation of failure categorizations." 439 }, 440 { 441 "flag": "Highly variable effect sizes", 442 "detail": "Improvements range from 8.3% to 69.6% with no explanation. Why does DeepSeek show 69.6% on HumanEval but only 8.3% on devign100?" 443 }, 444 { 445 "flag": "No implementation details", 446 "detail": "No code, prompts, hyperparameters (temperature, top-p), or detailed scaffolding. Abstract principles (contracts, FSM) not operationalized as concrete prompts or A2A configurations." 447 }, 448 { 449 "flag": "No limitations section", 450 "detail": "Paper lacks dedicated Limitations or Threats-to-Validity section. Validity concerns not explicitly discussed." 451 }, 452 { 453 "flag": "No failure case analysis", 454 "detail": "Only aggregate failure counts reported. No examples of what failures look like, which agents fail most, or qualitative analysis of failure modes." 455 }, 456 { 457 "flag": "Train/test contamination not addressed", 458 "detail": "2025 evaluation models with 2024-2025 benchmarks. ProgramDev and CVEFixes likely in training data; no discussion of potential overlap." 459 }, 460 { 461 "flag": "Reproducibility blocking", 462 "detail": "Code not released (future work). Prompts not provided. Hyperparameters not fully specified. Evaluation outputs not released. Cannot reproduce or verify results independently." 
463 } 464 ], 465 "cited_papers": [ 466 { 467 "title": "Why do multi-agent llm systems fail?", 468 "authors": "Cemri et al.", 469 "year": 2025, 470 "relevance": "Introduces MAST failure taxonomy used to structure SEMAP evaluation; directly motivates the three failure categories addressed" 471 }, 472 { 473 "title": "LLM-based multi-agent systems for software engineering: Literature review, vision and the road ahead", 474 "authors": "He et al.", 475 "year": 2024, 476 "relevance": "Comprehensive survey of multi-agent LLMs for SE; frames problem space and related frameworks" 477 }, 478 { 479 "title": "A survey of ai agent protocols", 480 "authors": "Yang et al.", 481 "year": 2025, 482 "relevance": "Taxonomy of agent communication protocols; positions SEMAP as first domain-specific SE protocol" 483 }, 484 { 485 "title": "Evaluating large language models trained on code", 486 "authors": "Chen et al.", 487 "year": 2021, 488 "relevance": "HumanEval benchmark used for function-level development evaluation" 489 }, 490 { 491 "title": "Devign: Effective vulnerability identification by learning comprehensive program semantics via graph neural networks", 492 "authors": "Zhou et al.", 493 "year": 2019, 494 "relevance": "Source of Devign dataset used for C/C++ vulnerability detection evaluation" 495 }, 496 { 497 "title": "MARE: Multi-agents collaboration framework for requirements engineering", 498 "authors": "Jin et al.", 499 "year": 2024, 500 "relevance": "Example of prior multi-agent LLM application to SE; shows need for better coordination frameworks" 501 }, 502 { 503 "title": "A pair programming framework for code generation via multi-plan exploration and feedback-driven refinement", 504 "authors": "Zhang et al.", 505 "year": 2024, 506 "relevance": "Multi-agent code generation approach; demonstrates current state of practice before SEMAP" 507 } 508 ], 509 "engagement_factors": { 510 "practical_relevance": { 511 "score": 2, 512 "justification": "Protocol methodology could 
benefit practitioners building multi-agent systems, but no released code, prompts, or implementation guidance limits immediate applicability. Abstract principles without concrete tools." 513 }, 514 "surprise_contrarian": { 515 "score": 1, 516 "justification": "Applying classical SE design principles (Design by Contract, state machines) to LLM agents is expected, not contrarian or surprising. Standard software engineering applied to new domain." 517 }, 518 "fear_safety": { 519 "score": 1, 520 "justification": "Paper does not address AI safety, alignment, or risk concerns. Focuses purely on task failure reduction in code development and vulnerability detection contexts." 521 }, 522 "drama_conflict": { 523 "score": 0, 524 "justification": "Straightforward engineering paper with no controversial claims, methodological drama, or conflict. No dramatic findings or surprising reversals." 525 }, 526 "demo_ability": { 527 "score": 0, 528 "justification": "No code released, no working demo, no reproducible artifacts. Explicitly defers to future work. Practitioners cannot try SEMAP immediately." 529 }, 530 "brand_recognition": { 531 "score": 1, 532 "justification": "Authors from City University of Hong Kong and Waseda University (not top-tier brands). Uses Google A2A and popular LLMs but doesn't leverage brand recognition; results presented as engineering contribution, not vendor advantage." 
533 } 534 }, 535 "hn_data": { 536 "threads": [ 537 { 538 "hn_id": "39285499", 539 "title": "Show HN: DynamiCrafter: Animating Open-Domain Images with Video Diffusion Priors", 540 "points": 6, 541 "comments": 2, 542 "url": "https://news.ycombinator.com/item?id=39285499", 543 "created_at": "2024-02-07T07:12:57Z" 544 }, 545 { 546 "hn_id": "42793447", 547 "title": "Can LLMs demonstrate behavioral self-awareness?", 548 "points": 3, 549 "comments": 1, 550 "url": "https://news.ycombinator.com/item?id=42793447", 551 "created_at": "2025-01-22T14:54:07Z" 552 }, 553 { 554 "hn_id": "42815497", 555 "title": "Tell me about yourself: LLMs are aware of their learned behaviors", 556 "points": 2, 557 "comments": 0, 558 "url": "https://news.ycombinator.com/item?id=42815497", 559 "created_at": "2025-01-24T17:44:03Z" 560 }, 561 { 562 "hn_id": "38011661", 563 "title": "Monarch Mixer: A Simple Sub-Quadratic GEMM-Based Architecture", 564 "points": 1, 565 "comments": 0, 566 "url": "https://news.ycombinator.com/item?id=38011661", 567 "created_at": "2023-10-25T11:29:10Z" 568 }, 569 { 570 "hn_id": "37939342", 571 "title": "Can Large Language Models Explain Themselves? A Study", 572 "points": 1, 573 "comments": 0, 574 "url": "https://news.ycombinator.com/item?id=37939342", 575 "created_at": "2023-10-19T06:41:38Z" 576 } 577 ], 578 "top_points": 6, 579 "total_points": 13, 580 "total_comments": 3 581 } 582 }