scan-v5.json (28882B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "How Far Can We Go with Practical Function-Level Program Repair?", 6 "authors": [ 7 "Jiahong Xiang", 8 "Xiaoyang Xu", 9 "Fanchu Kong", 10 "Mingyuan Wu", 11 "Zizheng Zhan", 12 "Haotian Zhang", 13 "Yuqun Zhang" 14 ], 15 "year": 2024, 16 "venue": "arXiv.org", 17 "arxiv_id": "2404.12833", 18 "doi": "10.48550/arXiv.2404.12833" 19 }, 20 "checklist": { 21 "claims_and_evidence": { 22 "abstract_claims_supported": { 23 "applies": true, 24 "answer": true, 25 "justification": "All major abstract claims — zero-shot superiority (Table 3), few-shot disparity (Table 3), auxiliary info improvements (Table 4), SRepair's 300 correct fixes and 85% gain over ChatRepair (Table 7), and first multi-function repair (Figure 11) — are directly supported by presented data.", 26 "source": "haiku" 27 }, 28 "causal_claims_justified": { 29 "applies": true, 30 "answer": true, 31 "justification": "Causal claims about auxiliary information improving performance are supported by systematic ablation across 8 prompt configurations and 6 LLMs; while not an RCT, the within-model controlled comparisons provide adequate evidence for the claim within the tested setting.", 32 "source": "haiku" 33 }, 34 "generalization_bounded": { 35 "applies": true, 36 "answer": false, 37 "justification": "The title and conclusion make sweeping claims ('How Far Can We Go') but the study is limited to Defects4J (Java) and QuixBugs; the claim that fixing multi-function bugs is the 'first time achieved by any APR technique ever' cannot be independently verified and is stated without comprehensive literature coverage.", 38 "source": "haiku" 39 }, 40 "alternative_explanations_discussed": { 41 "applies": true, 42 "answer": false, 43 "justification": "The paper identifies that bug reports sometimes hurt performance (Closure-66 case) and discusses fault location attention distraction, but does not systematically explore alternative explanations for why the 
dual-LLM CoT approach outperforms — e.g., model size differences, token budget effects are only briefly touched.", 44 "source": "haiku" 45 }, 46 "proxy_outcome_distinction": { 47 "applies": true, 48 "answer": true, 49 "justification": "The paper explicitly distinguishes between plausible patches (pass all tests) and correct patches (semantically equivalent to developer patch via manual inspection), acknowledging that test-passing does not equal correctness.", 50 "source": "haiku" 51 } 52 }, 53 "limitations_and_scope": { 54 "limitations_section_present": { 55 "applies": true, 56 "answer": true, 57 "justification": "Section 6 'Threats to Validity' covers internal, external, and construct validity threats with specific mitigations discussed.", 58 "source": "haiku" 59 }, 60 "threats_to_validity_specific": { 61 "applies": true, 62 "answer": true, 63 "justification": "Specific quantified threats are addressed: data leakage is measured at 7.4‰ identical patches for study and 1.5‰ for SRepair500; manual validation bias is mitigated with three-author cross-validation; trigger test manipulation is addressed via the dual-LLM separation architecture.", 64 "source": "haiku" 65 }, 66 "scope_boundaries_stated": { 67 "applies": true, 68 "answer": false, 69 "justification": "The external validity discussion only states results 'may not well generalize' without stating explicit boundaries — e.g., no statement that results do not apply to dynamically typed languages, non-Java ecosystems, or bugs not captured in academic benchmarks.", 70 "source": "haiku" 71 } 72 }, 73 "conflicts_of_interest": { 74 "funding_disclosed": { 75 "applies": true, 76 "answer": false, 77 "justification": "No funding acknowledgment section or funding statement appears anywhere in the paper.", 78 "source": "haiku" 79 }, 80 "affiliations_disclosed": { 81 "applies": true, 82 "answer": true, 83 "justification": "Author affiliations are clearly listed on the title page: most at Southern University of Science and 
Technology, Zizheng Zhan and Haotian Zhang at Kwai Inc. (industry).", 84 "source": "haiku" 85 }, 86 "funder_independent_of_outcome": { 87 "applies": false, 88 "answer": false, 89 "justification": "No funding is disclosed, making this criterion not applicable.", 90 "source": "haiku" 91 }, 92 "financial_interests_declared": { 93 "applies": true, 94 "answer": false, 95 "justification": "No competing interests or financial interests statement appears in the paper; two authors are from industry (Kwai Inc.) with no disclosure of any potential conflicts.", 96 "source": "haiku" 97 } 98 }, 99 "scope_and_framing": { 100 "key_terms_defined": { 101 "applies": true, 102 "answer": true, 103 "justification": "Key terms are defined precisely: 'function-level APR' (generating entire patched function), 'single-function bugs' vs 'multi-function bugs', 'plausible patches' vs 'correct patches', and 'few-shot learning' with specific k-shot configurations are all defined in context.", 104 "source": "haiku" 105 }, 106 "intended_contribution_clear": { 107 "applies": true, 108 "answer": true, 109 "justification": "Three explicit contributions are bullet-pointed: the empirical study on function-level APR, the findings about zero-shot and auxiliary information, and the SRepair technique with quantified performance claims.", 110 "source": "haiku" 111 }, 112 "engagement_with_prior_work": { 113 "applies": true, 114 "answer": true, 115 "justification": "Section 2 discusses AlphaRepair, Repilot, FitRepair, ChatRepair, and CodexRepair in detail, explicitly noting where their own work diverges (e.g., prior claim that LLMs can't be directly applied to function-level APR is challenged), and comparisons run throughout the evaluation.", 116 "source": "haiku" 117 } 118 } 119 }, 120 "type_checklist": { 121 "empirical": { 122 "artifacts": { 123 "code_released": { 124 "applies": true, 125 "answer": true, 126 "justification": "Data Availability section states 'The data and code are available at GitHub [1] 
for public evaluation' with a live GitHub link (https://github.com/GhabiX/SRepair).", 127 "source": "haiku" 128 }, 129 "data_released": { 130 "applies": true, 131 "answer": true, 132 "justification": "All experiments use Defects4J 1.2 and 2.0 (publicly available standard benchmark) and QuixBugs (publicly available); no proprietary dataset is used.", 133 "source": "haiku" 134 }, 135 "environment_specified": { 136 "applies": true, 137 "answer": false, 138 "justification": "Hardware specs are mentioned (A100 GPUs, Ubuntu 20.04.6 LTS) but no requirements.txt, Dockerfile, or dependency manifest is in the paper; full configuration is deferred to the GitHub page without confirmation of what is there.", 139 "source": "haiku" 140 }, 141 "reproduction_instructions": { 142 "applies": true, 143 "answer": false, 144 "justification": "The paper refers to GitHub for 'experimental results under different configurations' but provides no step-by-step reproduction instructions in the paper itself; a reader could not reproduce from the paper alone.", 145 "source": "haiku" 146 } 147 }, 148 "statistical_methodology": { 149 "confidence_intervals_or_error_bars": { 150 "applies": true, 151 "answer": false, 152 "justification": "All results are reported as raw counts of fixed bugs with no confidence intervals or error bars, despite generating 200 samples per bug which would allow variance estimation.", 153 "source": "haiku" 154 }, 155 "significance_tests": { 156 "applies": true, 157 "answer": false, 158 "justification": "No statistical significance tests are applied to any comparative claims; differences between techniques are stated as raw numbers and percentages without any p-values or effect tests.", 159 "source": "haiku" 160 }, 161 "effect_sizes_reported": { 162 "applies": true, 163 "answer": true, 164 "justification": "Percentage improvements are consistently reported (e.g., 'at least 85% more than ChatRepair', '26.7% improvement', 'decrease of 49.7%') with clear baseline context.", 
165 "source": "haiku" 166 }, 167 "sample_size_justified": { 168 "applies": true, 169 "answer": false, 170 "justification": "No sample size justification or power analysis is provided; the 522 single-function bugs are the full available set from Defects4J rather than a justified sample.", 171 "source": "haiku" 172 }, 173 "variance_reported": { 174 "applies": true, 175 "answer": false, 176 "justification": "200 samples per bug are generated per run, but no variance or standard deviation across runs or seeds is reported for any metric.", 177 "source": "haiku" 178 } 179 }, 180 "evaluation_design": { 181 "baselines_included": { 182 "applies": true, 183 "answer": true, 184 "justification": "Four state-of-the-art baselines are included: AlphaRepair, Repilot, FitRepair, and ChatRepair, plus direct component baselines (GPT-3.5-TurboPI(ALL) and MagicoderPI(ALL)).", 185 "source": "haiku" 186 }, 187 "baselines_contemporary": { 188 "applies": true, 189 "answer": true, 190 "justification": "All baselines are from 2022-2023 and represent the current state-of-the-art LLM-based APR techniques at time of writing; the most recent is ChatRepair (2023).", 191 "source": "haiku" 192 }, 193 "ablation_study": { 194 "applies": true, 195 "answer": true, 196 "justification": "SRepair2M (dual-LLM without CoT) and SRepair2M+FL (with fault location) ablation variants isolate the contribution of the CoT mechanism and fault location information respectively.", 197 "source": "haiku" 198 }, 199 "multiple_metrics": { 200 "applies": true, 201 "answer": true, 202 "justification": "Both plausible patches and correct patches are reported as primary metrics, plus patch status distributions (plausible/test-failure/uncompilable ratios) in Figure 5.", 203 "source": "haiku" 204 }, 205 "human_evaluation": { 206 "applies": true, 207 "answer": true, 208 "justification": "Three authors manually cross-validated all plausible patches from SRepair500 to determine semantic correctness by comparison with developer 
patches.", 209 "source": "haiku" 210 }, 211 "held_out_test_set": { 212 "applies": true, 213 "answer": true, 214 "justification": "The Defects4J test suite (including developer-written trigger tests) serves as a held-out validation set; patches must pass all tests to be considered plausible.", 215 "source": "haiku" 216 }, 217 "per_category_breakdown": { 218 "applies": true, 219 "answer": true, 220 "justification": "Table 7 provides per-project breakdowns across 17 Java projects for all techniques, allowing granular comparison across different software types and bug distributions.", 221 "source": "haiku" 222 }, 223 "failure_cases_discussed": { 224 "applies": true, 225 "answer": true, 226 "justification": "Specific failure cases are analyzed: Closure-66 where bug reports mislead all models, Closure-112 where fault location information over-focuses the model, and CodeLlama's anomalous Python code generation for Java bugs.", 227 "source": "haiku" 228 }, 229 "negative_results_reported": { 230 "applies": true, 231 "answer": true, 232 "justification": "Finding 2 explicitly reports that few-shot learning causes negative impact on performance for some models (up to 49.7% fewer plausible fixes for Magicoder with K1(CE)), which is a prominent negative result.", 233 "source": "haiku" 234 } 235 }, 236 "setup_transparency": { 237 "model_versions_specified": { 238 "applies": true, 239 "answer": true, 240 "justification": "Exact model versions are specified: code-davinci-edit-001, gpt-3.5-turbo-1106, CodeLlama-Instruct series (7B/13B/34B), and MagicoderS-CL-7B.", 241 "source": "haiku" 242 }, 243 "prompts_provided": { 244 "applies": true, 245 "answer": true, 246 "justification": "Figure 3 shows the APR input prompt structure with actual content, Figure 9 shows the CoT prompt structure, and Figure 4 shows example auxiliary information inputs with real bug examples.", 247 "source": "haiku" 248 }, 249 "hyperparameters_reported": { 250 "applies": true, 251 "answer": true, 252 
"justification": "Hyperparameters are explicitly stated: nucleus sampling with top_p=0.9, temperature=0.8, 200 samples per bug; 5-hour time limit per bug for SRepair.", 253 "source": "haiku" 254 }, 255 "scaffolding_described": { 256 "applies": true, 257 "answer": true, 258 "justification": "The dual-LLM framework is described in detail in Section 5: repair suggestion model (GPT-3.5-Turbo with CoT) generates natural language suggestions, patch generation model (Magicoder) generates 5 patched functions per suggestion.", 259 "source": "haiku" 260 }, 261 "data_preprocessing_documented": { 262 "applies": true, 263 "answer": true, 264 "justification": "Section 3.3.3 describes extraction of auxiliary information: bug reports from official Defects4J issue links, trigger tests and error messages automatically extracted by building buggy projects and running JUnit, buggy function comments extracted programmatically.", 265 "source": "haiku" 266 } 267 }, 268 "data_integrity": { 269 "raw_data_available": { 270 "applies": true, 271 "answer": true, 272 "justification": "Both the code and data are on GitHub, and Defects4J is a well-established public benchmark; the generated patches and their validation results are available for inspection.", 273 "source": "haiku" 274 }, 275 "data_collection_described": { 276 "applies": true, 277 "answer": true, 278 "justification": "The benchmark construction is described: 522 single-function bugs identified from all 835 bugs in Defects4J 1.2 and 2.0, with Table 1 providing per-project statistics and criteria for inclusion (bugs existing within one single function).", 279 "source": "haiku" 280 }, 281 "recruitment_methods_described": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants were recruited; the study uses standard software engineering benchmarks.", 285 "source": "haiku" 286 }, 287 "data_pipeline_documented": { 288 "applies": true, 289 "answer": true, 290 "justification": "The full pipeline from bug 
selection → auxiliary information extraction → patch generation → test validation → manual correctness inspection is documented across Sections 3.1-3.3 and 5.2.", 291 "source": "haiku" 292 } 293 }, 294 "contamination": { 295 "training_cutoff_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "The training data cutoff for GPT-3.5-Turbo (gpt-3.5-turbo-1106) and other models is never stated, despite Defects4J being a widely known benchmark that is almost certainly in training data.", 299 "source": "haiku" 300 }, 301 "train_test_overlap_discussed": { 302 "applies": true, 303 "answer": true, 304 "justification": "Data leakage is addressed under Threats to Validity: only 7.4‰ of plausible patches in the study and 1.5‰ in SRepair500 are identical to developer patches, which the authors claim poses negligible impact.", 305 "source": "haiku" 306 }, 307 "benchmark_contamination_addressed": { 308 "applies": true, 309 "answer": false, 310 "justification": "While patch overlap is measured, broader benchmark contamination (the model having learned from similar Defects4J bugfixes during pretraining) is not discussed; the 7.4‰ exact match metric does not capture learned patterns from similar fixes.", 311 "source": "haiku" 312 } 313 }, 314 "human_studies": { 315 "pre_registered": { 316 "applies": false, 317 "answer": false, 318 "justification": "No human participants in the study.", 319 "source": "haiku" 320 }, 321 "irb_or_ethics_approval": { 322 "applies": false, 323 "answer": false, 324 "justification": "No human participants in the study.", 325 "source": "haiku" 326 }, 327 "demographics_reported": { 328 "applies": false, 329 "answer": false, 330 "justification": "No human participants in the study.", 331 "source": "haiku" 332 }, 333 "inclusion_exclusion_criteria": { 334 "applies": false, 335 "answer": false, 336 "justification": "No human participants in the study.", 337 "source": "haiku" 338 }, 339 "randomization_described": { 340 "applies": false, 341 
"answer": false, 342 "justification": "No human participants in the study.", 343 "source": "haiku" 344 }, 345 "blinding_described": { 346 "applies": false, 347 "answer": false, 348 "justification": "No human participants in the study.", 349 "source": "haiku" 350 }, 351 "attrition_reported": { 352 "applies": false, 353 "answer": false, 354 "justification": "No human participants in the study.", 355 "source": "haiku" 356 } 357 }, 358 "cost_and_practicality": { 359 "inference_cost_reported": { 360 "applies": true, 361 "answer": true, 362 "justification": "API cost is explicitly reported: 'repairing 300 single-function bugs with SRepair costs only $8.6, averaging $0.029 per correct fix.'", 363 "source": "haiku" 364 }, 365 "compute_budget_stated": { 366 "applies": true, 367 "answer": true, 368 "justification": "Total compute for the empirical study is explicitly stated: 'over 10 million patches are generated and validated, consuming more than 8,000 GPU and 100,000 CPU hours.'", 369 "source": "haiku" 370 } 371 } 372 } 373 }, 374 "claims": [ 375 { 376 "claim": "LLMs with zero-shot learning are already powerful function-level APR techniques, achieving 180 average plausible fixes on 522 single-function bugs.", 377 "evidence": "Table 3 shows K0(Basic) achieves the best average plausible fix count (180) across all 6 LLMs, outperforming all few-shot variants.", 378 "supported": "strong" 379 }, 380 { 381 "claim": "Few-shot learning leads to disparate and often negative impact on function-level APR performance, ranging from +10% to -49.7%.", 382 "evidence": "Table 3 documents per-model impacts: CodeLlama 34B gains 10% with K1(CE) while Magicoder drops 49.7% with K2(CE,PE) vs K0(Basic).", 383 "supported": "strong" 384 }, 385 { 386 "claim": "Auxiliary repair-relevant information significantly enhances function-level repair performance, with PI(ALL) achieving 254 and BR(ALL) achieving 273 average plausible fixes vs 180 baseline.", 387 "evidence": "Table 4 shows consistent 
improvements across all 6 LLMs under both bug report and project-specific information setups.", 388 "supported": "strong" 389 }, 390 { 391 "claim": "SRepair correctly fixes 300 single-function bugs, surpassing all previous APR techniques by at least 85% (vs ChatRepair's 162 correct fixes).", 392 "evidence": "Table 7 shows SRepair500 achieves 300 correct fixes vs ChatRepair's 162; Repilot reaches 254 plausible fixes but fewer correct ones.", 393 "supported": "strong" 394 }, 395 { 396 "claim": "SRepair successfully fixes 32 multi-function bugs, a result achieved for the first time by any APR technique.", 397 "evidence": "Figure 11 shows SRepair500 achieves 53 plausible and 32 correct multi-function bug fixes; prior baselines are not systematically compared on multi-function bugs.", 398 "supported": "moderate" 399 }, 400 { 401 "claim": "Statement-level fault location information becomes less valuable when rich auxiliary project-specific information is available, providing only 7.1% additional improvement over PI(ALL).", 402 "evidence": "Table 5 shows FL information adds 20.6% for K0(Basic) but only 7.1% for PI(ALL), demonstrating diminishing returns as auxiliary information increases.", 403 "supported": "strong" 404 } 405 ], 406 "methodology_tags": [ 407 "benchmark-eval", 408 "empirical" 409 ], 410 "key_findings": "LLMs can effectively perform function-level APR under zero-shot learning, contradicting the prior assumption that few-shot examples are required; few-shot learning actually hurts performance for several models by generating more uncompilable patches. Auxiliary project-specific information (trigger tests, error messages, function comments) significantly improves repair performance and can substitute for costly statement-level fault localization. The proposed SRepair dual-LLM framework with Chain of Thought reasoning achieves 300 correct single-function bug fixes on Defects4J, an 85% improvement over the prior state-of-the-art ChatRepair, at only $0.029 per correct fix. 
SRepair is reported as the first APR technique to successfully fix multi-function bugs, fixing 32 such bugs.", 411 "red_flags": [ 412 { 413 "flag": "No statistical significance testing", 414 "detail": "All comparative claims are made based on raw counts and percentages with no significance tests, confidence intervals, or variance across runs, despite generating 200 samples per bug which would enable such analysis." 415 }, 416 { 417 "flag": "Benchmark contamination understated", 418 "detail": "Defects4J is a widely known 2014 benchmark almost certainly in GPT-3.5's training data. The paper measures only exact patch overlap (7.4‰) but does not address whether the model learned from similar bugfixes during pretraining, which could inflate all GPT-3.5-Turbo results." 419 }, 420 { 421 "flag": "Unverifiable 'first ever' claim", 422 "detail": "The claim that fixing multi-function bugs is 'the first time achieved by any APR technique ever' cannot be independently verified and relies entirely on the authors' knowledge of prior literature." 423 }, 424 { 425 "flag": "No funding disclosure", 426 "detail": "Two authors are from Kwai Inc. (industry) but no funding or competing interests are disclosed anywhere in the paper." 427 }, 428 { 429 "flag": "Manual patch inspection not blinded", 430 "detail": "Three authors cross-validated plausible patches for SRepair500, but the authors are the same people who built SRepair, introducing potential confirmation bias in correctness assessment." 
431 } 432 ], 433 "cited_papers": [ 434 { 435 "title": "Keep the Conversation Going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT", 436 "relevance": "Primary baseline comparison; SRepair claims 85% improvement over this ChatRepair approach" 437 }, 438 { 439 "title": "Copiloting the copilots: Fusing large language models with completion engines for automated program repair (Repilot)", 440 "relevance": "Key baseline; SRepair claims 1.59x improvement over Repilot" 441 }, 442 { 443 "title": "Automated program repair in the era of large pre-trained language models", 444 "relevance": "Prior work establishing few-shot learning approach for function-level APR; this paper challenges their claim that zero-shot is ineffective" 445 }, 446 { 447 "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning (AlphaRepair)", 448 "relevance": "Key baseline using cloze-style repair" 449 }, 450 { 451 "title": "Revisiting the Plastic Surgery Hypothesis via Large Language Models (FitRepair)", 452 "relevance": "Key baseline combining domain-specific fine-tuning and prompting" 453 }, 454 { 455 "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs", 456 "relevance": "Primary benchmark used for all experiments" 457 }, 458 { 459 "title": "Chain-of-thought prompting elicits reasoning in large language models", 460 "relevance": "Core technique (CoT) used in SRepair's repair suggestion model" 461 }, 462 { 463 "title": "MagiCoder: Source code is all you need", 464 "relevance": "Open-source model used as patch generation backbone in SRepair" 465 }, 466 { 467 "title": "Automated repair of programs from large language models", 468 "relevance": "Related work on APR for LLM-generated code errors" 469 }, 470 { 471 "title": "QuixBugs: A multi-lingual program repair benchmark set based on the Quixey Challenge", 472 "relevance": "Secondary benchmark used to validate SRepair's cross-language 
generalizability" 473 } 474 ], 475 "engagement_factors": { 476 "practical_relevance": { 477 "score": 3, 478 "justification": "Directly actionable: SRepair is open-source, costs $0.029 per fix, requires no costly fault localization, and achieves strong results on a widely-used benchmark." 479 }, 480 "surprise_contrarian": { 481 "score": 2, 482 "justification": "Counter-intuitive finding that few-shot learning hurts performance, and that free-form project information can nearly match expensive statement-level fault location, challenges common assumptions in the APR field." 483 }, 484 "fear_safety": { 485 "score": 0, 486 "justification": "No AI safety, risk, or misuse concerns are raised; the paper is focused on a beneficial software engineering application." 487 }, 488 "drama_conflict": { 489 "score": 1, 490 "justification": "Claims of 'first ever' multi-function repair and 85% improvement over prior SOTA are notable but are standard comparative framing in the SE literature." 491 }, 492 "demo_ability": { 493 "score": 2, 494 "justification": "GitHub repo is publicly available and uses API-accessible models (GPT-3.5) plus open-source Magicoder, making the approach reproducible." 495 }, 496 "brand_recognition": { 497 "score": 1, 498 "justification": "Uses GPT-3.5-Turbo (OpenAI) as a component, and Kwai Inc. industry affiliation, but no high-profile lab association." 
499 } 500 }, 501 "hn_data": { 502 "threads": [ 503 { 504 "hn_id": "39562254", 505 "title": "On the Ocean Conditions of Hycean Worlds", 506 "points": 18, 507 "comments": 7, 508 "url": "https://news.ycombinator.com/item?id=39562254" 509 }, 510 { 511 "hn_id": "38869223", 512 "title": "Show HN: RAGatouille, a simple lib to use&train top retrieval models in RAG apps", 513 "points": 15, 514 "comments": 5, 515 "url": "https://news.ycombinator.com/item?id=38869223" 516 }, 517 { 518 "hn_id": "46130726", 519 "title": "Hardness of observing strong-to-weak symmetry breaking", 520 "points": 7, 521 "comments": 0, 522 "url": "https://news.ycombinator.com/item?id=46130726" 523 }, 524 { 525 "hn_id": "43851789", 526 "title": "\"It Listens Better Than My Therapist\": Discourse on LLMs and Mental Health", 527 "points": 4, 528 "comments": 0, 529 "url": "https://news.ycombinator.com/item?id=43851789" 530 }, 531 { 532 "hn_id": "41357036", 533 "title": "Melody predominates over harmony in the evolution of musical scales", 534 "points": 2, 535 "comments": 0, 536 "url": "https://news.ycombinator.com/item?id=41357036" 537 }, 538 { 539 "hn_id": "41140952", 540 "title": "Cabin: Confining Untrusted Programs Within Confidential VMs", 541 "points": 2, 542 "comments": 0, 543 "url": "https://news.ycombinator.com/item?id=41140952" 544 }, 545 { 546 "hn_id": "40922111", 547 "title": "Can Go AIs be adversarially robust?", 548 "points": 2, 549 "comments": 0, 550 "url": "https://news.ycombinator.com/item?id=40922111" 551 }, 552 { 553 "hn_id": "40119920", 554 "title": "Combining Power and Arithmetic Optimization via Datapath Rewriting", 555 "points": 1, 556 "comments": 0, 557 "url": "https://news.ycombinator.com/item?id=40119920" 558 }, 559 { 560 "hn_id": "41022306", 561 "title": "Bright: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval", 562 "points": 1, 563 "comments": 0, 564 "url": "https://news.ycombinator.com/item?id=41022306" 565 } 566 ], 567 "top_points": 18, 568 
"total_points": 52, 569 "total_comments": 12 570 } 571 }