scan-v5.json (28897B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Input Reduction Enhanced LLM-based Program Repair", 6 "authors": [ 7 "Boyang Yang", 8 "Luyao Ren", 9 "Xin Yin", 10 "Jiadong Ren", 11 "Haoye Tian" 12 ], 13 "year": 2025, 14 "venue": "arXiv.org", 15 "arxiv_id": "2507.15251", 16 "doi": "10.48550/arXiv.2507.15251" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "All abstract claims are verified: 89.1% average input reduction (Table 3), 53.8% relative pass@10 improvement over Origin Test (GLM overall 6.5%→10.0%), 17.6% over Baseline (GLM 8.5%→10.0%), ChatRepair +21.3% and CREF +2.6% (Tables 10–11).", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "The paper makes causal claims that input reduction improves repair accuracy; these are backed by controlled ablations (RQ-3 isolating length vs. information), statistical MWW tests (p<0.05), and plug-in integration experiments that hold all other variables fixed.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": true, 35 "justification": "Claims are explicitly bounded to LLM-based APR with long failure-inducing test inputs; the threats-to-validity section acknowledges the competitive-programming domain and validates on OSS-Fuzz for broader applicability, without overclaiming universal generalization.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": true, 41 "justification": "RQ-3 specifically investigates whether gains come from shorter prompts alone vs. preserved failure information; five prompt variants (Diff Lines, Reduced+Origin, etc.) 
systematically rule out length-only explanations.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "pass@k is defined as patch correctness against the full official test suite; the paper does not conflate this metric with broader software quality or developer productivity, staying within the measured granularity.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "Section 6 'Threats to Validity' contains three dedicated subsections (Internal, Construct, External validity), each with specific threats and mitigations.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": true, 61 "justification": "Specific threats are identified: compression ratio reaching 100% hiding variation (addressed by reporting both mean and median), dataset restriction to AtCoder (mitigated with OSS-Fuzz), stochasticity addressed by pass@k sampling — these are concrete, not boilerplate.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": true, 67 "justification": "The paper explicitly scopes evaluation to long failure-inducing inputs and notes 'ReduceFix might benefit only the pipeline evaluated in this study and fail to transfer,' testing transferability via ChatRepair and CREF plug-in experiments.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": true, 75 "justification": "Acknowledgments section lists three explicit funding sources: National Natural Science Foundation of China (62273292), Central Leading Local Science and Technology Development Project of Hebei Province (246Z0804G), and Hebei Innovation Capability Improvement Plan Project (22567626H).", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 
"justification": "All author affiliations are listed on the first page: Yanshan University, Peking University, Zhejiang University, Aalto University, and Yanshan University.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": true, 86 "answer": true, 87 "justification": "All funders are Chinese government science foundations with no commercial interest in the ReduceFix tool or the LLMs evaluated.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests or conflicts-of-interest statement is included anywhere in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "Key terms are defined: APR is introduced in the opening sentence, 'lost-in-the-middle' is explained with citation, compression rate is formally defined in Eq. 2, and pass@k is defined with its formula in Section 4.3.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Section 1 enumerates four explicit contributions: (1) ReduceFix framework, (2) LFTBench benchmark, (3) comprehensive evaluation results, (4) plug-in integration into ChatRepair and CREF.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 7 provides a structured related work covering LLM-based APR and test input reduction, explicitly positioning ReduceFix as the first approach combining both and showing how it differs from ddmin-family methods, HDD, Perses, and LPR.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": true, 124 "justification": "Section 4.4 explicitly states 'the full artifact is published at https://github.com/GLEAM-Lab/ReduceFix,' 
and the README is referenced for reducer visualizations.", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "LFTBench and LFTBench-Py are stated as released in the contributions section; the GitHub artifact URL implies data availability alongside code.", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "No requirements.txt, Dockerfile, or dependency specification is provided in the paper; only hyperparameters (Table 2) and hardware category ('single 24 GB consumer GPU') are mentioned.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "No step-by-step reproduction instructions appear in the paper itself; the GitHub README is referenced but not its content, and the paper provides only algorithm pseudocode.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "All pass@k results are reported as single point estimates in tables; no confidence intervals or error bars appear anywhere in the paper.", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": true, 156 "justification": "Section 5.3 applies two-sided Mann-Whitney-Wilcoxon tests to compare ReduceFix against Origin Test, reporting p < 0.05 for key comparisons including the ddmin-only gap.", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Both absolute and relative improvements are consistently reported (e.g., 'pass@10 rises from 30.5% to 37.0%, an absolute gain of 6.5 percentage points and a relative gain of 21.3%').", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "The 200-bug, 20-task sample 
size is motivated by the availability of AtCoder data after LLM cutoff dates, but no power analysis or statistical justification for sufficiency is provided.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "Pass@k figures are reported as single values with no standard deviation, confidence intervals, or run-to-run variance; the reducer uses temperature=0 (deterministic), but repair sampling variance is not reported.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "Multiple baselines are tested: Baseline (no test), Origin Test (full test), ddmin-only, and pure-LLM reduction, plus ChatRepair and CREF with and without ReduceFix.", 183 "source": "haiku" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "ChatRepair (ISSTA 2024) and CREF (ISSTA 2024) are recent and competitive APR systems; ddmin is classical but appropriate as a known algorithm baseline for the reduction task.", 189 "source": "haiku" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": true, 194 "justification": "RQ-3 (Section 5.4) is an explicit ablation with five prompt variants to isolate the contributions of length reduction vs. 
information selection.", 195 "source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "The paper reports pass@1, pass@5, pass@10, reduction success rate, mean and median compression rate, and token/cost comparisons.", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 "applies": false, 205 "answer": false, 206 "justification": "Automated test suite evaluation is the appropriate measure for APR; no human evaluation is needed or applicable.", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": true, 212 "justification": "The full official AtCoder hidden test archive is used for final validation; the failure-inducing input used in the prompt is distinct from the full test suite used to judge patch correctness.", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Results are broken down by task difficulty (C, D, E&F in Table 6) and by input format (6 categories in Table 4), as well as by project for OSS-Fuzz (Tables 12–13).", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Section 5.2 includes a dedicated 'Failed Case Study' for ABC372E where vertex renumbering masked the defect, with a proposed fix described in detail.", 225 "source": "haiku" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "The paper explicitly reports that including the full failing test (Origin Test) hurts performance relative to no test for GLM-4-9B-chat (8.5%→6.5%) and DeepSeek-V3 (66.5%→63.0%).", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": false, 238 "justification": "Model names are given (Qwen2.5-Coder-7B-instruct, GLM-4-9B-chat, DeepSeek-V3, Qwen2.5-Plus) but no API snapshot dates or commit hashes are 
provided; Qwen2.5-Plus is a cloud service with non-disclosed parameter count and no versioning timestamp.", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": true, 244 "justification": "Listing 1 provides the full one-shot reducer prompt template with all placeholder variables labeled; the repair prompt structure is also described in Section 3.4.", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": true, 250 "justification": "Table 2 lists all hyperparameters: temperature 0.0 for reduction, 0.8 for repair, 60s wall-clock limit, 10s compilation timeout, 5s execution timeout per test case, k∈{1,5,10}.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": true, 255 "answer": true, 256 "justification": "Algorithm 1 provides the complete three-stage pipeline control logic; Sections 3.2–3.4 describe each stage in detail including reducer generation, iterative reduction, and patch validation loops.", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "Section 4.2 describes the full benchmark construction: AtCoder ABC 361–377 selection, test size filter (≥4 KB), difficulty filter (C–F), manual collection of wrong-answer submissions before July 1, 2025.", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": true, 270 "justification": "The GitHub artifact at GLEAM-Lab/ReduceFix is stated to contain the full artifact, including LFTBench and LFTBench-Py; OSS-Fuzz data reuses publicly available ARVO scripts.", 271 "source": "haiku" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "Section 4.2 describes the collection procedure in detail: AtCoder tasks selected by contest number range, difficulty level, test file size threshold, and manual identification of failing 
submissions before a cutoff date.", 277 "source": "haiku" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants; LFTBench is constructed from publicly available AtCoder contest data with no subject recruitment.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "The pipeline from AtCoder test archives → filtering → manual submission collection → benchmark packaging is described; ARVO's Docker-based data and scripts are reused for OSS-Fuzz.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": true, 295 "answer": true, 296 "justification": "Section 4.2 explicitly states LFTBench covers ABC 361–377, 'a span entirely after the knowledge cut-offs of the 4 LLMs we evaluate,' directly addressing training cutoffs.", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": true, 301 "answer": true, 302 "justification": "Section 4.2 explicitly motivates the post-cutoff design by noting that existing benchmarks 'were released years ago' and 'large language models have almost certainly seen,' providing quantitative context for the leakage concern.", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": true, 307 "answer": true, 308 "justification": "For OSS-Fuzz, the paper acknowledges potential overlap but argues relative comparisons remain valid since the same LLM and instances are held fixed across strategies; LFTBench is explicitly designed to be post-cutoff.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants.", 317 "source": "haiku" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants.", 323 "source": "haiku" 324 }, 325 
"demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants.", 329 "source": "haiku" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants.", 335 "source": "haiku" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": true, 360 "justification": "Table 5 reports exact API costs: ReduceFix costs $0.017 vs. pure-LLM $0.632 on 20 problems; Section 4.1 lists per-token pricing for Qwen2.5-Plus and DeepSeek-V3.", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": false, 366 "justification": "Only API costs for one reduction comparison are reported; total compute for all experiments across 200 bugs × 4 LLMs × 3 conditions is not stated, and local GPU experiments have no wall-clock totals.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "ReduceFix successfully reduces 95% of 200 LFTBench bugs with 89.1% average compression rate", 375 "evidence": "Table 3 reports 95.0% overall success rate and mean/median compression of 89.1%/100.0%; all 200 syntactically valid reducers were generated.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Providing reduced test inputs improves pass@10 by up to 53.8% relative to full test inputs across LLMs", 380 "evidence": "Table 6: GLM-4-9B-chat improves from 6.5% (Origin Test) to 10.0% (Reduced Test) overall pass@10, a 53.8% relative 
gain; gains are consistent across all 4 LLMs.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Including the full unmodified failing test often hurts repair accuracy below the no-test baseline", 385 "evidence": "Table 6: GLM-4-9B-chat drops from 8.5% (Baseline) to 6.5% (Origin Test) pass@10; DeepSeek-V3 drops from 66.5% to 63.0%.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Both compact length and complete failure evidence are required; neither alone suffices", 390 "evidence": "Table 9 (RQ-3): Diff Lines (sparse info, short prompt) achieves 20.0% pass@10; Reduced+Origin (complete info, long prompt) achieves 19.0%; Reduced Test (both) achieves 25.5%.", 391 "supported": "strong" 392 }, 393 { 394 "claim": "ReduceFix integrates as a drop-in plug-in and improves ChatRepair by 21.3% and CREF by 2.6% relative pass@10", 395 "evidence": "Tables 10–11: ChatRepair 30.5%→37.0% (+21.3% relative); CREF 39.0%→40.0% (+2.6% relative) on LFTBench.", 396 "supported": "strong" 397 }, 398 { 399 "claim": "ReduceFix outperforms both ddmin-only (35.5% success) and pure-LLM (40.0% success) reduction baselines", 400 "evidence": "Table 3 reports ddmin-only 35.5%, pure-LLM 40.0%, ReduceFix 95.0% overall reduction success rate.", 401 "supported": "strong" 402 }, 403 { 404 "claim": "ReduceFix generalizes to repository-level OSS-Fuzz crashes, improving pass@10 from 16.7% (Origin Test) to 41.7%", 405 "evidence": "Table 13: micro-average pass@10 rises from 16.7% (Origin Test) and 25.0% (Baseline) to 41.7% (Reduced Test) on 12 OSS-Fuzz instances with Qwen2.5-Plus.", 406 "supported": "moderate" 407 } 408 ], 409 "methodology_tags": [ 410 "benchmark-eval", 411 "empirical" 412 ], 413 "key_findings": "ReduceFix demonstrates that automatically reducing long failure-inducing test inputs before LLM-based repair substantially improves patch accuracy: inputs shrink by 89.1% on average with 95% success, and pass@10 improves by up to 53.8% relative over using the full test. 
Crucially, including unmodified long test inputs often hurts performance below the no-test baseline, confirming the 'lost-in-the-middle' effect. Ablation studies show that both prompt length reduction and preserved failure information are independently necessary—neither alone achieves the full gain. The approach integrates as a drop-in component for existing APR systems (ChatRepair +21.3%, CREF +2.6%) and generalizes to OSS-Fuzz repository-level crashes.", 414 "red_flags": [ 415 { 416 "flag": "Tiny OSS-Fuzz dataset", 417 "detail": "Repository-level generalization claims are based on only 12 OSS-Fuzz instances across 5 projects; this is too small to support confident conclusions about real-world applicability." 418 }, 419 { 420 "flag": "No confidence intervals on pass@k", 421 "detail": "All pass@k results are point estimates with no variance, confidence intervals, or error bars; statistical significance is tested for one comparison (RQ-2) but not reported for most tables." 422 }, 423 { 424 "flag": "Cloud model versions unpinned", 425 "detail": "Qwen2.5-Plus is a cloud API service with no snapshot date or version identifier; results may not be reproducible if the underlying model is updated." 426 }, 427 { 428 "flag": "Competitive programming domain gap", 429 "detail": "LFTBench is entirely AtCoder competitive programming problems, which have reference solutions and exact output oracles not present in most real-world software bugs; the benchmark may not reflect typical APR scenarios." 430 }, 431 { 432 "flag": "Marginal gains for strong models", 433 "detail": "For DeepSeek-V3, improvement over Baseline is 45.2%→45.9% pass@1 and 66.5%→67.0% pass@10 — within statistical noise range despite the MWW test; the approach appears most valuable for weaker models." 
434 } 435 ], 436 "cited_papers": [ 437 { 438 "title": "Automated Program Repair via Conversation: Fixing 162 out of 337 Bugs for $0.42 Each Using ChatGPT", 439 "relevance": "Primary baseline system (ChatRepair) integrated with ReduceFix; demonstrates conversational APR with test feedback." 440 }, 441 { 442 "title": "CREF: An LLM-based Conversational Software Repair Framework for Programming Tutors", 443 "relevance": "Second baseline system integrated with ReduceFix; representative of conversational repair with tutor guidance." 444 }, 445 { 446 "title": "Lost in the Middle: How Language Models Use Long Contexts", 447 "relevance": "Foundational motivation for ReduceFix: documents the attention degradation in long prompts that ReduceFix targets." 448 }, 449 { 450 "title": "Simplifying and Isolating Failure-Inducing Input (ddmin)", 451 "relevance": "Classical delta debugging algorithm that ReduceFix builds upon; the paper's LLM generates task-specific adaptations of ddmin." 452 }, 453 { 454 "title": "LPR: Large Language Models-Aided Program Reduction", 455 "relevance": "Closest prior work to ReduceFix's approach; focuses on source code reduction rather than arbitrary test input formats." 456 }, 457 { 458 "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs", 459 "relevance": "Standard APR benchmark the paper explicitly contrasts with LFTBench, noting its short test inputs and leakage risk." 460 }, 461 { 462 "title": "Perses: Syntax-Guided Program Reduction", 463 "relevance": "Grammar-based reduction baseline that motivates ReduceFix's LLM-based approach to handle diverse input formats." 464 }, 465 { 466 "title": "ARVO: Atlas of Reproducible Vulnerabilities for Open Source Software", 467 "relevance": "Provides the OSS-Fuzz data, scripts, and Docker images used for the repository-level validation experiments." 
468 }, 469 { 470 "title": "Automated Repair of Programs from Large Language Models", 471 "relevance": "Representative LLM-based APR work; evaluates LLMs on programs they generated, establishing state of the art." 472 }, 473 { 474 "title": "Agentless: Demystifying LLM-based Software Engineering Agents", 475 "relevance": "Repository-level repair framework cited for its SEARCH/REPLACE patch format used in the OSS-Fuzz experiments." 476 } 477 ], 478 "engagement_factors": { 479 "practical_relevance": { 480 "score": 3, 481 "justification": "ReduceFix is a drop-in component for existing APR pipelines with a public artifact, directly usable by practitioners building or extending LLM-based repair systems." 482 }, 483 "surprise_contrarian": { 484 "score": 2, 485 "justification": "The finding that adding more test information (full failing test) often hurts repair accuracy below the no-test baseline is counterintuitive and challenges common APR prompt design assumptions." 486 }, 487 "fear_safety": { 488 "score": 0, 489 "justification": "The paper addresses software reliability tooling with no safety, alignment, or misuse implications." 490 }, 491 "drama_conflict": { 492 "score": 1, 493 "justification": "Mild tension with prior APR work that includes full test inputs by default; no major controversy." 494 }, 495 "demo_ability": { 496 "score": 2, 497 "justification": "GitHub artifact is released with LFTBench benchmark; practitioners can run the pipeline on the benchmark, though API keys and AtCoder problem access are required." 498 }, 499 "brand_recognition": { 500 "score": 1, 501 "justification": "Authors from Yanshan University, Peking University, Zhejiang University, and Aalto University — credible institutions but not famous AI labs; no well-known product affiliation." 
502 } 503 }, 504 "hn_data": { 505 "threads": [ 506 { 507 "hn_id": "44309345", 508 "title": "Reasoning by Superposition: A Perspective on Chain of Continuous Thought", 509 "points": 60, 510 "comments": 1, 511 "url": "https://news.ycombinator.com/item?id=44309345" 512 }, 513 { 514 "hn_id": "44996548", 515 "title": "The JWST Rocky Worlds DDT Program reveals GJ 3929B to likely be a bare rock", 516 "points": 18, 517 "comments": 0, 518 "url": "https://news.ycombinator.com/item?id=44996548" 519 }, 520 { 521 "hn_id": "44047804", 522 "title": "Code Improvement Practices at Meta", 523 "points": 4, 524 "comments": 0, 525 "url": "https://news.ycombinator.com/item?id=44047804" 526 }, 527 { 528 "hn_id": "45300655", 529 "title": "Generalizable Geometric Image Caption Synthesis", 530 "points": 3, 531 "comments": 0, 532 "url": "https://news.ycombinator.com/item?id=45300655" 533 }, 534 { 535 "hn_id": "36942453", 536 "title": "Open Problems and Fundamental Limitations of RLHF", 537 "points": 3, 538 "comments": 0, 539 "url": "https://news.ycombinator.com/item?id=36942453" 540 }, 541 { 542 "hn_id": "44324675", 543 "title": "ProtoReasoning: Prototypes as the Foundation for Generalizable Reasoning in LLMs", 544 "points": 2, 545 "comments": 0, 546 "url": "https://news.ycombinator.com/item?id=44324675" 547 }, 548 { 549 "hn_id": "43781749", 550 "title": "A Comprehensive Benchmark for C-to-Safe-Rust Transpilation", 551 "points": 2, 552 "comments": 0, 553 "url": "https://news.ycombinator.com/item?id=43781749" 554 }, 555 { 556 "hn_id": "43776339", 557 "title": "The Bitter Lesson Learned from 2k Multilingual Benchmarks", 558 "points": 2, 559 "comments": 0, 560 "url": "https://news.ycombinator.com/item?id=43776339" 561 }, 562 { 563 "hn_id": "45537808", 564 "title": "The role of non–metricity on neutrino behavior in bumblebee gravity", 565 "points": 1, 566 "comments": 0, 567 "url": "https://news.ycombinator.com/item?id=45537808" 568 }, 569 { 570 "hn_id": "44971896", 571 "title": "OS-R1: Agentic 
Operating System Kernel Tuning with Reinforcement Learning", 572 "points": 1, 573 "comments": 0, 574 "url": "https://news.ycombinator.com/item?id=44971896" 575 } 576 ], 577 "top_points": 60, 578 "total_points": 96, 579 "total_comments": 1 580 } 581 }