scan-v5.json (28695B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "The Impact of Fine-Tuning Large Language Models on Automated Program Repair", 6 "authors": [ 7 "Roman Machácek", 8 "Anastasiia Grishina", 9 "Max Hort", 10 "Leon Moonen" 11 ], 12 "year": 2025, 13 "venue": "IEEE International Conference on Software Maintenance and Evolution", 14 "arxiv_id": "2507.19909", 15 "doi": "10.1109/ICSME64153.2025.00042" 16 }, 17 "checklist": { 18 "claims_and_evidence": { 19 "abstract_claims_supported": { 20 "applies": true, 21 "answer": true, 22 "justification": "The abstract claims that full fine-tuning decreases benchmarking performance and PEFT achieves better results — both are directly supported by Tables III and V, which show degradation for DeepSeekCoder/StarCoder under full FT and improvements under LoRA.", 23 "source": "haiku" 24 }, 25 "causal_claims_justified": { 26 "applies": true, 27 "answer": true, 28 "justification": "The paper makes causal claims about fine-tuning affecting APR performance using a controlled experimental design that compares the same models across three conditions (no FT, full FT, PEFT) on identical benchmarks, which is adequate for this type of causal inference.", 29 "source": "haiku" 30 }, 31 "generalization_bounded": { 32 "applies": true, 33 "answer": true, 34 "justification": "Conclusions are bounded to the six selected Java-focused LLMs and three APR benchmarks; the threats-to-validity section explicitly notes that QuixBugs/HumanEval-Java contain simple bugs not representative of complex real-world bugs.", 35 "source": "haiku" 36 }, 37 "alternative_explanations_discussed": { 38 "applies": true, 39 "answer": true, 40 "justification": "The paper discusses multiple alternative explanations for performance degradation under full fine-tuning: data distribution mismatch between CLM and benchmark datasets, overfitting, and model size constraints.", 41 "source": "haiku" 42 }, 43 "proxy_outcome_distinction": { 44 "applies": true, 45 
"answer": true, 46 "justification": "The paper explicitly distinguishes between plausibility (passes all tests) and correctness, stating 'Plausibility shows whether a patch passes all available tests but is not a guarantee of its correctness,' and uses this as motivation for also reporting CodeBLEU and exact match.", 47 "source": "haiku" 48 } 49 }, 50 "limitations_and_scope": { 51 "limitations_section_present": { 52 "applies": true, 53 "answer": true, 54 "justification": "Section IV-E 'Threats to Validity' is a dedicated section covering internal and external validity concerns including benchmark representativeness, versioning issues, data distribution mismatch, data leakage, and plausibility vs. correctness.", 55 "source": "haiku" 56 }, 57 "threats_to_validity_specific": { 58 "applies": true, 59 "answer": true, 60 "justification": "Threats are specific: e.g., 'benchmarks like HumanEval-Java and QuixBugs were created from simple projects and consist of bugs that are not representative of complex real-world bugs,' and specific mention of Java versioning causing result differences from Jiang et al.", 61 "source": "haiku" 62 }, 63 "scope_boundaries_stated": { 64 "applies": true, 65 "answer": true, 66 "justification": "The paper is explicit that results apply to Java programs, the six selected LLMs, and three specific benchmarks; the threats section acknowledges that larger and more complex datasets would be needed to represent real-world bugs.", 67 "source": "haiku" 68 } 69 }, 70 "conflicts_of_interest": { 71 "funding_disclosed": { 72 "applies": true, 73 "answer": true, 74 "justification": "Acknowledgments explicitly disclose funding from the Research Council of Norway (secureIT #288787) and European Union Horizon Europe Marie Skłodowska-Curie Actions (#101151798).", 75 "source": "haiku" 76 }, 77 "affiliations_disclosed": { 78 "applies": true, 79 "answer": true, 80 "justification": "All four authors' affiliations are disclosed (University of Bern, Simula Research 
Laboratory) — academic institutions with no commercial interest in the evaluated models.", 81 "source": "haiku" 82 }, 83 "funder_independent_of_outcome": { 84 "applies": true, 85 "answer": true, 86 "justification": "Funders (Research Council of Norway, EU) are public/governmental bodies independent of the commercial LLMs (CodeLlama, DeepSeekCoder, StarCoder) being evaluated.", 87 "source": "haiku" 88 }, 89 "financial_interests_declared": { 90 "applies": true, 91 "answer": false, 92 "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) is present in the paper; only funding acknowledgments are provided.", 93 "source": "haiku" 94 } 95 }, 96 "scope_and_framing": { 97 "key_terms_defined": { 98 "applies": true, 99 "answer": true, 100 "justification": "APR, LLMs, PEFT, LoRA, and IA3 are all defined and explained in Section II with mathematical formulations; the paper also defines evaluation outcomes (plausible, timeout, uncompilable, wrong, unknown).", 101 "source": "haiku" 102 }, 103 "intended_contribution_clear": { 104 "applies": true, 105 "answer": true, 106 "justification": "Section I lists five explicit contributions (baseline establishment, full FT assessment, PEFT effects, LoRA hyperparameter analysis, replication package) with bullet points.", 107 "source": "haiku" 108 }, 109 "engagement_with_prior_work": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper explicitly builds on Jiang et al. [8], numerically compares results with Li et al. [58] where models overlap, and situates its novel contribution (PEFT for APR with different fine-tuning data) relative to concurrent work by Huang et al. 
[59].", 113 "source": "haiku" 114 } 115 } 116 }, 117 "type_checklist": { 118 "empirical": { 119 "artifacts": { 120 "code_released": { 121 "applies": true, 122 "answer": true, 123 "justification": "A replication package with code and results is publicly available at https://doi.org/10.5281/zenodo.16359186, explicitly cited twice in the paper.", 124 "source": "haiku" 125 }, 126 "data_released": { 127 "applies": true, 128 "answer": true, 129 "justification": "All three benchmarks (Defects4J, QuixBugs, HumanEval-Java) are publicly available; the CLM fine-tuning dataset is from a public GitHub repo (lin-tan/clm).", 130 "source": "haiku" 131 }, 132 "environment_specified": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper mentions using Hugging Face and A100/V100 GPUs but provides no requirements.txt, Dockerfile, or specific library version pinning in the paper text; environment reproducibility relies on the replication package, which cannot be verified here.", 136 "source": "haiku" 137 }, 138 "reproduction_instructions": { 139 "applies": true, 140 "answer": true, 141 "justification": "The Zenodo replication package explicitly contains 'code and results,' and the paper describes preprocessing formats in Listing 1 plus hyperparameter defaults; the replication package is sufficient to enable reproduction.", 142 "source": "haiku" 143 } 144 }, 145 "statistical_methodology": { 146 "confidence_intervals_or_error_bars": { 147 "applies": true, 148 "answer": false, 149 "justification": "All results are presented as raw counts of plausible patches with no confidence intervals, error bars, or statistical uncertainty measures across the 10 generated patches or runs.", 150 "source": "haiku" 151 }, 152 "significance_tests": { 153 "applies": true, 154 "answer": false, 155 "justification": "No statistical significance tests are applied to any of the comparative claims despite multiple model-by-benchmark comparisons; differences are described informally as 
'improvements' or 'deterioration.'", 156 "source": "haiku" 157 }, 158 "effect_sizes_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Percentage improvements with baseline context are reported in RQ3 summary (e.g., 'performance gains of 172%, 225%, 153% on QuixBugs, HumanEval-Java and Defects4J benchmarks' for CodeGen-2B with LoRA).", 162 "source": "haiku" 163 }, 164 "sample_size_justified": { 165 "applies": true, 166 "answer": false, 167 "justification": "The benchmark sizes (40, 163, and 219 programs) are inherited from prior work without any power analysis or justification for their adequacy to detect performance differences.", 168 "source": "haiku" 169 }, 170 "variance_reported": { 171 "applies": true, 172 "answer": false, 173 "justification": "Results are counts of plausible patches with no variance, standard deviation, or spread across multiple experimental runs; stochasticity of LLM inference is acknowledged but not quantified.", 174 "source": "haiku" 175 } 176 }, 177 "evaluation_design": { 178 "baselines_included": { 179 "applies": true, 180 "answer": true, 181 "justification": "All tables compare against base (no fine-tuning) models, and Table V directly compares base vs. FMFT vs. LoRA vs. 
IA3 for the same models.", 182 "source": "haiku" 183 }, 184 "baselines_contemporary": { 185 "applies": true, 186 "answer": true, 187 "justification": "Baselines include contemporary models (DeepSeekCoder v1, StarCoder, CodeLlama-2, all from 2023-2024) selected specifically to improve on Jiang et al.'s prior work.", 188 "source": "haiku" 189 }, 190 "ablation_study": { 191 "applies": true, 192 "answer": true, 193 "justification": "RQ4 systematically varies LoRA rank and scaling factor across 8 values each (1,2,4,8,16,32,64), functioning as a hyperparameter ablation; the four experimental conditions (no FT, full FT, LoRA, IA3) also constitute an ablation.", 194 "source": "haiku" 195 }, 196 "multiple_metrics": { 197 "applies": true, 198 "answer": true, 199 "justification": "The paper uses plausible patch count (test-based), CodeBLEU, exact match, and training/validation loss as complementary metrics.", 200 "source": "haiku" 201 }, 202 "human_evaluation": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human evaluation of patch quality is conducted; the authors explicitly chose plausibility over manual correctness verification to avoid subjectivity issues.", 206 "source": "haiku" 207 }, 208 "held_out_test_set": { 209 "applies": true, 210 "answer": true, 211 "justification": "The three APR benchmarks serve as held-out test sets, completely separate from the CLM fine-tuning dataset; Defects4J-related patches were explicitly removed from CLM to prevent leakage.", 212 "source": "haiku" 213 }, 214 "per_category_breakdown": { 215 "applies": true, 216 "answer": true, 217 "justification": "All result tables provide per-benchmark (QuixBugs, HumanEval-Java, Defects4J) and per-model breakdowns, enabling granular comparison across conditions.", 218 "source": "haiku" 219 }, 220 "failure_cases_discussed": { 221 "applies": true, 222 "answer": true, 223 "justification": "The paper discusses failure modes explicitly: full fine-tuning causes performance degradation 
for stronger models (DeepSeekCoder, StarCoder) due to data distribution mismatch, and CodeT5 models underperform despite PEFT.", 224 "source": "haiku" 225 }, 226 "negative_results_reported": { 227 "applies": true, 228 "answer": true, 229 "justification": "Negative results are a central finding: full fine-tuning moves DeepSeekCoder-1.3b from 33/94/72 to 15/64/80 on QB/HE/D4J (lower on QB and HE, though higher on D4J), and IA3 underperforms LoRA in 21/24 cases.", 230 "source": "haiku" 231 } 232 }, 233 "setup_transparency": { 234 "model_versions_specified": { 235 "applies": true, 236 "answer": true, 237 "justification": "Specific model versions are named (CodeGen-1, CodeT5, StarCoderBase, DeepSeekCoder-Base v1, Bloom, CodeLlama-2) with parameter sizes; the paper notes 'at the time of writing, we used the latest DeepSeekCoder model available, i.e., v1.'", 238 "source": "haiku" 239 }, 240 "prompts_provided": { 241 "applies": true, 242 "answer": true, 243 "justification": "Listing 1 provides the full prompt format for each model (Bloom, CodeGEN, CodeLlama2, CodeT5, DeepSeekCoder, StarCoder) with actual code examples and fill-in-the-middle tokens.", 244 "source": "haiku" 245 }, 246 "hyperparameters_reported": { 247 "applies": true, 248 "answer": true, 249 "justification": "LoRA defaults (r=8, α=16), number of epochs (3), and the 8 values tested for rank and scaling factor are explicitly reported; IA3 uses Hugging Face defaults.", 250 "source": "haiku" 251 }, 252 "scaffolding_described": { 253 "applies": false, 254 "answer": false, 255 "justification": "This is a fine-tuning study with direct model inference, not an agentic scaffolding setup; no agentic scaffolding is used.", 256 "source": "haiku" 257 }, 258 "data_preprocessing_documented": { 259 "applies": true, 260 "answer": true, 261 "justification": "Section III-D describes preprocessing for each model with Listing 1 showing exact input formats; CLM dataset filtering criteria (single-hunk patches, Defects4J deduplication) are also 
documented.", 262 "source": "haiku" 263 } 264 }, 265 "data_integrity": { 266 "raw_data_available": { 267 "applies": true, 268 "answer": true, 269 "justification": "The Zenodo replication package explicitly contains 'code and results,' making raw experimental results available for independent verification.", 270 "source": "haiku" 271 }, 272 "data_collection_described": { 273 "applies": true, 274 "answer": true, 275 "justification": "Section III-A describes all datasets: Defects4J 2.0.1 (835 active bugs), QuixBugs (40 Java programs), HumanEval-Java (163 bugs), and CLM (143,666 instances from 1,083,185 GitHub commits, March 2011-March 2018).", 276 "source": "haiku" 277 }, 278 "recruitment_methods_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "Standard benchmarks are used; no human participant recruitment is involved.", 282 "source": "haiku" 283 }, 284 "data_pipeline_documented": { 285 "applies": true, 286 "answer": true, 287 "justification": "The preprocessing pipeline from raw datasets to model inputs is documented in Section III-D and Listing 1, including CLM filtering steps (single-hunk filtering, Defects4J deduplication via AST comparison).", 288 "source": "haiku" 289 } 290 }, 291 "contamination": { 292 "training_cutoff_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "Training data cutoffs for the evaluated LLMs are not explicitly stated; the paper acknowledges contamination risk but does not report training cutoff dates for any of the six models.", 296 "source": "haiku" 297 }, 298 "train_test_overlap_discussed": { 299 "applies": true, 300 "answer": true, 301 "justification": "Section IV-E explicitly discusses data leakage as 'the biggest reason for concern' and notes that models are pre-trained on GitHub which may include benchmark code; HumanEval-Java was specifically Python→Java converted to reduce overlap.", 302 "source": "haiku" 303 }, 304 "benchmark_contamination_addressed": { 305 "applies": true, 306 
"answer": true, 307 "justification": "HumanEval-Java was created by translating Python to Java specifically to eliminate pre-training contamination; Defects4J-related patches were removed from CLM using AST comparison; contamination is acknowledged as unresolvable for public GitHub-trained models.", 308 "source": "haiku" 309 } 310 }, 311 "human_studies": { 312 "pre_registered": { 313 "applies": false, 314 "answer": false, 315 "justification": "No human participants.", 316 "source": "haiku" 317 }, 318 "irb_or_ethics_approval": { 319 "applies": false, 320 "answer": false, 321 "justification": "No human participants.", 322 "source": "haiku" 323 }, 324 "demographics_reported": { 325 "applies": false, 326 "answer": false, 327 "justification": "No human participants.", 328 "source": "haiku" 329 }, 330 "inclusion_exclusion_criteria": { 331 "applies": false, 332 "answer": false, 333 "justification": "No human participants.", 334 "source": "haiku" 335 }, 336 "randomization_described": { 337 "applies": false, 338 "answer": false, 339 "justification": "No human participants.", 340 "source": "haiku" 341 }, 342 "blinding_described": { 343 "applies": false, 344 "answer": false, 345 "justification": "No human participants.", 346 "source": "haiku" 347 }, 348 "attrition_reported": { 349 "applies": false, 350 "answer": false, 351 "justification": "No human participants.", 352 "source": "haiku" 353 } 354 }, 355 "cost_and_practicality": { 356 "inference_cost_reported": { 357 "applies": true, 358 "answer": false, 359 "justification": "GPU hardware (A100, V100) is mentioned as a constraint for model selection, but no specific inference cost, latency, or wall-clock time for running experiments is reported.", 360 "source": "haiku" 361 }, 362 "compute_budget_stated": { 363 "applies": true, 364 "answer": false, 365 "justification": "Only the hardware type ('1 node with A100, and V100 GPUs') is mentioned; no total GPU-hours, training time, or compute budget is quantified.", 366 "source": 
"haiku" 367 } 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "Full fine-tuning decreases APR performance for stronger pre-trained models (DeepSeekCoder, StarCoder) due to data distribution mismatch and overfitting.", 374 "evidence": "Table III shows DeepSeekCoder-1.3b drops from 33/94/72 (base) to 15/64/80 (FMFT epoch 3) on QB/HE/D4J; StarCoder-1b drops from 22/69/62 to 13/49/71.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "Parameter-efficient fine-tuning (LoRA) improves APR performance over both base models and full fine-tuning for most configurations.", 379 "evidence": "Table V shows CodeGen-2B with LoRA achieves 19/81/98 vs. base 13/44/20 and FMFT 11/36/64; described as '172%, 225%, 153% performance gains.'", 380 "supported": "strong" 381 }, 382 { 383 "claim": "LoRA outperforms IA3 in 21 out of 24 cases, contrary to Liu et al.'s claim that IA3 is generally superior.", 384 "evidence": "Table V comparison across CodeGen, CodeT5, and DeepSeekCoder models shows LoRA higher in 21/24 benchmark-model pairs; discrepancy with Li et al. 
[58] acknowledged.", 385 "supported": "strong" 386 }, 387 { 388 "claim": "LoRA hyperparameters (rank and scaling factor) have negligible impact on APR performance.", 389 "evidence": "Figures 3-6 show CodeBLEU varies only 0.6-0.64 across all tested rank/scaling values (1,2,4,8,16,32,64); exact match shows slightly more variation but remains low throughout.", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "Larger models generally achieve more plausible patches without fine-tuning.", 394 "evidence": "Table I shows CodeGen-2B > CodeGen-350M in 5/6 cases, StarCoder-3b ≥ StarCoder-1b in 5/6 cases; holds in 34/48 cases overall.", 395 "supported": "moderate" 396 }, 397 { 398 "claim": "PEFT methods use less than 1% of original trainable parameters while achieving competitive or superior performance.", 399 "evidence": "Table VI shows LoRA uses 0.06-0.49% and IA3 uses 0.02-0.07% of total parameters; Table V shows these achieve better results than full fine-tuning in many cases.", 400 "supported": "strong" 401 } 402 ], 403 "methodology_tags": [ 404 "benchmark-eval" 405 ], 406 "key_findings": "Full fine-tuning improves weaker baseline models (CodeT5, Bloom) but degrades stronger pre-trained models (DeepSeekCoder, StarCoder) due to data distribution mismatch between the CLM fine-tuning dataset and APR benchmarks. Parameter-efficient fine-tuning with LoRA consistently outperforms full fine-tuning and uses under 1% of trainable parameters, making it the recommended approach for APR. Contrary to prior work claiming IA3 superiority, LoRA outperforms IA3 in 21/24 experimental configurations. 
LoRA hyperparameters (rank, scaling factor) have negligible effect on final performance, supporting use of default values.", 407 "red_flags": [ 408 { 409 "flag": "No statistical significance testing", 410 "detail": "All comparative claims (PEFT better than full FT, LoRA better than IA3) are supported only by raw counts with no significance tests despite multiple pairwise comparisons across 3 benchmarks × 6+ models." 411 }, 412 { 413 "flag": "No variance across runs", 414 "detail": "Results report counts of plausible patches from a single run with 10 patches each; LLM inference stochasticity is acknowledged but not quantified through repeated experiments." 415 }, 416 { 417 "flag": "Small benchmark scale limits power", 418 "detail": "QuixBugs has only 40 programs — small differences (e.g., 2-3 additional plausible patches) are treated as meaningful findings without any power analysis." 419 }, 420 { 421 "flag": "Training cutoffs not stated for evaluated models", 422 "detail": "Contamination concern is acknowledged but training data cutoffs for DeepSeekCoder, StarCoder, CodeLlama-2, etc. are not retrieved or stated, leaving the contamination risk unquantified." 423 }, 424 { 425 "flag": "Plausibility ≠ correctness", 426 "detail": "Plausible patches (passing all provided tests) are used as the primary success metric, but test suites in these benchmarks are incomplete; some plausible patches may be incorrect fixes that happen to satisfy tests." 427 } 428 ], 429 "cited_papers": [ 430 { 431 "title": "Impact of Code Language Models on Automated Program Repair", 432 "relevance": "Direct predecessor work by Jiang et al. that this paper builds upon, sharing the framework, fine-tuning dataset (CLM), and two of the six LLMs (CodeGen, CodeT5)" 433 }, 434 { 435 "title": "Exploring Parameter-Efficient Fine-Tuning of Large Language Model on Automated Program Repair", 436 "relevance": "Closest related work (Li et al.) 
that also investigates PEFT for APR but uses instruction-tuning datasets; numerical comparisons made where models overlap" 437 }, 438 { 439 "title": "Comprehensive Fine-Tuning Large Language Models of Code for Automated Program Repair", 440 "relevance": "Concurrent study (Huang et al.) on fine-tuning LLMs for APR, situates this paper's contribution relative to parallel work" 441 }, 442 { 443 "title": "LoRA: Low-Rank Adaptation of Large Language Models", 444 "relevance": "Foundational technique paper for LoRA PEFT method, one of the two PEFT approaches evaluated" 445 }, 446 { 447 "title": "Few-Shot Parameter-Efficient Fine-Tuning Is Better and Cheaper than in-Context Learning", 448 "relevance": "Foundational paper for IA3 PEFT method, the second PEFT approach evaluated; claimed IA3 outperforms LoRA, a claim this paper partially refutes" 449 }, 450 { 451 "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs", 452 "relevance": "Primary complex benchmark used for APR evaluation throughout the study" 453 }, 454 { 455 "title": "RepairLLaMA: Efficient Representations and Fine-Tuned Adapters for Program Repair", 456 "relevance": "Related PEFT-for-APR work that fine-tuned CodeLlama-7b with LoRA, cited for context on PEFT in APR" 457 }, 458 { 459 "title": "A Syntax-Guided Edit Decoder for Neural Program Repair", 460 "relevance": "Source of the CLM fine-tuning dataset (143,666 bug-fix pairs from GitHub) used in all fine-tuning experiments" 461 } 462 ], 463 "engagement_factors": { 464 "practical_relevance": { 465 "score": 2, 466 "justification": "Provides directly actionable guidance: use LoRA over full fine-tuning and IA3 for APR, with specific default hyperparameters validated across 6 models and 3 benchmarks." 
467 }, 468 "surprise_contrarian": { 469 "score": 2, 470 "justification": "Counter-intuitive finding that full fine-tuning hurts better-performing models, and that LoRA beats IA3 in 21/24 cases contrary to IA3's claimed superiority in the original IA3 paper." 471 }, 472 "fear_safety": { 473 "score": 0, 474 "justification": "No AI safety or risk concerns — this is a technical performance comparison in a software engineering research context." 475 }, 476 "drama_conflict": { 477 "score": 1, 478 "justification": "Mild contradiction with Li et al.'s IA3 vs. LoRA findings creates some academic tension, but the discrepancy is discussed constructively rather than as a controversy." 479 }, 480 "demo_ability": { 481 "score": 1, 482 "justification": "Replication package exists at Zenodo with code, but running experiments requires substantial GPU resources (A100/V100) making casual reproduction difficult." 483 }, 484 "brand_recognition": { 485 "score": 0, 486 "justification": "Simula Research Laboratory and University of Bern are respected institutions but not high-profile AI labs; no famous products or models evaluated are from these institutions." 
487 } 488 }, 489 "hn_data": { 490 "threads": [ 491 { 492 "hn_id": "41613513", 493 "title": "AI Companions Reduce Loneliness", 494 "points": 51, 495 "comments": 81, 496 "url": "https://news.ycombinator.com/item?id=41613513" 497 }, 498 { 499 "hn_id": "43246743", 500 "title": "Order Doesn’t Matter, But Reasoning Does", 501 "points": 14, 502 "comments": 16, 503 "url": "https://news.ycombinator.com/item?id=43246743" 504 }, 505 { 506 "hn_id": "41116325", 507 "title": "Substantial Risk of Atlantic Circulation Tipping Under Moderate Climate Change", 508 "points": 5, 509 "comments": 0, 510 "url": "https://news.ycombinator.com/item?id=41116325" 511 }, 512 { 513 "hn_id": "44542845", 514 "title": "Simulated impact on LSST data of Starlink v1.5 and V2 satellites", 515 "points": 2, 516 "comments": 0, 517 "url": "https://news.ycombinator.com/item?id=44542845" 518 }, 519 { 520 "hn_id": "44438536", 521 "title": "CoVE: Compressed Vocabulary Expansion Makes Better LLM-Based Recommender Systems", 522 "points": 2, 523 "comments": 0, 524 "url": "https://news.ycombinator.com/item?id=44438536" 525 }, 526 { 527 "hn_id": "44240945", 528 "title": "Is (Selective) Round-to-Nearest Quantization All You Need?", 529 "points": 2, 530 "comments": 0, 531 "url": "https://news.ycombinator.com/item?id=44240945" 532 }, 533 { 534 "hn_id": "43265110", 535 "title": "Training LLMs with Order-Centric Augmentation", 536 "points": 2, 537 "comments": 0, 538 "url": "https://news.ycombinator.com/item?id=43265110" 539 }, 540 { 541 "hn_id": "27997501", 542 "title": "So you want to analyze Scheme programs with Datalog?", 543 "points": 2, 544 "comments": 0, 545 "url": "https://news.ycombinator.com/item?id=27997501" 546 }, 547 { 548 "hn_id": "42116437", 549 "title": "A Survey of Explainable AI in Financial Forecasting", 550 "points": 1, 551 "comments": 1, 552 "url": "https://news.ycombinator.com/item?id=42116437" 553 }, 554 { 555 "hn_id": "44465492", 556 "title": "Few-Shot Learning for Industrial Time Series: 
Screw-Fastening Process Monitoring", 557 "points": 1, 558 "comments": 0, 559 "url": "https://news.ycombinator.com/item?id=44465492" 560 } 561 ], 562 "top_points": 51, 563 "total_points": 82, 564 "total_comments": 98 565 } 566 }