scan-v5.json (30171B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "A Deep Dive into Retrieval-Augmented Generation for Code Completion: Experience on WeChat", 6 "authors": [ 7 "Zezhou Yang", 8 "Ting Peng", 9 "Cuiyun Gao", 10 "Chaozheng Wang", 11 "Hailiang Huang" 12 ], 13 "year": 2025, 14 "venue": "IEEE International Conference on Software Maintenance and Evolution", 15 "arxiv_id": "2507.18515", 16 "doi": "10.1109/ICSME64153.2025.00062" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "All four main abstract claims (RAG effectiveness in closed-source repos, similarity-based superiority, BM25/GTE-Qwen best individually, hybrid optimal) are quantitatively supported by Tables I–III across 26 LLMs.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Causal claims about RAG improving code completion are supported by direct base-model vs RAG comparisons; ablation-style comparisons systematically isolate retrieval technique contributions across Tables I–III.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "Conclusions recommend RAG configurations for 'practitioners in proprietary development environments' broadly, but the study is limited to one company's C++ codebase; the threats-to-validity section acknowledges but does not adequately bound this generalization.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper does not consider alternative explanations such as whether the manually annotated benchmark selection favors similarity-based retrieval, or whether C++ specifically benefits differently from RAG than other languages.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 
"justification": "The paper explicitly acknowledges in threats to validity that CodeBLEU and Edit Similarity 'might not fully capture the semantic correctness and functionality of generated code' and supplements with a developer survey to address this gap.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "Section V.C 'Threats to Validity' covers internal, external, and construct validity as a dedicated subsection — well beyond a passing sentence in the conclusion.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": true, 61 "justification": "Threats are specific: internal validity identifies parameter sensitivity; external validity names the single-organization codebase limitation and cites 1,669 diverse projects as partial mitigation; construct validity identifies the metric-quality gap and explains how the developer survey addresses it.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": false, 67 "justification": "The paper does not explicitly state what the results do NOT show; the threats section describes limitations but never draws explicit lines around what conclusions cannot be drawn from a single C++ enterprise codebase.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": true, 75 "justification": "Funding is disclosed in a footnote: National Key R&D Program of China (2022YFB3103900), NSFC (62472126), Natural Science Foundation of Guangdong Province, and Shenzhen-Hong Kong and Shenzhen Basic Research projects.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations are clearly stated on the title page: four authors at Tencent and two at The Chinese University of Hong Kong.", 82 "source": "haiku" 83 }, 84 
"funder_independent_of_outcome": { 85 "applies": true, 86 "answer": true, 87 "justification": "All disclosed funders are government/academic bodies (NSFC, Guangdong provincial government, Shenzhen municipal) with no financial stake in whether RAG works well for WeChat's code completion.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests statement or declaration of financial interests is provided; Tencent employees are evaluating RAG methods on Tencent's own production codebase — an implicit institutional conflict that is not formally declared.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "Key terms are formally defined: 'identifier-based RAG' and 'similarity-based RAG' are defined with equations in Section II; each retrieval technique (BM25, CodeBERT, UniXcoder, CoCoSoDa, GTE-Qwen) is described with technical detail and citations.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Four explicit contributions are listed: systematic study of RAG for closed-source code completion, a fine-grained preprocessing algorithm, finding of complementary retrieval techniques, and developer survey validation.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "The paper engages with prior RAG code completion work (REPOFUSE, ReACC, GraphCoder, FT2Ra) throughout the text and in Section VI, explicitly distinguishing its closed-source focus from prior open-source benchmark studies.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": false, 124 "justification": "No source code is released; the preprocessing algorithm and 
retrieval system are described but exist as proprietary Tencent infrastructure with no repository link provided.", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": false, 130 "justification": "Both the 100-example evaluation benchmark and the 1,669-repository retrieval corpus are proprietary WeChat internal data that cannot be released publicly.", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": true, 136 "justification": "Hardware (8×A100 40GB or 8/16×H20 96GB by model size), framework (vLLM in Docker), precision (FP16/FP8), temperature (0), retrieval top-k (4), and 2k-token context limit are all specified.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "No step-by-step reproduction instructions are provided; reproduction is impossible without access to the proprietary benchmark and retrieval corpus, and the paper provides no public artifact to start from.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "All results are reported as point estimates (CB/ES scores in tables) with no confidence intervals or error bars, despite comparing dozens of conditions across 26 models on a 100-example benchmark.", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No statistical significance tests are used for any comparative claims despite the paper asserting superiority of specific retrieval methods — all claims of 'better' or 'superior' rely on raw score differences.", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Relative percentage improvements are consistently reported with baseline context (e.g., '71.60% and 27.59% relative increase' for 
Qwen2.5-Coder-14B-Instruct with GTE-Qwen RAG), giving interpretable effect sizes.", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "The benchmark size of 100 examples is not statistically justified; the paper explains the annotation process but provides no power analysis or reasoning for why 100 examples provides adequate statistical sensitivity.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "All results are single-run point estimates; no variance, standard deviation, or spread across runs is reported, even though greedy decoding (temperature=0) reduces but does not eliminate run-to-run variance.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "Base models without any RAG augmentation are included as baselines in Table I for all 26 LLMs, with all RAG variants compared directly against the base model.", 183 "source": "haiku" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "Baselines and comparisons include state-of-the-art late-2024 models: DeepSeek-V3 (671B), Qwen2.5-Coder-32B-Instruct, and Llama-3.3-70B-Instruct.", 189 "source": "haiku" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": true, 194 "justification": "The study systematically ablates similarity-based RAG components by comparing five individual retrieval techniques and all pairwise combinations of lexical+semantic techniques in Table III across 26 LLMs.", 195 "source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "Two complementary metrics are used: CodeBLEU (structural/semantic code similarity) and Edit Similarity (token-level edit distance normalized by length).", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 
"applies": true, 205 "answer": true, 206 "justification": "A developer survey with 3 internal developers evaluated 52 randomly selected examples across 3 LLMs using a 1–5 quality scale, with error type categorization supplementing automated metrics.", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": true, 212 "justification": "The 100-example evaluation benchmark is constructed separately from the 1,669-repository retrieval corpus, functioning as a proper held-out test set.", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Results are broken down by model size category (0.5B through 200B+) across all tables; the benchmark also covers 7 domain categories with easy/hard difficulty splits shown in Figure 1.", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Developer survey identifies three error categories with frequencies: Missing/Incorrect Logic (~52%), Extra Logic (~30%), Nonexistent Function Call (~17%), analyzed across three LLMs.", 225 "source": "haiku" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "The paper reports that hybrid retrieval shows 'limited or even negative impact' for models below 7B, and Table I shows CodeLlama-70B performing worse than its base model with most RAG configurations.", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": true, 238 "justification": "Exact versioned model names are specified (e.g., Qwen2.5-Coder-14B-Instruct, GTE-Qwen2-1.5B-instruct, DeepSeek-V3-671B/37B) obtained from official Hugging Face repositories.", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": false, 244 "justification": "The paper describes four prompt templates for identifier-based RAG and 
mentions prompts in Chinese wrapped in C++ comment format, but no actual prompt text is provided.", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": true, 250 "justification": "Temperature (0), number of retrieved results (4), maximum context length (2k tokens), BM25 parameters k and b (defined in equations 10–11), and model precision (FP16/FP8) are all specified.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": true, 255 "answer": true, 256 "justification": "Identifier-based RAG scaffolding (index creation, LLM-based identifier extraction, four distinct prompt templates per knowledge type) is described with formal equations; similarity-based RAG pipeline is also formalized.", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "Algorithm 1 provides detailed pseudocode for the preprocessing pipeline covering C++ source/header files, protobuf files, macro transformations, and deduplication/formatting steps.", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": false, 270 "justification": "Neither the 100-example evaluation benchmark nor the 1,669-repository retrieval corpus is publicly available; all data is proprietary WeChat/Tencent internal material.", 271 "source": "haiku" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "Benchmark construction is described in detail (3 senior developers with 5+ years experience, 3 weeks, 4 annotation rules, 7 domains, cross-validation); retrieval corpus collection (1,669 internal projects, deduplication, standardization) is also described.", 277 "source": "haiku" 278 }, 279 "recruitment_methods_described": { 280 "applies": true, 281 "answer": false, 282 "justification": "Developer survey participants are described only as 'three developers from our 
group (excluding the authors)' with no formal recruitment criteria, sampling rationale, or qualification criteria.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "Algorithm 1 documents the full data pipeline from raw C++ and protobuf files through extraction, macro transformation, formatting, and corpus construction; retrieval and inference pipelines are also formalized.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "Training data cutoffs for the 26 evaluated LLMs are not stated anywhere in the paper.", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": true, 301 "answer": false, 302 "justification": "While contamination is implicitly reduced by using proprietary internal code, the paper does not explicitly discuss train/test overlap or argue why the benchmark cannot appear in any model's training data.", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": true, 307 "answer": false, 308 "justification": "The paper does not address whether public LLMs may have seen portions of WeChat's codebase through any public Tencent repositories or data leaks during pretraining.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": true, 315 "answer": false, 316 "justification": "The developer survey is not pre-registered.", 317 "source": "haiku" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": true, 321 "answer": false, 322 "justification": "No IRB or ethics approval is mentioned for the developer survey despite it involving human participant evaluations published in an academic venue.", 323 "source": "haiku" 324 }, 325 "demographics_reported": { 326 "applies": true, 327 "answer": false, 328 "justification": "No demographic information is reported for the 3 survey participants 
beyond being from 'our group' and not among the paper's authors.", 329 "source": "haiku" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": true, 333 "answer": false, 334 "justification": "The only stated criterion is 'excluding the authors'; no formal inclusion/exclusion criteria (experience level, role, familiarity with the codebase) are described.", 335 "source": "haiku" 336 }, 337 "randomization_described": { 338 "applies": true, 339 "answer": true, 340 "justification": "The paper states 'a random selection of 52 examples' was used for the developer survey evaluation.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": true, 345 "answer": false, 346 "justification": "No blinding procedure is described; developers evaluated completions with knowledge of the retrieval technique source, introducing potential bias.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "NA — the developer survey involved 3 fixed internal participants completing a predefined evaluation set; attrition was not applicable.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": false, 360 "justification": "No inference latency or cost figures are reported; hardware is described but no timing measurements or cost estimates are provided for any of the 26 models.", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": false, 366 "justification": "Hardware configurations are described (8 A100s, 16 H20s) but total GPU-hours, wall-clock time, or financial cost of running experiments across 26 LLMs and 9 retrieval configurations is not stated.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "Both identifier-based and similarity-based RAG consistently improve code completion over base models across all 26 LLMs tested.", 375 "evidence": "Table 
I shows improvements highlighted across the majority of model/method combinations; e.g., Llama-3.1-8B-Instruct improves from CB/ES 34.02/46.07 to 53.47/55.40 with GTE-Qwen RAG.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Similarity-based RAG substantially outperforms identifier-based RAG for code completion in closed-source repositories.", 380 "evidence": "Table I shows consistent large margins: Qwen2.5-Coder-1.5B reaches max CB/ES 37.28/50.77 with identifier-based vs 46.69/56.04 with similarity-based; DeepSeek-V3 reaches 42.24/61.75 vs 60.28/73.11.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "BM25 and GTE-Qwen achieve superior performance among retrieval techniques, with GTE-Qwen uniquely performing better with incomplete code context queries.", 385 "evidence": "Table II shows BM25 and GTE-Qwen consistently outperform CodeBERT, UniXcoder, and CoCoSoDa; GTE-Qwen is the only technique where incomplete queries outperform complete queries for large models.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Lexical and semantic retrieval capture fundamentally different aspects of code similarity, with minimal overlap in retrieved results.", 390 "evidence": "Out of 100 test examples, there are 76, 74, and 64 completely distinct retrieved samples comparing BM25 with UniXcoder, CoCoSoDa, and GTE-Qwen respectively.", 391 "supported": "strong" 392 }, 393 { 394 "claim": "Combining BM25 and GTE-Qwen achieves optimal code completion performance, especially for larger models (7B+), but hurts smaller models.", 395 "evidence": "Table III shows BM25+GTE-Qwen reaches CB/ES 63.62/75.26 for DeepSeek-V3 (vs 60.28/73.11 alone); paper explicitly notes 'limited or even negative impact' for sub-7B models.", 396 "supported": "moderate" 397 }, 398 { 399 "claim": "Developer survey confirms BM25+GTE-Qwen combined retrieval produces higher quality completions than either technique alone.", 400 "evidence": "3-developer survey on 52 examples shows combined technique 
achieves higher average scores and wins in about half of test cases; but n=3 evaluators is far too small for reliable inference.", 401 "supported": "weak" 402 } 403 ], 404 "methodology_tags": [ 405 "benchmark-eval", 406 "case-study" 407 ], 408 "key_findings": "RAG methods consistently improve code completion in WeChat's large-scale proprietary C++ codebase across all 26 tested LLMs (0.5B–671B parameters), with similarity-based RAG substantially outperforming identifier-based RAG. Among retrieval techniques, BM25 and GTE-Qwen individually achieve best performance, with GTE-Qwen's bidirectional architecture uniquely suited to incomplete code queries (the code completion scenario). The combination of BM25+GTE-Qwen achieves optimal results for models 7B and larger by exploiting complementary retrieval distributions (64–76% non-overlapping results), while smaller models do not reliably benefit from hybrid retrieval.", 409 "red_flags": [ 410 { 411 "flag": "Tiny benchmark (n=100)", 412 "detail": "Only 100 examples from a single company's codebase provide insufficient statistical power to support claims of superiority across 26 LLMs and 9 retrieval configurations; no sample size justification or power analysis is provided." 413 }, 414 { 415 "flag": "Minimal developer survey (n=3)", 416 "detail": "Only 3 internal developers participated in the human evaluation study; results from such a small N cannot reliably support conclusions about developer preference across retrieval techniques." 417 }, 418 { 419 "flag": "No statistical significance testing", 420 "detail": "All comparative claims (X outperforms Y, combined is better) are made without any statistical tests despite dozens of pairwise comparisons across 26 models on a 100-example benchmark." 
421 }, 422 { 423 "flag": "Single run, no variance reported", 424 "detail": "All results are single-run point estimates; no standard deviation or error bars are reported, making it impossible to assess whether observed differences exceed noise." 425 }, 426 { 427 "flag": "Proprietary, non-reproducible benchmark", 428 "detail": "The evaluation benchmark and 1,669-project retrieval corpus are proprietary WeChat internal data; independent reproduction or verification of any result is structurally impossible." 429 }, 430 { 431 "flag": "C++-only study", 432 "detail": "All experiments use C++ code exclusively; conclusions recommending RAG configurations for 'proprietary environments' broadly are unsupported since other languages may respond differently to lexical vs semantic retrieval." 433 }, 434 { 435 "flag": "No inference latency or cost reported", 436 "detail": "The paper evaluates RAG accuracy but does not report retrieval latency, inference overhead, or compute cost — critical factors for deployment decisions in production code completion systems." 
437 } 438 ], 439 "cited_papers": [ 440 { 441 "title": "GraphCoder: Enhancing Repository-Level Code Completion via Code Context Graph-based Retrieval and Language Model", 442 "relevance": "Repository-level RAG code completion using graph-based retrieval, direct structural comparator to this study" 443 }, 444 { 445 "title": "REPOFUSE: Repository-Level Code Completion with Fused Dual Context", 446 "relevance": "Repository-level code completion combining dependency and similarity context, closely related prior approach" 447 }, 448 { 449 "title": "Dataflow-Guided Retrieval Augmentation for Repository-Level Code Completion", 450 "relevance": "Alternative RAG approach using data flow graphs for code completion context retrieval" 451 }, 452 { 453 "title": "ReACC: A Retrieval-Augmented Code Completion Framework", 454 "relevance": "Foundational RAG framework for code completion on public benchmarks, motivates this closed-source extension" 455 }, 456 { 457 "title": "FT2Ra: A Fine-Tuning-Inspired Approach to Retrieval-Augmented Code Completion", 458 "relevance": "Related RAG code completion approach evaluated on public benchmarks" 459 }, 460 { 461 "title": "RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems", 462 "relevance": "Standard benchmark methodology for repository-level code completion this study extends to closed-source settings" 463 }, 464 { 465 "title": "Studying LLM Performance on Closed- and Open-source Data", 466 "relevance": "Directly motivates the investigation of performance gaps between open-source and closed-source codebases" 467 }, 468 { 469 "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages", 470 "relevance": "Semantic retrieval model evaluated as one of the four semantic techniques (alongside lexical BM25) within similarity-based RAG" 471 }, 472 { 473 "title": "CodeBLEU: a Method for Automatic Evaluation of Code Synthesis", 474 "relevance": "Primary evaluation metric used throughout the paper" 475 }, 476 { 477 "title": "STALL+: Boosting LLM-based 
Repository-level Code Completion with Static Analysis", 478 "relevance": "Alternative approach combining static analysis with LLM-based code completion, related line of work" 479 } 480 ], 481 "engagement_factors": { 482 "practical_relevance": { 483 "score": 3, 484 "justification": "Direct industrial deployment study at WeChat scale with actionable configuration guidance (BM25+GTE-Qwen hybrid for 7B+ models) for practitioners building closed-source code completion systems." 485 }, 486 "surprise_contrarian": { 487 "score": 1, 488 "justification": "Main findings confirm expected directions (RAG helps, hybrid retrieval is better); the finding that GTE-Qwen uniquely outperforms with incomplete queries is a mildly interesting exception to the general pattern." 489 }, 490 "fear_safety": { 491 "score": 0, 492 "justification": "No AI risk, safety, or security concerns are raised; this is a pure productivity tool evaluation." 493 }, 494 "drama_conflict": { 495 "score": 0, 496 "justification": "No controversial claims, disputes with prior work, or conflict angles present." 497 }, 498 "demo_ability": { 499 "score": 1, 500 "justification": "Methods use open-source models and public retrieval libraries (BM25S, Qdrant, vLLM), making the approach replicable in principle, but the proprietary benchmark and corpus prevent direct reproduction." 501 }, 502 "brand_recognition": { 503 "score": 2, 504 "justification": "WeChat/Tencent is a globally recognized platform (1B+ MAU cited); paper also evaluates prominent recent models including DeepSeek-V3 and Qwen2.5 series." 
505 } 506 }, 507 "hn_data": { 508 "threads": [ 509 { 510 "hn_id": "44769170", 511 "title": "The unreasonable likelihood of being: origin of life, terraforming, and AI", 512 "points": 16, 513 "comments": 9, 514 "url": "https://news.ycombinator.com/item?id=44769170" 515 }, 516 { 517 "hn_id": "44198829", 518 "title": "Algebra Unveils Deep Learning – An Invitation to Neuroalgebraic Geometry", 519 "points": 13, 520 "comments": 0, 521 "url": "https://news.ycombinator.com/item?id=44198829" 522 }, 523 { 524 "hn_id": "42886971", 525 "title": "Thoughts Are All over the Place: On the Underthinking of O1-Like LLMs", 526 "points": 4, 527 "comments": 0, 528 "url": "https://news.ycombinator.com/item?id=42886971" 529 }, 530 { 531 "hn_id": "42884879", 532 "title": "Streaming DiLoCo: Towards a Distributed Free Lunch (Google DeepMind)", 533 "points": 3, 534 "comments": 0, 535 "url": "https://news.ycombinator.com/item?id=42884879" 536 }, 537 { 538 "hn_id": "45056536", 539 "title": "Galois Theory by Calculator", 540 "points": 2, 541 "comments": 1, 542 "url": "https://news.ycombinator.com/item?id=45056536" 543 }, 544 { 545 "hn_id": "45801598", 546 "title": "Streaming DiLoCo: Towards a Distributed Free Lunch", 547 "points": 2, 548 "comments": 0, 549 "url": "https://news.ycombinator.com/item?id=45801598" 550 }, 551 { 552 "hn_id": "44081257", 553 "title": "An Invitation to Neuroalgebraic Geometry", 554 "points": 2, 555 "comments": 0, 556 "url": "https://news.ycombinator.com/item?id=44081257" 557 }, 558 { 559 "hn_id": "43321959", 560 "title": "Swallowing the Poison Pills: Insights from Vulnerability Disparity Among LLMs", 561 "points": 1, 562 "comments": 0, 563 "url": "https://news.ycombinator.com/item?id=43321959" 564 } 565 ], 566 "top_points": 16, 567 "total_points": 43, 568 "total_comments": 10 569 } 570 }