scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (30171B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "A Deep Dive into Retrieval-Augmented Generation for Code Completion: Experience on WeChat",
      6     "authors": [
      7       "Zezhou Yang",
      8       "Ting Peng",
      9       "Cuiyun Gao",
     10       "Chaozheng Wang",
     11       "Hailiang Huang"
     12     ],
     13     "year": 2025,
     14     "venue": "IEEE International Conference on Software Maintenance and Evolution",
     15     "arxiv_id": "2507.18515",
     16     "doi": "10.1109/ICSME64153.2025.00062"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All four main abstract claims (RAG effectiveness in closed-source repos, similarity-based superiority, BM25/GTE-Qwen best individually, hybrid optimal) are quantitatively supported by Tables I–III across 26 LLMs.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims about RAG improving code completion are supported by direct base-model vs RAG comparisons; ablation-style comparisons systematically isolate retrieval technique contributions across Tables I–III.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Conclusions recommend RAG configurations for 'practitioners in proprietary development environments' broadly, but the study is limited to one company's C++ codebase; the threats-to-validity section acknowledges but does not adequately bound this generalization.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not consider alternative explanations such as whether the manually annotated benchmark selection favors similarity-based retrieval, or whether C++ specifically benefits differently from RAG than other languages.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly acknowledges in threats to validity that CodeBLEU and Edit Similarity 'might not fully capture the semantic correctness and functionality of generated code' and supplements with a developer survey to address this gap.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section V.C 'Threats to Validity' covers internal, external, and construct validity as a dedicated subsection — well beyond a passing sentence in the conclusion.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Threats are specific: internal validity identifies parameter sensitivity; external validity names the single-organization codebase limitation and cites 1,669 diverse projects as partial mitigation; construct validity identifies the metric-quality gap and explains how the developer survey addresses it.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state what the results do NOT show; the threats section describes limitations but never draws explicit lines around what conclusions cannot be drawn from a single C++ enterprise codebase.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Funding is disclosed in a footnote: National Key R&D Program of China (2022YFB3103900), NSFC (62472126), Natural Science Foundation of Guangdong Province, and Shenzhen-Hong Kong and Shenzhen Basic Research projects.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly stated on the title page: four authors at Tencent and two at The Chinese University of Hong Kong.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "All disclosed funders are government/academic bodies (NSFC, Guangdong provincial government, Shenzhen municipal) with no financial stake in whether RAG works well for WeChat's code completion.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or declaration of financial interests is provided; Tencent employees are evaluating RAG methods on Tencent's own production codebase — an implicit institutional conflict that is not formally declared.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are formally defined: 'identifier-based RAG' and 'similarity-based RAG' are defined with equations in Section II; each retrieval technique (BM25, CodeBERT, UniXcoder, CoCoSoDa, GTE-Qwen) is described with technical detail and citations.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Four explicit contributions are listed: systematic study of RAG for closed-source code completion, a fine-grained preprocessing algorithm, finding of complementary retrieval techniques, and developer survey validation.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper engages with prior RAG code completion work (REPOFUSE, ReACC, GraphCoder, FT2Ra) throughout the text and in Section VI, explicitly distinguishing its closed-source focus from prior open-source benchmark studies.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No source code is released; the preprocessing algorithm and retrieval system are described but exist as proprietary Tencent infrastructure with no repository link provided.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "Both the 100-example evaluation benchmark and the 1,669-repository retrieval corpus are proprietary WeChat internal data that cannot be released publicly.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "Hardware (8×A100 40GB or 8/16×H20 96GB by model size), framework (vLLM in Docker), precision (FP16/FP8), temperature (0), retrieval top-k (4), and 2k-token context limit are all specified.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are provided; reproduction is impossible without access to the proprietary benchmark and retrieval corpus, and the paper provides no public artifact to start from.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results are reported as point estimates (CB/ES scores in tables) with no confidence intervals or error bars, despite comparing dozens of conditions across 26 models on a 100-example benchmark.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are used for any comparative claims despite the paper asserting superiority of specific retrieval methods — all claims of 'better' or 'superior' rely on raw score differences.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Relative percentage improvements are consistently reported with baseline context (e.g., '71.60% and 27.59% relative increase' for Qwen2.5-Coder-14B-Instruct with GTE-Qwen RAG), giving interpretable effect sizes.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The benchmark size of 100 examples is not statistically justified; the paper explains the annotation process but provides no power analysis or reasoning for why 100 examples provides adequate statistical sensitivity.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174         "justification": "All results are single-run point estimates; no variance, standard deviation, or spread across runs is reported, even though greedy decoding (temperature=0) reduces but does not necessarily eliminate run-to-run variance.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Base models without any RAG augmentation are included as baselines in Table I for all 26 LLMs, with all RAG variants compared directly against the base model.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Baselines and comparisons include state-of-the-art late-2024 models: DeepSeek-V3 (671B), Qwen2.5-Coder-32B-Instruct, and Llama-3.3-70B-Instruct.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "The study systematically ablates similarity-based RAG components by comparing five individual retrieval techniques and all pairwise combinations of lexical+semantic techniques in Table III across 26 LLMs.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Two complementary metrics are used: CodeBLEU (structural/semantic code similarity) and Edit Similarity (token-level edit distance normalized by length).",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "A developer survey with 3 internal developers evaluated 52 randomly selected examples across 3 LLMs using a 1–5 quality scale, with error type categorization supplementing automated metrics.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "The 100-example evaluation benchmark is constructed separately from the 1,669-repository retrieval corpus, functioning as a proper held-out test set.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by model size category (0.5B through 200B+) across all tables; the benchmark also covers 7 domain categories with easy/hard difficulty splits shown in Figure 1.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Developer survey identifies three error categories with frequencies: Missing/Incorrect Logic (~52%), Extra Logic (~30%), Nonexistent Function Call (~17%), analyzed across three LLMs.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The paper reports that hybrid retrieval shows 'limited or even negative impact' for models below 7B, and Table I shows CodeLlama-70B performing worse than its base model with most RAG configurations.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Exact versioned model names are specified (e.g., Qwen2.5-Coder-14B-Instruct, GTE-Qwen2-1.5B-instruct, DeepSeek-V3-671B/37B) obtained from official Hugging Face repositories.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "The paper describes four prompt templates for identifier-based RAG and mentions prompts in Chinese wrapped in C++ comment format, but no actual prompt text is provided.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Temperature (0), number of retrieved results (4), maximum context length (2k tokens), BM25 parameters k and b (defined in equations 10–11), and model precision (FP16/FP8) are all specified.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Identifier-based RAG scaffolding (index creation, LLM-based identifier extraction, four distinct prompt templates per knowledge type) is described with formal equations; similarity-based RAG pipeline is also formalized.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Algorithm 1 provides detailed pseudocode for the preprocessing pipeline covering C++ source/header files, protobuf files, macro transformations, and deduplication/formatting steps.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Neither the 100-example evaluation benchmark nor the 1,669-repository retrieval corpus is publicly available; all data is proprietary WeChat/Tencent internal material.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Benchmark construction is described in detail (3 senior developers with 5+ years experience, 3 weeks, 4 annotation rules, 7 domains, cross-validation); retrieval corpus collection (1,669 internal projects, deduplication, standardization) is also described.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": false,
    282           "justification": "Developer survey participants are described only as 'three developers from our group (excluding the authors)' with no formal recruitment criteria, sampling rationale, or qualification criteria.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Algorithm 1 documents the full data pipeline from raw C++ and protobuf files through extraction, macro transformation, formatting, and corpus construction; retrieval and inference pipelines are also formalized.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "Training data cutoffs for the 26 evaluated LLMs are not stated anywhere in the paper.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "While contamination is implicitly reduced by using proprietary internal code, the paper does not explicitly discuss train/test overlap or argue why the benchmark cannot appear in any model's training data.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "The paper does not address whether public LLMs may have seen portions of WeChat's codebase through any public Tencent repositories or data leaks during pretraining.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": true,
    315           "answer": false,
    316           "justification": "The developer survey is not pre-registered.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": true,
    321           "answer": false,
    322           "justification": "No IRB or ethics approval is mentioned for the developer survey despite it involving human participant evaluations published in an academic venue.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": true,
    327           "answer": false,
    328           "justification": "No demographic information is reported for the 3 survey participants beyond being from 'our group' and not among the paper's authors.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": true,
    333           "answer": false,
    334           "justification": "The only stated criterion is 'excluding the authors'; no formal inclusion/exclusion criteria (experience level, role, familiarity with the codebase) are described.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": true,
    339           "answer": true,
    340           "justification": "The paper states 'a random selection of 52 examples' was used for the developer survey evaluation.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": true,
    345           "answer": false,
    346           "justification": "No blinding procedure is described; developers evaluated completions with knowledge of the retrieval technique source, introducing potential bias.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "NA — the developer survey involved 3 fixed internal participants completing a predefined evaluation set; attrition was not applicable.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference latency or cost figures are reported; hardware is described but no timing measurements or cost estimates are provided for any of the 26 models.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Hardware configurations are described (8 A100s, 16 H20s) but total GPU-hours, wall-clock time, or financial cost of running experiments across 26 LLMs and 9 retrieval configurations is not stated.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Both identifier-based and similarity-based RAG consistently improve code completion over base models across all 26 LLMs tested.",
    375       "evidence": "Table I shows improvements highlighted across the majority of model/method combinations; e.g., Llama-3.1-8B-Instruct improves from CB/ES 34.02/46.07 to 53.47/55.40 with GTE-Qwen RAG.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Similarity-based RAG substantially outperforms identifier-based RAG for code completion in closed-source repositories.",
    380       "evidence": "Table I shows consistent large margins: Qwen2.5-Coder-1.5B reaches max CB/ES 37.28/50.77 with identifier-based vs 46.69/56.04 with similarity-based; DeepSeek-V3 reaches 42.24/61.75 vs 60.28/73.11.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "BM25 and GTE-Qwen achieve superior performance among retrieval techniques, with GTE-Qwen uniquely performing better with incomplete code context queries.",
    385       "evidence": "Table II shows BM25 and GTE-Qwen consistently outperform CodeBERT, UniXcoder, and CoCoSoDa; GTE-Qwen is the only technique where incomplete queries outperform complete queries for large models.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Lexical and semantic retrieval capture fundamentally different aspects of code similarity, with minimal overlap in retrieved results.",
    390       "evidence": "Out of 100 test examples, there are 76, 74, and 64 completely distinct retrieved samples comparing BM25 with UniXcoder, CoCoSoDa, and GTE-Qwen respectively.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Combining BM25 and GTE-Qwen achieves optimal code completion performance, especially for larger models (7B+), but hurts smaller models.",
    395       "evidence": "Table III shows BM25+GTE-Qwen reaches CB/ES 63.62/75.26 for DeepSeek-V3 (vs 60.28/73.11 alone); paper explicitly notes 'limited or even negative impact' for sub-7B models.",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "Developer survey confirms BM25+GTE-Qwen combined retrieval produces higher quality completions than either technique alone.",
    400       "evidence": "3-developer survey on 52 examples shows combined technique achieves higher average scores and wins in about half of test cases; but n=3 evaluators is far too small for reliable inference.",
    401       "supported": "weak"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "case-study"
    407   ],
    408   "key_findings": "RAG methods consistently improve code completion in WeChat's large-scale proprietary C++ codebase across all 26 tested LLMs (0.5B–671B parameters), with similarity-based RAG substantially outperforming identifier-based RAG. Among retrieval techniques, BM25 and GTE-Qwen individually achieve the best performance, with GTE-Qwen's bidirectional architecture uniquely suited to incomplete code queries (the code completion scenario). The combination of BM25+GTE-Qwen achieves optimal results for models 7B and larger by exploiting complementary retrieval distributions (BM25 and GTE-Qwen return completely distinct results for 64 of the 100 test examples; 64–76 of 100 across BM25 paired with the semantic retrievers), while smaller models do not reliably benefit from hybrid retrieval.",
    409   "red_flags": [
    410     {
    411       "flag": "Tiny benchmark (n=100)",
    412       "detail": "Only 100 examples from a single company's codebase provide insufficient statistical power to support claims of superiority across 26 LLMs and 9 retrieval configurations; no sample size justification or power analysis is provided."
    413     },
    414     {
    415       "flag": "Minimal developer survey (n=3)",
    416       "detail": "Only 3 internal developers participated in the human evaluation study; results from such a small N cannot reliably support conclusions about developer preference across retrieval techniques."
    417     },
    418     {
    419       "flag": "No statistical significance testing",
    420       "detail": "All comparative claims (X outperforms Y, combined is better) are made without any statistical tests despite dozens of pairwise comparisons across 26 models on a 100-example benchmark."
    421     },
    422     {
    423       "flag": "Single run, no variance reported",
    424       "detail": "All results are single-run point estimates; no standard deviation or error bars are reported, making it impossible to assess whether observed differences exceed noise."
    425     },
    426     {
    427       "flag": "Proprietary, non-reproducible benchmark",
    428       "detail": "The evaluation benchmark and 1,669-project retrieval corpus are proprietary WeChat internal data; independent reproduction or verification of any result is structurally impossible."
    429     },
    430     {
    431       "flag": "C++-only study",
    432       "detail": "All experiments use C++ code exclusively; conclusions recommending RAG configurations for 'proprietary environments' broadly are unsupported since other languages may respond differently to lexical vs semantic retrieval."
    433     },
    434     {
    435       "flag": "No inference latency or cost reported",
    436       "detail": "The paper evaluates RAG accuracy but does not report retrieval latency, inference overhead, or compute cost — critical factors for deployment decisions in production code completion systems."
    437     }
    438   ],
    439   "cited_papers": [
    440     {
    441       "title": "GraphCoder: Enhancing Repository-Level Code Completion via Code Context Graph-based Retrieval and Language Model",
    442       "relevance": "Repository-level RAG code completion using graph-based retrieval, direct structural comparator to this study"
    443     },
    444     {
    445       "title": "REPOFUSE: Repository-Level Code Completion with Fused Dual Context",
    446       "relevance": "Repository-level code completion combining dependency and similarity context, closely related prior approach"
    447     },
    448     {
    449       "title": "Dataflow-Guided Retrieval Augmentation for Repository-Level Code Completion",
    450       "relevance": "Alternative RAG approach using data flow graphs for code completion context retrieval"
    451     },
    452     {
    453       "title": "ReACC: A Retrieval-Augmented Code Completion Framework",
    454       "relevance": "Foundational RAG framework for code completion on public benchmarks, motivates this closed-source extension"
    455     },
    456     {
    457       "title": "FT2Ra: A Fine-Tuning-Inspired Approach to Retrieval-Augmented Code Completion",
    458       "relevance": "Related RAG code completion approach evaluated on public benchmarks"
    459     },
    460     {
    461       "title": "RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems",
    462       "relevance": "Standard benchmark methodology for repository-level code completion this study extends to closed-source settings"
    463     },
    464     {
    465       "title": "Studying LLM Performance on Closed- and Open-source Data",
    466       "relevance": "Directly motivates the investigation of performance gaps between open-source and closed-source codebases"
    467     },
    468     {
    469       "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages",
    470       "relevance": "Semantic retrieval model evaluated as one of the four semantic (embedding-based) retrievers, alongside lexical BM25, in the similarity-based RAG comparison"
    471     },
    472     {
    473       "title": "CodeBLEU: a Method for Automatic Evaluation of Code Synthesis",
    474       "relevance": "Primary evaluation metric used throughout the paper"
    475     },
    476     {
    477       "title": "STALL+: Boosting LLM-based Repository-level Code Completion with Static Analysis",
    478       "relevance": "Alternative approach combining static analysis with LLM-based code completion, related line of work"
    479     }
    480   ],
    481   "engagement_factors": {
    482     "practical_relevance": {
    483       "score": 3,
    484       "justification": "Direct industrial deployment study at WeChat scale with actionable configuration guidance (BM25+GTE-Qwen hybrid for 7B+ models) for practitioners building closed-source code completion systems."
    485     },
    486     "surprise_contrarian": {
    487       "score": 1,
    488       "justification": "Main findings confirm expected directions (RAG helps, hybrid retrieval is better); the finding that GTE-Qwen uniquely outperforms with incomplete queries is a mildly interesting exception to the general pattern."
    489     },
    490     "fear_safety": {
    491       "score": 0,
    492       "justification": "No AI risk, safety, or security concerns are raised; this is a pure productivity tool evaluation."
    493     },
    494     "drama_conflict": {
    495       "score": 0,
    496       "justification": "No controversial claims, disputes with prior work, or conflict angles present."
    497     },
    498     "demo_ability": {
    499       "score": 1,
    500       "justification": "Methods use open-source models and public retrieval libraries (BM25S, Qdrant, vLLM), making the approach replicable in principle, but the proprietary benchmark and corpus prevent direct reproduction."
    501     },
    502     "brand_recognition": {
    503       "score": 2,
    504       "justification": "WeChat/Tencent is a globally recognized platform (1B+ MAU cited); paper also evaluates prominent recent models including DeepSeek-V3 and Qwen2.5 series."
    505     }
    506   },
    507   "hn_data": {
    508     "threads": [
    509       {
    510         "hn_id": "44769170",
    511         "title": "The unreasonable likelihood of being: origin of life, terraforming, and AI",
    512         "points": 16,
    513         "comments": 9,
    514         "url": "https://news.ycombinator.com/item?id=44769170"
    515       },
    516       {
    517         "hn_id": "44198829",
    518         "title": "Algebra Unveils Deep Learning – An Invitation to Neuroalgebraic Geometry",
    519         "points": 13,
    520         "comments": 0,
    521         "url": "https://news.ycombinator.com/item?id=44198829"
    522       },
    523       {
    524         "hn_id": "42886971",
    525         "title": "Thoughts Are All over the Place: On the Underthinking of O1-Like LLMs",
    526         "points": 4,
    527         "comments": 0,
    528         "url": "https://news.ycombinator.com/item?id=42886971"
    529       },
    530       {
    531         "hn_id": "42884879",
    532         "title": "Streaming DiLoCo: Towards a Distributed Free Lunch (Google DeepMind)",
    533         "points": 3,
    534         "comments": 0,
    535         "url": "https://news.ycombinator.com/item?id=42884879"
    536       },
    537       {
    538         "hn_id": "45056536",
    539         "title": "Galois Theory by Calculator",
    540         "points": 2,
    541         "comments": 1,
    542         "url": "https://news.ycombinator.com/item?id=45056536"
    543       },
    544       {
    545         "hn_id": "45801598",
    546         "title": "Streaming DiLoCo: Towards a Distributed Free Lunch",
    547         "points": 2,
    548         "comments": 0,
    549         "url": "https://news.ycombinator.com/item?id=45801598"
    550       },
    551       {
    552         "hn_id": "44081257",
    553         "title": "An Invitation to Neuroalgebraic Geometry",
    554         "points": 2,
    555         "comments": 0,
    556         "url": "https://news.ycombinator.com/item?id=44081257"
    557       },
    558       {
    559         "hn_id": "43321959",
    560         "title": "Swallowing the Poison Pills: Insights from Vulnerability Disparity Among LLMs",
    561         "points": 1,
    562         "comments": 0,
    563         "url": "https://news.ycombinator.com/item?id=43321959"
    564       }
    565     ],
    566     "top_points": 16,
    567     "total_points": 43,
    568     "total_comments": 10
    569   }
    570 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs