scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (26522B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Exploring the Security Threats of Knowledge Base Poisoning in Retrieval-Augmented Code Generation",
      6     "authors": [
      7       "Bo Lin",
      8       "Shangwen Wang",
      9       "Liqian Chen",
     10       "Xiaoguang Mao"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2502.03233",
     15     "doi": "10.48550/arXiv.2502.03233"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The 48% VR claim for a single poisoned sample with CodeLlama+JINA is directly supported in Table 4 (VR=0.48 at poisoning=1). The ~36% VR at 20% poisoning in Scenario II is confirmed in Table 5 (CodeLlama JINA VR=0.36 at proportion=0.2).",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Controlled experiments systematically vary poisoning quantity from 0 to 9 samples and 0% to 100% proportion with unpoisoned baselines, providing adequate grounds for causal inference in this systems-evaluation context.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper draws broad conclusions about RACG security in general but tests only 4 LLMs, 4 programming languages, one vulnerability dataset (ReposVul), and two retrievers in a controlled lab setting. Conclusions like 'code LLMs are more prone to generate vulnerable code' extend beyond the tested scope without explicit bounding.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper does not consider that the high baseline VR (26-29% without any poisoning) may indicate the LLM judge is inflating results or that LLMs already struggle with secure code generation independently. The possibility that LLMs recognize and reproduce training-set vulnerable patterns (contamination) is not discussed.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper explicitly validates its LLM-as-judge proxy via manual inspection of ~360 samples, reporting 77-84% accuracy, and acknowledges this as an approximation before using it as the primary evaluation metric.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 6.5 'Threats to Validity' is a dedicated section, not merely a sentence in the conclusion.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "The section discusses specific threats: LLM-generated query accuracy (86% verified via manual review of 100 queries per language) and the limited language coverage (4 languages = 42.7% of GitHub pull request activity in Q1 2024).",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The threats section acknowledges limitations but does not explicitly state what the results do NOT show (e.g., no statement that results don't generalize to non-JINA/BM25 retrievers, or don't address real-world attack feasibility beyond the lab setup).",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding source is mentioned anywhere in the paper, including acknowledgments.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All four authors are listed as affiliated with National University of Defense Technology, clearly stated under each author name.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funding is disclosed, so independence cannot be assessed.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement, patent disclosure, or financial interests declaration is present in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "RACG is defined with a workflow diagram (Figure 1); Vulnerability Rate (VR), VRRC, and the two attack scenarios are formally defined with mathematical notation in Section 3.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Three explicit contributions are listed: first comprehensive study of RACG security risks, large-scale experimentation across 16 sub-scenarios, and practical insights on influencing factors.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 6.4 explicitly compares RACG poisoning with RAG poisoning (PoisonedRAG), and Sections 2.1-2.4 situate the work relative to LLMs, RAG, RACG, and existing attack types, showing how this work differs from prior code security studies.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "No link to the experimental codebase is provided. The paper only references external tools (a public BM25 GitHub repo, HuggingFace for JINA embeddings) but does not release its own experimental framework.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "The primary dataset ReposVul (Wang et al., ICSE 2024) is a publicly available repository-level vulnerability dataset, not a custom artifact.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "The paper mentions 'single A100-40G GPU server using the Ollama framework' but provides no requirements.txt, Dockerfile, or complete dependency specification with version numbers.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No step-by-step reproduction instructions are provided. The pipeline is described conceptually across multiple sections but not with actionable commands or scripts to follow.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "No confidence intervals or error bars are reported for any metric (VR, Similarity, VRRC). All reported results are single-point estimates.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "Comparative claims are made throughout (JINA vs BM25, code LLMs vs general LLMs, one-shot vs three-shot) without any statistical significance tests.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Effect sizes are reported throughout (e.g., CodeLlama VR increases from 0.29 to 0.48 with one poisoned sample, a 0.19 absolute increase; a 6.5% relative VR rise, from 0.46 to 0.49, when moving from one-shot to three-shot). These provide meaningful scale context.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The 12,053 instances from ReposVul are used without formal power analysis or justification for why this size is sufficient for the comparative claims made across 16 sub-scenarios.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "No variance, standard deviation, or spread is reported across experimental runs. Temperature=0 reduces but does not eliminate non-determinism, and residual variance is not quantified.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Unpoisoned knowledge base (poisoning=0) is consistently included as baseline in Tables 4, 5, and 6 across all LLMs and retrievers.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Models include GPT-4o and state-of-the-art open-source models (Llama-3-8B, DeepSeek-Coder-V2-16B) selected from the LLM Safety Leaderboard as of October 2024.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "RQ2 systematically ablates poisoning quantity, number of few-shot examples, programming language, example-query similarity range, and CWE vulnerability type as independent sub-questions.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Three metrics are used: Vulnerability Rate (VR), Similarity (CrystalBLEU), and Vulnerability Rate in Retrieved Code (VRRC), capturing different aspects of the attack impact.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": true,
    205           "justification": "Two authors independently reviewed 360 generated code samples (95+81+93+91 across four languages) from GPT-4o outputs to validate the LLM judge, constituting human evaluation of system outputs.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": false,
    210           "answer": false,
    211           "justification": "This is not a traditional prediction task; the study evaluates vulnerability propagation under controlled poisoning conditions rather than generalization to unseen test data.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results are broken down by programming language (Table 7), CWE vulnerability type (Tables 9 and 12 for Top-25), example-query similarity range (Table 8), and consistently by retriever and individual LLM throughout.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": false,
    223           "justification": "The paper notes CWE-434 has the lowest VR and BM25 has lower susceptibility, but does not present specific failure cases or examples of queries where poisoning did not propagate.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "The paper reports that BM25 retriever is substantially less susceptible to poisoning (VRRC 0.06 vs 0.41 for JINA with 5 samples), and that Scenario II (hidden intent) requires orders of magnitude more poisoning to achieve equivalent VRRC.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": false,
    237           "justification": "GPT-4o is cited without a snapshot date or version string; open-source models have parameter counts but no commit hashes or Hugging Face revision identifiers.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Appendices A, B, and C provide the complete prompt templates for query generation, vulnerability pattern extraction, and security assessment respectively.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Section 4.6 reports temperature=0, top-p=0.95, max_new_tokens=4096, and context window=8192.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "The full RACG pipeline is described in detail: retriever mechanics (BM25 token-frequency vs JINA cosine similarity), knowledge base construction, poisoning injection methodology (clustering-based for Scenario II), and the two-step LLM judge evaluation pipeline.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Filtering criteria are documented (remove functions <3 lines of implementation, remove names containing 'test'), and the query generation procedure for functions lacking comments is described with the specific LLM (DeepSeek-V2.5) used.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "The generated code samples, LLM judge outputs, and intermediate experimental data are not released in any repository.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Table 1 systematically evaluates 12 candidate vulnerability datasets against four criteria, documenting the selection rationale for ReposVul. Post-filtering statistics are provided in Table 2.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "Standard publicly available benchmark dataset (ReposVul) was used; no participant recruitment was involved.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The full pipeline from ReposVul dataset → filtering → query generation → knowledge base construction → poisoning injection → retrieval → LLM generation → LLM judge evaluation is documented across Sections 4.1-4.3.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "No training data cutoff is stated for any of the four LLMs tested. This is critical since ReposVul is built from GitHub repositories that likely overlap with LLM training corpora.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "The paper does not discuss potential overlap between LLM training data (GitHub code) and the ReposVul test dataset (also GitHub-sourced). The high baseline VR (26-29% unpoisoned) may partly reflect LLMs already having learned vulnerable patterns from training.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "ReposVul contains real-world GitHub code that was likely in the pretraining corpora of all tested LLMs (CodeLlama trained on 500B code tokens). Whether LLMs recognize specific vulnerability patterns from training vs. from the retrieved examples is not addressed.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants; IRB not applicable.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "No API costs for GPT-4o (used for ~12,053 generations plus judge calls) or GPU hours for the three local models are reported.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Only the hardware (single A100-40G GPU) is mentioned; the total compute hours for 16 sub-scenarios × 12,053 instances of generation plus evaluation are not stated.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "A single poisoned code sample with the JINA retriever can render approximately 48% of CodeLlama-generated code vulnerable",
    374       "evidence": "Table 4 shows CodeLlama VR increases from 0.29 (no poisoning) to 0.48 (1 poisoned sample) with JINA retriever in Scenario I",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Code-specialized LLMs (CodeLlama) are more susceptible to knowledge base poisoning than general-purpose LLMs",
    379       "evidence": "Across all poisoning quantities and retrievers, CodeLlama consistently shows the highest VR (e.g., 0.53 at JINA, 9 samples) vs Llama-3 (0.37); attributed to code-focused training on larger datasets including vulnerable patterns",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Dense retrievers (JINA) propagate vulnerabilities far more effectively than sparse retrievers (BM25)",
    384       "evidence": "Table 4 shows JINA achieving VRRC=0.41 vs BM25=0.06 with 5 poisoned samples; confirmed by Table 11 showing JINA MRR=0.85 vs BM25 MRR=0.20",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Increasing few-shot examples from one-shot to three-shot raises vulnerability rate by ~6.5% with JINA retriever",
    389       "evidence": "Table 6 shows aggregated VR increasing from 0.46 to 0.49 (a 6.5% relative increase, 0.03 absolute) in Scenario I with JINA across all LLMs; VRRC rises from 0.41 to 0.44",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Example-query similarity above 60% significantly increases vulnerability risk; similarity below 60% has minor impact",
    394       "evidence": "Table 8 shows VR rising steeply from 0.35 ([40,60) range) to 0.53 ([80,100] range) in Scenario I, while lower similarity ranges show only modest increases",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "CWE-352 (Cross-Site Request Forgery) consistently shows the highest vulnerability propagation rate (~0.79) among MITRE Top-10",
    399       "evidence": "Table 9 reports CWE-352 average VR of 0.79 in Scenario I and 0.78 in Scenario II across all four LLMs",
    400       "supported": "strong"
    401     },
    402     {
    403       "claim": "Knowledge base poisoning does not significantly degrade functional performance (code similarity)",
    404       "evidence": "Tables 4-5 show Similarity (CrystalBLEU) changes are minimal across poisoning levels (e.g., DS-Coder JINA: 0.76 baseline vs 0.78 at 9 poisoned samples)",
    405       "supported": "moderate"
    406     }
    407   ],
    408   "methodology_tags": [
    409     "benchmark-eval",
    410     "case-study"
    411   ],
    412   "key_findings": "Knowledge base poisoning in RACG systems is a realistic, low-effort attack: a single poisoned sample injected into a 12,053-item knowledge base (0.008% poisoning rate) can render 48% of CodeLlama-generated code vulnerable when using a dense retriever. The attack is stealthy because it does not degrade functional performance (CrystalBLEU scores remain stable). Dense retrievers like JINA amplify the attack significantly (VRRC=0.41) compared to sparse retrievers like BM25 (VRRC=0.06) due to superior semantic retrieval. In the blind attack scenario (hidden programmer intent), achieving comparable impact requires injecting ~9,642 samples—orders of magnitude more effort, making it far more detectable.",
    413   "red_flags": [
    414     {
    415       "flag": "No statistical testing",
    416       "detail": "All comparative claims (JINA vs BM25, code LLMs vs general LLMs, one-shot vs three-shot) are made without confidence intervals, significance tests, or variance across runs. Single-point estimates are presented as definitive findings."
    417     },
    418     {
    419       "flag": "Training data contamination unaddressed",
    420       "detail": "ReposVul is sourced from GitHub repositories; all tested LLMs (especially CodeLlama, trained on 500B code tokens from GitHub) likely encountered these vulnerable patterns during pretraining. The high baseline VR (26-29% with no poisoning) may reflect this contamination rather than inherent LLM vulnerability, but this confound is never discussed."
    421     },
    422     {
    423       "flag": "LLM judge reliability ceiling",
    424       "detail": "The vulnerability judge achieves only 77-84% accuracy, i.e., a 16-23% error rate. At 48% reported VR, misclassification at that rate could shift the true VR substantially. Error propagation from judge uncertainty is not quantified in reported results."
    425     },
    426     {
    427       "flag": "GPT-4o version unspecified",
    428       "detail": "GPT-4o is referenced only by marketing name without a snapshot date or API version string, making exact replication of results impossible for the closed-source model."
    429     },
    430     {
    431       "flag": "No replication package",
    432       "detail": "No experimental code, generated outputs, or intermediate data are released. Replication requires re-implementing the full pipeline (clustering-based poisoning, LLM judge, RACG scaffolding) from scratch."
    433     }
    434   ],
    435   "cited_papers": [
    436     {
    437       "title": "PoisonedRAG: Knowledge Poisoning Attacks to Retrieval-Augmented Generation of Large Language Models",
    438       "relevance": "Most directly related prior work on RAG knowledge base poisoning; this paper extends the attack surface to code generation security specifically"
    439     },
    440     {
    441       "title": "ReposVul: A Repository-Level High-Quality Vulnerability Dataset",
    442       "relevance": "Primary dataset used in all experiments; foundation of the knowledge base construction and poisoning scenarios"
    443     },
    444     {
    445       "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions",
    446       "relevance": "Foundational prior work on security of LLM-generated code without RAG; establishes the baseline security problem this work extends"
    447     },
    448     {
    449       "title": "How Secure is AI-Generated Code: A Large-Scale Comparison of Large Language Models",
    450       "relevance": "Recent large-scale baseline establishing LLM-generated code security rates across LLMs; direct comparison point for this study's unpoisoned baseline"
    451     },
    452     {
    453       "title": "Retrieval Augmented Code Generation and Summarization",
    454       "relevance": "Foundational RACG work that established the paradigm this paper's threat model targets"
    455     },
    456     {
    457       "title": "Vul-RAG: Enhancing LLM-based Vulnerability Detection via Knowledge-Level RAG",
    458       "relevance": "Shares the two-step vulnerability extraction-detection pipeline adopted in this paper's LLM judge design"
    459     },
    460     {
    461       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    462       "relevance": "Justification for the LLM-as-judge evaluation methodology used as the primary vulnerability detection approach"
    463     },
    464     {
    465       "title": "Poisoning Web-Scale Training Datasets is Practical",
    466       "relevance": "Provides the realistic threat model foundation for knowledge base poisoning via public repository injection"
    467     }
    468   ],
    469   "engagement_factors": {
    470     "practical_relevance": {
    471       "score": 3,
    472       "justification": "Bears directly on the security of production RACG systems (GitHub Copilot, Cursor, etc.) that millions of developers use; the demonstrated attack is low-effort, requiring only 1 poisoned sample."
    473     },
    474     "surprise_contrarian": {
    475       "score": 2,
    476       "justification": "The finding that a 0.008% knowledge base poisoning rate can compromise 48% of generated code is a striking quantitative result; the stealthiness (no functional degradation) is a non-obvious finding."
    477     },
    478     "fear_safety": {
    479       "score": 3,
    480       "justification": "Demonstrates a realistic, scalable attack vector against widely deployed AI coding tools with potential for supply-chain security compromise via publicly accessible code repositories."
    481     },
    482     "drama_conflict": {
    483       "score": 2,
    484       "justification": "Security attack research targeting popular AI coding tools has inherent controversy; the framing as 'first comprehensive study' creates urgency around a widely trusted technology."
    485     },
    486     "demo_ability": {
    487       "score": 1,
    488       "justification": "Reproducing the attack requires setting up 4 LLMs, custom retrieval infrastructure, and the full experimental pipeline; no demo or code is released."
    489     },
    490     "brand_recognition": {
    491       "score": 1,
    492       "justification": "National University of Defense Technology is recognized but not a top-tier AI lab; no famous product or model family is introduced."
    493     }
    494   },
    495   "hn_data": {
    496     "threads": [],
    497     "top_points": 0,
    498     "total_points": 0,
    499     "total_comments": 0
    500   }
    501 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs