scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (29050B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Give LLMs a Security Course: Securing Retrieval-Augmented Code Generation via Knowledge Injection",
      6     "authors": [
      7       "Bo Lin",
      8       "Shangwen Wang",
      9       "Yihao Qin",
     10       "Liqian Chen",
     11       "Xiaoguang Mao"
     12     ],
     13     "year": 2025,
     14     "venue": "Conference on Computer and Communications Security",
     15     "arxiv_id": "2504.16429",
     16     "doi": "10.1145/3719027.3765049"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All quantitative claims in the abstract (20.12%, 31.53%, 21.91% improvements) are directly backed by Tables 4, 5, and 6; cross-language generalization claims are supported by Table 8.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims about component contributions are supported by ablation study (Section 7.3) removing query decomposition and re-ranking; comparisons against multiple baselines across three distinct scenarios provide further causal support.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The conclusion claims this is 'a pivotal advancement towards building secure and trustworthy RACG systems' and 'a critical step toward securing LLM-based software development,' yet evaluation is limited to one benchmark (CyberSecEval, 50 CWEs) and four languages — claims that exceed the evidence.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not consider whether adding any additional context to prompts (regardless of security relevance) might explain improvements, or whether query decomposition benefits general comprehension rather than security specifically.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly acknowledges Security Rate is measured by static analysis tools with 96% precision on 50 CWEs and 'may not perfectly capture the true security posture'; functional correctness is validated separately via MBPP/HumanEval test cases.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 7.5 'Threats to Validity' is a dedicated section addressing measurement validity and LLM non-determinism with specific details.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Threats are specific: the 96% precision detector on 50 CWEs may miss other vulnerability types; LLM non-determinism is quantified as max 0.38% SR deviation across 5 DS-V3 runs at temperature=0.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "While measurement limitations and language-specific knowledge gaps are mentioned, the paper does not explicitly state what results do NOT show — e.g., that findings cannot be extrapolated to the full universe of vulnerability types or to deployment contexts outside RACG.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "The Acknowledgments section is completely blank; no funding sources are disclosed despite this being institutionally affiliated academic research from NUDT.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All five authors are clearly affiliated with the College of Computer Science, National University of Defense Technology (NUDT), Changsha, China, listed in the paper header.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding is disclosed, making independence assessment impossible.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or declaration of financial interests is present anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "RACG, Security Rate (SR), and the three knowledge base types (S, K, V) are explicitly defined; SR is operationally defined as percentage of generated code verified secure by CyberSecEval's Insecure Code Detector.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three explicit contributions are bullet-pointed: (1) first security-hardening framework for RACG addressing poisoning, (2) the CodeGuarder framework itself, and (3) extensive empirical evaluation across diverse scenarios.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 surveys prior RACG systems and code security approaches (SVEN, SafeCoder, CoSec), explicitly situating CodeGuarder as distinct in addressing RACG-specific poisoning threats not covered by fine-tuning methods.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No code repository or artifact link is mentioned anywhere in the paper; no GitHub URL or supplemental materials pointer is provided.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "The constructed Security Knowledge Base (8,861+ C entries extracted from ReposVul via LLM prompting) is the primary artifact and is not released; evaluation uses publicly available CyberSecEval but the core constructed datasets (S, K, V) are unavailable.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper mentions A100 GPU and Ollama framework with model parameters (temperature, max_new_tokens, context window), but no requirements file, Dockerfile, or dependency specification is provided.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are provided; prompt templates in the appendix are helpful but the full pipeline lacks runnable instructions without the unreleased code and knowledge bases.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "No confidence intervals or error bars are reported for any main results tables; only a post-hoc variance check (max 0.38% deviation across 5 runs) is reported for one model in one scenario.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied to any comparative claims; all improvements are reported as raw percentage differences without p-values.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Percentage improvements are reported with baseline context throughout (e.g., SR increases from 60.84% to 73.08%, i.e., 20.12%) broken down by LLM and language, providing interpretable effect sizes.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "CyberSecEval's 1,916 instances are noted as larger than alternatives (vs. 150 in the second-largest), but no power analysis or formal sample size justification for the evaluation is provided.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Variance is only reported for DS-V3 in one scenario (5 runs, 0.38% max SR deviation); all other models and scenarios report single-run point estimates with no spread.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "RACG without CodeGuarder serves as the baseline for main experiments; SVEN, SafeCoder, and CoSec are included as baselines in the non-retrieval comparison (Table 7).",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "SafeCoder (2024), CoSec (2024), and SVEN (2023) are contemporary state-of-the-art security hardening methods; DeepSeek-V3 and GPT-4o are current frontier models.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Section 7.3 presents ablation removing query decomposition (CodeGuarder-QD) and knowledge re-ranking/filtering (CodeGuarder-KRF), evaluated on DS-V3 across all three scenarios.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Security Rate (SR), CodeBLEU similarity (Sim), pass@1 and pass@5 on MBPP and HumanEval, and per-CWE prevention percentages are all used.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "No human evaluation of generated code security or quality is conducted; all security assessment is automated via CyberSecEval's Insecure Code Detector (static analysis).",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "CyberSecEval serves as the evaluation benchmark, while the Security Knowledge Base is constructed from ReposVul (a separate dataset), maintaining separation between knowledge and evaluation data.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by programming language (C, C++, Java, Python), by LLM (4 models), by scenario (3 scenarios), and by CWE type (Table 10, 12 weaknesses from MITRE Top-25).",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section 7.2 discusses CWE-79 (XSS, 4.76% prevention rate attributed to absent JavaScript-specific knowledge) and CWE-22 (path traversal, 8.26%) with explanations for poor performance.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The paper reports Rust achieves only 7.03% improvement in cross-language generalization, and explicitly states CodeGuarder's effectiveness is 'somewhat limited' without language-specific knowledge (21.26% vs. 31.53% with it).",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "GPT-4o is referenced without a snapshot date or version identifier; since GPT-4o updates continuously via the API, this is insufficient for reproduction. Other models (DeepSeek-V3, CodeLlama-13B, DeepSeek-Coder-V2-16B) are more specifically identified.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Appendix B provides three complete prompt templates (security knowledge extraction, query decomposition, security-augmented code generation) with placeholder variables and a concrete example instantiation for Prompt 3.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Temperature=0, max_new_tokens=4096, context window=8192, k'=2, k=5 are all reported; the embedding model (jina-embeddings-v3) and poisoning retriever (text-embedding-3-large) are both specified.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The full CodeGuarder pipeline is described in detail in Sections 4.1–4.3: offline knowledge base construction, query decomposition, cosine similarity retrieval, CWE-frequency re-ranking, and prompt injection.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 5.2 documents construction of each knowledge base (S, K, V), including exclusion of insecure CSN code (81.3% of Java random-function code flagged), and extraction of function-level diffs from ReposVul CVE pairs.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Raw experimental outputs (per-instance generated code, security classifications) are not made available; only aggregated statistics in tables are reported.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Construction of the Security Knowledge Base from ReposVul CVE instances via LLM extraction (Prompt 1, Section 4.1) is described with the three knowledge dimensions (functionality, root cause, fixing pattern).",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants; all data comes from automated benchmark evaluation and existing vulnerability databases.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The pipeline from CVE instances → LLM extraction → knowledge base entries is documented with example prompts and output JSON format; the online retrieval and injection pipeline is fully specified.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No training data cutoffs are stated for any of the four evaluated LLMs (GPT-4o, DeepSeek-V3, CodeLlama-13B, DeepSeek-Coder-V2-16B).",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "The paper does not discuss whether CyberSecEval (published 2023) or ReposVul CVEs appeared in LLM training corpora, which could inflate baseline security rates for all models.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "CyberSecEval was published before plausible training cutoffs of GPT-4o and DeepSeek-V3; whether benchmark examples were available during training is not discussed anywhere in the paper.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants involved.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants involved.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants involved.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants involved.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants involved.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants involved.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants involved.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No latency, API cost, or throughput overhead of the knowledge injection pipeline is reported, despite this being practically relevant for deployment decisions in production RACG systems.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Only 'A100 GPU server' is mentioned; total GPU-hours, API call counts, or computational budget for running the full evaluation (4 LLMs × 4 languages × 3 scenarios × 1,916 instances) are not reported.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "CodeGuarder achieves an average security rate improvement of 20.12% in standard RACG scenarios across four LLMs and four programming languages",
    375       "evidence": "Table 4 shows average baseline SR 60.84% vs. 73.08% with CodeGuarder across all 16 LLM-language combinations; per-language: C +27.20%, C++ +9.37%, Java +47.72%, Python +12.35%",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "CodeGuarder improves security rate by 31.53% under targeted poisoning (Scenario I) and 21.91% under intent-agnostic poisoning (Scenario II)",
    380       "evidence": "Tables 5 and 6 show consistent improvements across all four LLMs and languages under both poisoning conditions; Java shows the highest gain in both scenarios (54.99% and 43.12%)",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "CodeGuarder does not compromise functional correctness of generated code",
    385       "evidence": "Table 9 shows pass@1 and pass@5 scores on MBPP and HumanEval are maintained or slightly improved for all four LLMs (e.g., GPT-4o MBPP pass@1: 72.8 → 73.2)",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "CodeGuarder outperforms state-of-the-art SafeCoder by 9.80% SR in non-retrieval code generation scenarios",
    390       "evidence": "Table 7 shows CodeGuarder 79.40% vs. SafeCoder 72.31% on Mistral-7B (C/C++/Python average); similar advantage on CodeLlama-7B (77.06% vs. 73.08%)",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Query decomposition is the most critical component, responsible for approximately 10.1% of the security improvement",
    395       "evidence": "Table 11 ablation: removing QD drops SR from 76.36% to 68.61% in the standard scenario (7.75 percentage points, a 10.1% relative drop); removing only KRF drops SR by 2.23 percentage points (76.36% to 74.13%, a 2.9% relative drop)",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "CodeGuarder generalizes to languages without specific security knowledge, achieving 15.69% SR improvement in standard scenario",
    400       "evidence": "Table 8 shows improvements for C#, JavaScript, PHP, Rust ranging from 7.03% (Rust) to 22.23% (PHP) in standard scenario; gains are lower than in languages with dedicated knowledge",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "case-study"
    407   ],
    408   "key_findings": "CodeGuarder is a prompt-engineering security-hardening framework for RACG systems that injects CVE-derived security knowledge via query decomposition, cosine similarity retrieval, and CWE-frequency-based re-ranking — requiring no model weight modification. Evaluated on CyberSecEval across four LLMs and four programming languages, it achieves average security rate improvements of 20.12% in standard RACG and 31.53%/21.91% under targeted and intent-agnostic knowledge-base poisoning attacks, without degrading functional correctness on MBPP/HumanEval. Ablation reveals query decomposition (which enables fine-grained sub-task knowledge matching) is the dominant contributor (~10% of improvement vs. ~2% from re-ranking). The framework outperforms fine-tuning-based methods (SafeCoder, SVEN, CoSec) in non-retrieval settings and generalizes to languages without dedicated security knowledge, though with diminished effectiveness (7–22% in the standard scenario, versus 20.12% in the standard scenario and 31.53% under targeted poisoning with language-specific knowledge).",
    409   "red_flags": [
    410     {
    411       "flag": "GPT-4o version unspecified",
    412       "detail": "GPT-4o is used via API without a snapshot date or model version identifier, making results irreproducible as the model is continuously updated."
    413     },
    414     {
    415       "flag": "No statistical significance testing",
    416       "detail": "All comparative claims across LLMs, languages, and scenarios are based on raw percentage differences without p-values, confidence intervals, or multiple-comparison corrections."
    417     },
    418     {
    419       "flag": "Variance reported for only one model",
    420       "detail": "The 5-run variance check (0.38% max deviation) is performed only for DS-V3 in the standard scenario; all other models and all poisoning scenarios use single-run point estimates."
    421     },
    422     {
    423       "flag": "Contamination unaddressed",
    424       "detail": "CyberSecEval (published 2023) and ReposVul CVE descriptions could be present in the training data of GPT-4o and DeepSeek-V3, potentially inflating baseline security rates and overestimating improvements."
    425     },
    426     {
    427       "flag": "Code and knowledge base not released",
    428       "detail": "Neither the CodeGuarder implementation nor the constructed Security Knowledge Base (the primary artifact) is released, making independent validation and replication impossible."
    429     },
    430     {
    431       "flag": "Non-retrieval comparison uses smaller models",
    432       "detail": "The SafeCoder/SVEN/CoSec comparison uses 7B models (Mistral-7B, CodeLlama-7B) rather than the 13B–671B models in the main experiments; this switch is made without adequate justification and potentially advantages CodeGuarder."
    433     },
    434     {
    435       "flag": "Broad conclusion claims beyond evidence",
    436       "detail": "Claims of being 'a pivotal advancement towards building secure and trustworthy RACG systems' and 'the first security-hardening framework for RACG' rest on evaluation with one benchmark covering only 50 CWEs."
    437     }
    438   ],
    439   "cited_papers": [
    440     {
    441       "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions",
    442       "relevance": "Foundational study finding ~40% of LLM-generated code contains vulnerabilities; establishes the security problem CodeGuarder addresses"
    443     },
    444     {
    445       "title": "Large Language Models for Code: Security Hardening and Adversarial Testing (SVEN)",
    446       "relevance": "Prior prefix-tuning approach to steer LLMs toward secure code; primary baseline in non-retrieval comparison"
    447     },
    448     {
    449       "title": "Instruction Tuning for Secure Code Generation (SafeCoder)",
    450       "relevance": "State-of-the-art fine-tuning approach for secure code generation; main comparison target in non-retrieval scenario"
    451     },
    452     {
    453       "title": "CoSec: On-the-Fly Security Hardening of Code LLMs via Supervised Co-decoding",
    454       "relevance": "Co-decoding approach modifying token probabilities for security without weight modification; baseline in non-retrieval comparison"
    455     },
    456     {
    457       "title": "CyberSecEval 3: Advancing the Evaluation of Cybersecurity Risks and Capabilities in Large Language Models",
    458       "relevance": "Primary evaluation benchmark (1,916 instances, 50 CWEs, Insecure Code Detector with 96% precision) used for all security assessment"
    459     },
    460     {
    461       "title": "How Secure is AI-Generated Code: A Large-Scale Comparison of Large Language Models",
    462       "relevance": "Provides vulnerability type distribution across 13 LLMs on 310K+ instructions; CodeGuarder's re-ranking weights are derived directly from this data"
    463     },
    464     {
    465       "title": "ReposVul: A Repository-Level High-Quality Vulnerability Dataset",
    466       "relevance": "Source dataset for constructing Security Knowledge Base and Functional Code Base (12,053 function-level vulnerable/fixed pairs across four languages)"
    467     },
    468     {
    469       "title": "Exploring the Security Threats of Knowledge Base Poisoning in Retrieval-Augmented Code Generation",
    470       "relevance": "Prior work by same authors establishing the poisoning threat model and empirical baselines (48% vulnerability rate from single poisoned sample) that CodeGuarder directly addresses"
    471     }
    472   ],
    473   "engagement_factors": {
    474     "practical_relevance": {
    475       "score": 3,
    476       "justification": "Directly addresses a production security concern in developer tooling (RACG systems like Copilot) without requiring model fine-tuning, making it immediately applicable as a prompt-layer wrapper."
    477     },
    478     "surprise_contrarian": {
    479       "score": 1,
    480       "justification": "The core result (adding security knowledge improves security) is expected; the finding that cross-language generalization works via universal CWE patterns is mildly non-obvious."
    481     },
    482     "fear_safety": {
    483       "score": 2,
    484       "justification": "Addresses real-world risks of AI-generated insecure code and knowledge-base poisoning attacks against RACG systems used in production software development."
    485     },
    486     "drama_conflict": {
    487       "score": 1,
    488       "justification": "Frames a security vs. functionality tradeoff but largely resolves it without tension; the poisoning attack scenarios add some adversarial drama."
    489     },
    490     "demo_ability": {
    491       "score": 2,
    492       "justification": "The framework could be reimplemented from the paper's detailed method descriptions and tested on public CyberSecEval, though absence of released code requires significant engineering effort."
    493     },
    494     "brand_recognition": {
    495       "score": 0,
    496       "justification": "Authors are from NUDT (National University of Defense Technology), not a prominent AI lab; no famous models or products are introduced."
    497     }
    498   },
    499   "hn_data": {
    500     "threads": [
    501       {
    502         "hn_id": "45955957",
    503         "title": "Official LIGO-Virgo-Kagra Benchmark Shows KFR Outperforming FFTW in CERN Root",
    504         "points": 3,
    505         "comments": 0,
    506         "url": "https://news.ycombinator.com/item?id=45955957"
    507       },
    508       {
    509         "hn_id": "44318076",
    510         "title": "The Impact of Generative AI on Social Media: An Experimental Study",
    511         "points": 3,
    512         "comments": 0,
    513         "url": "https://news.ycombinator.com/item?id=44318076"
    514       },
    515       {
    516         "hn_id": "46931046",
    517         "title": "Open Problems in Mechanistic Interpretability",
    518         "points": 2,
    519         "comments": 0,
    520         "url": "https://news.ycombinator.com/item?id=46931046"
    521       },
    522       {
    523         "hn_id": "45156732",
    524         "title": "Zero-Shot Reinforcement Learning",
    525         "points": 2,
    526         "comments": 0,
    527         "url": "https://news.ycombinator.com/item?id=45156732"
    528       },
    529       {
    530         "hn_id": "42864437",
    531         "title": "Open Problems in Mechanistic Interpretability",
    532         "points": 2,
    533         "comments": 0,
    534         "url": "https://news.ycombinator.com/item?id=42864437"
    535       },
    536       {
    537         "hn_id": "44193004",
    538         "title": "I-Con: A Unifying Framework for Representation Learning",
    539         "points": 1,
    540         "comments": 1,
    541         "url": "https://news.ycombinator.com/item?id=44193004"
    542       },
    543       {
    544         "hn_id": "44012060",
    545         "title": "Open Problems in Mechanistic Interpretability",
    546         "points": 1,
    547         "comments": 0,
    548         "url": "https://news.ycombinator.com/item?id=44012060"
    549       }
    550     ],
    551     "top_points": 3,
    552     "total_points": 14,
    553     "total_comments": 1
    554   }
    555 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs