scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (26359B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Defending Against Prompt Injection with DataFilter",
      6     "authors": ["Yizhu Wang", "Sizhe Chen", "Raghad F Alkhudair", "Basel Alomair", "David Wagner"],
      7     "year": 2025,
      8     "venue": "arXiv.org",
      9     "arxiv_id": "2510.19207",
     10     "doi": "10.48550/arXiv.2510.19207"
     11   },
     12   "checklist": {
     13     "claims_and_evidence": {
     14       "abstract_claims_supported": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "All abstract claims are supported: ASR reduction to near-zero is shown in Tables II–IV, utility preservation within 1–2% in Tables V–VI, and superiority over baselines in Figure 2.",
     18         "source": "haiku"
     19       },
     20       "causal_claims_justified": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Controlled experiments hold all variables constant except presence of DataFilter, making causal attribution appropriate for claims about its effect on ASR and utility.",
     24         "source": "haiku"
     25       },
     26       "generalization_bounded": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The limitations section explicitly states DataFilter cannot defend against optimization-based adaptive attacks (83% ASR) and struggles with very long user prompts, bounding the generalization claims.",
     30         "source": "haiku"
     31       },
     32       "alternative_explanations_discussed": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper does not discuss whether Llama-3.1-8B's inherent instruction-following strength rather than the filtering mechanism drives results, nor other confounds like benchmark difficulty differences.",
     36         "source": "haiku"
     37       },
     38       "proxy_outcome_distinction": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "ASR (whether malicious API call occurs) and utility (task completion rate) are clearly defined and tied to specific claims; no conflation between what is measured and what is claimed.",
     42         "source": "haiku"
     43       }
     44     },
     45     "limitations_and_scope": {
     46       "limitations_section_present": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Section VI contains a dedicated 'Limitations' paragraph listing inference overhead, failure against optimization-based attacks, and difficulties with long user prompts.",
     50         "source": "haiku"
     51       },
     52       "threats_to_validity_specific": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Specific threats are named: strong adaptive LLM-based attacks break the defense (83% ASR), and DataFilter requires developers to extract short user instructions when the full prompt is very long.",
     56         "source": "haiku"
     57       },
     58       "scope_boundaries_stated": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The paper explicitly states DataFilter 'cannot defend against the strong optimization-based adaptive attacks' and 'may not yet match the absolute strongest protection possible with model-level defenses.'",
     62         "source": "haiku"
     63       }
     64     },
     65     "conflicts_of_interest": {
     66       "funding_disclosed": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Funding is disclosed: KACST-UC Berkeley Center of Excellence for Secure Computing, NSF grant 2229876, and gifts from Google, Meta, and the Noyce Foundation.",
     70         "source": "haiku"
     71       },
     72       "affiliations_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Author affiliations are clearly stated on the title page: UC Berkeley and KACST.",
     76         "source": "haiku"
     77       },
     78       "funder_independent_of_outcome": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "Meta and Google are funders; Meta's PromptGuard is one of the baselines being outperformed, and DataFilter uses Meta's Llama-3.1-8B as its backbone model.",
     82         "source": "haiku"
     83       },
     84       "financial_interests_declared": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) is provided beyond the funding acknowledgment.",
     88         "source": "haiku"
     89       }
     90     },
     91     "scope_and_framing": {
     92       "key_terms_defined": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Prompt injection attack, attack success rate, utility, and model-agnostic are all explicitly defined in Sections II and IV, with attacker and defender goals formally stated.",
     96         "source": "haiku"
     97       },
     98       "intended_contribution_clear": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The paper clearly contributes DataFilter: a test-time, model-agnostic SFT-based defense that removes injected instructions from untrusted data before it reaches the backend LLM.",
    102         "source": "haiku"
    103       },
    104       "engagement_with_prior_work": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section III provides extensive related work; Table I explicitly positions DataFilter against fine-tuning, prompting, detection, and system-level defenses, with concurrent work (PromptArmor, PromptLocate) distinguished.",
    108         "source": "haiku"
    109       }
    110     }
    111   },
    112   "type_checklist": {
    113     "empirical": {
    114       "artifacts": {
    115         "code_released": {
    116           "applies": true,
    117           "answer": true,
    118           "justification": "The abstract states 'Our DataFilter model is released here for immediate use, with the code to reproduce our results here,' indicating release of both model and code.",
    119           "source": "haiku"
    120         },
    121         "data_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "All evaluation benchmarks are publicly available (SEP, InjecAgent, AgentDojo, AlpacaEval2), and training uses the public Alpaca dataset.",
    125           "source": "haiku"
    126         },
    127         "environment_specified": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "Training hardware (A100/H100 GPUs) and key hyperparameters are stated, but no requirements.txt, Dockerfile, or explicit dependency specification is provided in the paper.",
    131           "source": "haiku"
    132         },
    133         "reproduction_instructions": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "Algorithm 1 provides step-by-step SFT dataset construction, Section V-A describes all training parameters, and code is released; sufficient to reproduce without guessing.",
    137           "source": "haiku"
    138         }
    139       },
    140       "statistical_methodology": {
    141         "confidence_intervals_or_error_bars": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "All results in Tables II–IX are single point estimates with no confidence intervals or error bars, despite the paper acknowledging GPT-4o is non-deterministic.",
    145           "source": "haiku"
    146         },
    147         "significance_tests": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "No statistical significance tests are applied to any comparative claims despite making superiority claims over multiple baselines.",
    151           "source": "haiku"
    152         },
    153         "effect_sizes_reported": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "Absolute percentage differences are reported (e.g., average ASR 2.2% vs 5.9% for PromptArmor; utility drop 1.0% vs 4.1%), providing practical effect size context.",
    157           "source": "haiku"
    158         },
    159         "sample_size_justified": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "SEP is evaluated on a random 1K subset of 9.1K samples with no justification for the subset size or confirmation that the subsample is representative; no power analysis is reported anywhere.",
    163           "source": "haiku"
    164         },
    165         "variance_reported": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "No variance, standard deviation, or spread measures are reported across any experimental runs, despite acknowledged model non-determinism.",
    169           "source": "haiku"
    170         }
    171       },
    172       "evaluation_design": {
    173         "baselines_included": {
    174           "applies": true,
    175           "answer": true,
    176           "justification": "Seven baselines are tested: PromptGuard, DataSentinel, Sandwich, Instructional, Spotlight, Tool Filter, and PromptArmor, spanning detection-based, prompt-based, and system-level approaches.",
    177           "source": "haiku"
    178         },
    179         "baselines_contemporary": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "All baselines are from 2023–2025 publications and represent the current state of the art in model-agnostic prompt injection defense.",
    183           "source": "haiku"
    184         },
    185         "ablation_study": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "Four training goals are described but their individual contributions are not systematically ablated; only a brief mention of training without user prompt context appears in the discussion.",
    189           "source": "haiku"
    190         },
    191         "multiple_metrics": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Multiple metrics are used: ASR, benign utility, utility under attack (AgentDojo), and length-controlled win rate (AlpacaEval2).",
    195           "source": "haiku"
    196         },
    197         "human_evaluation": {
    198           "applies": false,
    199           "answer": false,
    200           "justification": "Human evaluation is not standard for prompt injection defense evaluation; utility is measured via GPT-4-based automatic evaluation (AlpacaEval2).",
    201           "source": "haiku"
    202         },
    203         "held_out_test_set": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "DataFilter is trained on Alpaca and evaluated on entirely separate benchmarks (SEP, InjecAgent, AgentDojo, AlpacaEval2) not used in training.",
    207           "source": "haiku"
    208         },
    209         "per_category_breakdown": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Results are broken down by attack type (6 in SEP, 4 in AgentDojo, 2 in InjecAgent), backend model (gpt-4o vs Llama), and benchmark, providing granular breakdowns.",
    213           "source": "haiku"
    214         },
    215         "failure_cases_discussed": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Appendix C provides concrete false negative (billing document confusion) and false positive (cooking recipe instructions) examples with full input/output shown.",
    219           "source": "haiku"
    220         },
    221         "negative_results_reported": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "DataFilter fails against strong LLM-based adaptive attacks (83% ASR); false positives on benign imperative content are documented; limitations with long prompts are reported.",
    225           "source": "haiku"
    226         }
    227       },
    228       "setup_transparency": {
    229         "model_versions_specified": {
    230           "applies": true,
    231           "answer": true,
    232           "justification": "Exact model versions are specified: gpt-4o-2024-05-13, meta-llama/Llama-Prompt-Guard-2-86M, Llama-3.1-8B-Instruct, and GPT-5.1/GPT-4.1 for relevant comparisons.",
    233           "source": "haiku"
    234         },
    235         "prompts_provided": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "The full system prompt and user message template for DataFilter are shown verbatim in Section IV-C, including the filter instruction and special token formatting.",
    239           "source": "haiku"
    240         },
    241         "hyperparameters_reported": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "All key hyperparameters reported: batch size 1, gradient accumulation 16, learning rate 2×10^-5, cosine schedule, 100 warmup steps, BF16 precision, 300 training steps.",
    245           "source": "haiku"
    246         },
    247         "scaffolding_described": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "JSON parsing and recursive filtering for structured agentic data (Section IV-D) and the multi-turn agent setup in AgentDojo are described in sufficient detail.",
    251           "source": "haiku"
    252         },
    253         "data_preprocessing_documented": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Algorithm 1 documents exact preprocessing: truncation proportions (65%/10%/10%/15%), injection position distributions (20%/20%/60%), and attack type assignments.",
    257           "source": "haiku"
    258         }
    259       },
    260       "data_integrity": {
    261         "raw_data_available": {
    262           "applies": true,
    263           "answer": false,
    264           "justification": "The constructed SFT training dataset is not explicitly released as a separate artifact; only the base Alpaca source and the trained model are released.",
    265           "source": "haiku"
    266         },
    267         "data_collection_described": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "Algorithm 1 provides the complete data construction procedure from Alpaca samples to (prompt, data, output) triples with all design decisions and proportions documented.",
    271           "source": "haiku"
    272         },
    273         "recruitment_methods_described": {
    274           "applies": false,
    275           "answer": false,
    276           "justification": "No human participants; evaluation uses automated benchmarks requiring no recruitment.",
    277           "source": "haiku"
    278         },
    279         "data_pipeline_documented": {
    280           "applies": true,
    281           "answer": true,
    282           "justification": "The full pipeline from Alpaca → SFT dataset construction (Algorithm 1) → fine-tuning → deployment is documented with specific parameters and design rationale for each step.",
    283           "source": "haiku"
    284         }
    285       },
    286       "contamination": {
    287         "training_cutoff_stated": {
    288           "applies": true,
    289           "answer": false,
    290           "justification": "Llama-3.1-8B-Instruct's training data cutoff is not stated; it is possible the model's pretraining included examples similar to or identical to evaluation benchmarks.",
    291           "source": "haiku"
    292         },
    293         "train_test_overlap_discussed": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "The paper does not discuss whether Llama-3.1-8B's pretraining data overlaps with the evaluation benchmarks (SEP, InjecAgent, AgentDojo), which could inflate filtering performance.",
    297           "source": "haiku"
    298         },
    299         "benchmark_contamination_addressed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "SEP and InjecAgent were published before Llama 3.1's likely training cutoff; potential contamination of the filter model's base knowledge is not discussed.",
    303           "source": "haiku"
    304         }
    305       },
    306       "human_studies": {
    307         "pre_registered": {
    308           "applies": false,
    309           "answer": false,
    310           "justification": "No human participants.",
    311           "source": "haiku"
    312         },
    313         "irb_or_ethics_approval": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants.",
    317           "source": "haiku"
    318         },
    319         "demographics_reported": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "inclusion_exclusion_criteria": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "randomization_described": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "blinding_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "attrition_reported": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         }
    349       },
    350       "cost_and_practicality": {
    351         "inference_cost_reported": {
    352           "applies": true,
    353           "answer": true,
    354           "justification": "Table IX reports per-sample monetary cost and wall-clock time for GPT-5.1 (+3.7% cost, +4.0% latency) and GPT-4o (+1.0% cost, +17.5% latency) with DataFilter.",
    355           "source": "haiku"
    356         },
    357         "compute_budget_stated": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "Training hardware (two 80GB A100/H100 GPUs) and steps (300) are mentioned but total GPU-hours for training are not reported.",
    361           "source": "haiku"
    362         }
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "DataFilter reduces average ASR from over 40% to approximately 2% across multiple benchmarks",
    369       "evidence": "Tables II, III, IV show ASR reductions to max 1.2% on AgentDojo, ~2% on InjecAgent Base, and 1.5–3.4% on SEP for gpt-4o backend",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "DataFilter preserves utility within 1–2% of the undefended baseline",
    374       "evidence": "Table V shows benign utility 79.4% vs 81.4% baseline on AgentDojo; Table VI shows 54.1% vs 54.0% on AlpacaEval2 for gpt-4o",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "DataFilter outperforms all tested model-agnostic baselines on security-utility tradeoff",
    379       "evidence": "Figure 2 shows DataFilter closest to ideal defense; average ASR 2.2% vs PromptArmor 5.9%; average utility drop 1.0% vs 4.1% for PromptArmor",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "DataFilter trained on general instruction-tuning data generalizes to unseen agentic settings",
    384       "evidence": "DataFilter trained on non-agentic Alpaca achieves low ASR on agentic benchmarks AgentDojo and InjecAgent involving multi-turn tool calls",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "DataFilter is the first model-agnostic defense simultaneously achieving strong security and high utility",
    389       "evidence": "Table I categorizes all prior defenses as lacking at least one of security, utility, or model-agnostic properties; DataFilter satisfies all three",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Strong optimization-based adaptive attacks break DataFilter with 83% ASR",
    394       "evidence": "Table VIII shows a genetic algorithm-based LLM attack reaches 83% ASR against DataFilter, though this is the lowest ASR among all tested defenses (93–100% for others)",
    395       "supported": "strong"
    396     }
    397   ],
    398   "methodology_tags": ["benchmark-eval"],
    399   "key_findings": "DataFilter, a supervised fine-tuned Llama-3.1-8B model, reduces prompt injection ASR from >40% to ~2% across three benchmarks while maintaining utility within 2% of baseline, outperforming all tested model-agnostic defenses on the security-utility tradeoff. Training on general-purpose Alpaca data enables generalization to unseen agentic settings (AgentDojo, InjecAgent) without domain-specific adaptation. However, strong optimization-based adaptive attacks still achieve 83% ASR, and the defense struggles with very long user prompts requiring developer intervention. Marginal inference overhead (+1–4% cost, +4–18% latency) and plug-and-play deployment make it immediately practical for black-box commercial LLMs.",
    400   "red_flags": [
    401     {
    402       "flag": "No statistical testing",
    403       "detail": "All comparative claims are made without confidence intervals, significance tests, or variance reporting, despite the paper acknowledging non-determinism in GPT-4o; results may not be reliable across runs."
    404     },
    405     {
    406       "flag": "Funder conflict with baseline",
    407       "detail": "Meta and Google are funders; Meta's PromptGuard is a baseline being outperformed, and DataFilter uses Meta's Llama-3.1-8B as its backbone model."
    408     },
    409     {
    410       "flag": "PromptArmor reproduced by authors",
    411       "detail": "Authors reproduced PromptArmor from scratch (no official code) and modified its detection prompt, which may not reflect the strongest possible PromptArmor configuration."
    412     },
    413     {
    414       "flag": "No ablation table",
    415       "detail": "Four training goals (benign preservation, anti-hallucination, anti-repetition, position robustness) are described but their individual contributions are not systematically ablated in a table."
    416     },
    417     {
    418       "flag": "Contamination unaddressed",
    419       "detail": "Llama-3.1-8B's training cutoff is not stated; evaluation benchmarks (SEP, InjecAgent) predate Llama 3.1 and may have been seen during pretraining, potentially inflating filtering performance."
    420     },
    421     {
    422       "flag": "SEP subsample without justification",
    423       "detail": "Only 1K of 9.1K SEP samples are evaluated with no justification for subset size or confirmation that the subsample is representative."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "AgentDojo: A Dynamic Environment to Evaluate Attacks and Defenses for LLM Agents",
    429       "relevance": "Primary evaluation benchmark for both security and utility of DataFilter in multi-turn agentic tool-calling settings"
    430     },
    431     {
    432       "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents",
    433       "relevance": "Secondary evaluation benchmark measuring indirect injection in API-calling scenarios with 1K samples"
    434     },
    435     {
    436       "title": "Can LLMs Separate Instructions from Data? And What Do We Even Mean by That?",
    437       "relevance": "SEP benchmark used for instruction-following security evaluation across 6 attack types"
    438     },
    439     {
    440       "title": "Meta SecAlign: A Secure Foundation LLM Against Prompt Injection Attacks",
    441       "relevance": "State-of-the-art fine-tuning defense, used as reference for training strategy design and as comparison for model-level vs model-agnostic tradeoffs"
    442     },
    443     {
    444       "title": "The Attacker Moves Second: Stronger Adaptive Attacks Bypass Defenses Against LLM Jailbreaks and Prompt Injections",
    445       "relevance": "Strong adaptive attack that breaks DataFilter, establishing the ceiling on defense effectiveness against optimized adversaries"
    446     },
    447     {
    448       "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection",
    449       "relevance": "Foundational work defining indirect prompt injection and motivating the threat landscape for LLM agents"
    450     },
    451     {
    452       "title": "StruQ: Defending Against Prompt Injection with Structured Queries",
    453       "relevance": "Fine-tuning defense using structured query format, key prior work in model-level defenses that DataFilter is positioned against"
    454     },
    455     {
    456       "title": "DataSentinel: A Game-Theoretic Detection of Prompt Injection Attacks",
    457       "relevance": "Detection-based baseline that DataFilter outperforms, demonstrating the detection-vs-filtering design space tradeoff"
    458     },
    459     {
    460       "title": "Defeating Prompt Injections by Design",
    461       "relevance": "System-level defense providing security-by-design guarantees, representing the alternative architectural approach to DataFilter"
    462     },
    463     {
    464       "title": "AlpacaEval: An Automatic Evaluator of Instruction-following Models",
    465       "relevance": "Utility evaluation benchmark used to measure instruction-following quality with and without DataFilter applied"
    466     }
    467   ],
    468   "engagement_factors": {
    469     "practical_relevance": {
    470       "score": 3,
    471       "justification": "DataFilter is released as a plug-and-play defense for any LLM system, directly addressing OWASP #1 LLM threat with marginal overhead and no backend model access required."
    472     },
    473     "surprise_contrarian": {
    474       "score": 2,
    475       "justification": "Challenges the assumed security-utility tradeoff in model-agnostic defenses, showing it is possible to nearly eliminate injections without meaningful utility loss."
    476     },
    477     "fear_safety": {
    478       "score": 3,
    479       "justification": "Directly addresses OWASP #1 LLM threat citing real attacks against Google Bard, Slack AI, Anthropic Claude Computer Use, and OpenAI Operator causing data leakage and malware execution."
    480     },
    481     "drama_conflict": {
    482       "score": 1,
    483       "justification": "Mild security arms race framing with acknowledgment that strong adaptive attacks break the defense, but no major controversy or conflict angle."
    484     },
    485     "demo_ability": {
    486       "score": 3,
    487       "justification": "Model and code are explicitly released for immediate use; practitioners can deploy DataFilter today on any LLM application without accessing backend model weights."
    488     },
    489     "brand_recognition": {
    490       "score": 2,
    491       "justification": "UC Berkeley affiliation, Meta and Google funding, and evaluation on GPT-4o/GPT-5.1 add credibility; David Wagner is a well-known security researcher."
    492     }
    493   },
    494   "hn_data": {
    495     "threads": [
    496       {
    497         "hn_id": "42919597",
    498         "title": "Efficient Reasoning with Hidden Thinking",
    499         "points": 172,
    500         "comments": 43,
    501         "url": "https://news.ycombinator.com/item?id=42919597",
    502         "created_at": "2025-02-03T16:06:48Z"
    503       },
    504       {
    505         "hn_id": "38355249",
    506         "title": "Open Problems in DAOs",
    507         "points": 3,
    508         "comments": 0,
    509         "url": "https://news.ycombinator.com/item?id=38355249",
    510         "created_at": "2023-11-20T21:39:59Z"
    511       },
    512       {
    513         "hn_id": "46311266",
    514         "title": "Tiny-TSM: Efficiently Training a Lightweight SOTA Time Series Foundation Model",
    515         "points": 1,
    516         "comments": 0,
    517         "url": "https://news.ycombinator.com/item?id=46311266",
    518         "created_at": "2025-12-18T11:07:07Z"
    519       },
    520       {
    521         "hn_id": "37939342",
    522         "title": "Can Large Language Models Explain Themselves? A Study",
    523         "points": 1,
    524         "comments": 0,
    525         "url": "https://news.ycombinator.com/item?id=37939342",
    526         "created_at": "2023-10-19T06:41:38Z"
    527       }
    528     ],
    529     "top_points": 172,
    530     "total_points": 177,
    531     "total_comments": 43
    532   }
    533 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs