scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27534B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "DataSentinel: A Game-Theoretic Detection of Prompt Injection Attacks",
      6     "authors": [
      7       "Yupei Liu",
      8       "Yuqi Jia",
      9       "Jinyuan Jia",
     10       "Dawn Song",
     11       "Neil Zhenqiang Gong"
     12     ],
     13     "year": 2025,
     14     "venue": "IEEE Symposium on Security and Privacy",
     15     "arxiv_id": "2504.11358",
     16     "doi": "10.1109/SP61157.2025.00250"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All abstract claims (FPR≈0, FNR≤0.07 for existing attacks, outperforms 6 baselines by large margin) are directly supported by Tables 1–3 and Table 6.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper makes causal claims that minimax fine-tuning improves detection; these are supported by ablation comparing DataSentinel(Minimax) vs DataSentinel(Min) and hyperparameter ablations in Figures 3–5.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The paper explicitly bounds claims to the evaluated setting (7 NLP tasks, 3 open-source LLMs ≤8B parameters) and the limitations section clearly states DataSentinel fails when injected and target tasks are the same type.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not discuss alternative explanations for why minimax optimization improves detection (e.g., whether it is the fine-tuning itself rather than the game-theoretic framing that drives gains).",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "FPR and FNR are precisely defined as detection error rates, and claims about 'effective detection' match these measurements; no proxy conflation is present.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 6 is titled 'Discussion and Limitations' and contains multiple dedicated subsections on specific failure modes.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats are identified: (1) DataSentinel fails when target=injected task (adversarial examples, FNR=0.87 shown empirically), (2) benign instructions in user data may cause false positives, (3) better instruction-following LLMs may weaken the defense.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The conclusion explicitly states detection 'is highly effective... as long as the injected prompts mislead the backend LLM into performing injected tasks that differ from the target task,' clearly bounding scope.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Acknowledgments section states: 'This work was supported by NSF grant No. 2131859, 2125977, 2112562, and 1937787, as well as ARO grant No. W911NF2110182.'",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations (Penn State, Duke University, UC Berkeley) are disclosed on the title page.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "NSF and ARO are US government research funding agencies with no commercial stake in prompt injection detection outcomes.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial disclosures statement (patents, equity, consulting) is present in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "All key terms are precisely defined with formal notation: LLM-integrated application, target task (st, xt, yt), injected task (se, xe, ye), contaminated target data xc, FPR, FNR, and known-answer detection.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Contributions are explicitly itemized: (1) first game-theoretic prompt injection detector, (2) minimax optimization formulation, (3) gradient-based solution and comprehensive evaluation.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 thoroughly situates the work relative to heuristic and optimization-based attacks, prevention vs. detection defenses, and directly compares against known-answer detection as the prior SOTA.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The abstract states 'Our code and data are available at: https://github.com/liu00222/Open-Prompt-Injection' — a live repository, not a promise of future release.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "All 7 datasets used (MRPC, Jfleg, SMS Spam, RTE, SST2, HSOL, Gigaword) are standard public benchmarks, and the paper states code and data are available at GitHub.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper specifies GPU hardware (Quadro RTX 6000) and mentions QLoRA but does not provide requirements.txt, Dockerfile, or equivalent environment specification.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": true,
    142           "justification": "Algorithms 1–3 provide detailed pseudocode, Section 5.1 lists all hyperparameters explicitly, and code is publicly available at GitHub.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results in Tables 1–6 are point estimates (FPR/FNR) with no confidence intervals or error bars reported; results are from single runs with fixed seed.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are used for any comparative claims between DataSentinel and baselines.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Absolute FPR/FNR values are reported for all methods, allowing direct magnitude comparison; e.g., KAD FPR up to 0.10 vs DataSentinel 0.00 is clearly quantified.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The choice of 100 samples per task combination (giving 35,700 contaminated samples total) is stated but not justified via power analysis or any other principled rationale.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance or standard deviation across runs is reported; temperature is fixed at 0.1 with a fixed seed, so only single deterministic runs are presented.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Six baselines are compared: EVD, NLLMD, SSFTD, SSFTD-G, PromptGuard, and Known-Answer Detection (KAD).",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "KAD (Liu et al., USENIX 2024) is the prior state-of-the-art; PromptGuard was released by Meta in 2024; all baselines are contemporaneous.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "DataSentinel(Min) ablates the game-theoretic component; Figures 3–5 ablate hyperparameters r, |D|, α, β, nin, and nout individually.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Both FPR and FNR are reported, along with ASV (attack success value) for adaptive attacks in Table 8.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "The paper evaluates automated detection of injected prompts; human evaluation is not relevant to this system.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Fine-tuning uses samples from the Gigaword training set, while evaluation uses the test set of each NLP benchmark dataset, and the evaluation tasks differ from the fine-tuning task.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Tables 10–16 in the appendix report FPR and FNR for every injected-target task combination (7×7) for each of the 9 attacks.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section 6 reports that DataSentinel achieves FNR=0.87 when target and injected tasks are both sentiment analysis, and explains this is because attacks reduce to adversarial examples.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The adversarial examples failure case (FNR=0.87) is explicitly reported in Table 6 and discussed as a fundamental limitation in Section 6.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Exact model identifiers are given: Mistral-7B [36], LLaMA2-7B [37,38], LLaMA3-8B-Instruct [39], with citations to the corresponding model papers/repos.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "The detection instruction template is provided, but the 7 target and injected task instructions are only referenced as 'consistent with [7]' — Appendix A does not list them.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Section 5.1 explicitly reports all hyperparameters: α=1, β=1, r=3, lrout=0.000025, bin=8, bout=2, nin=10, nout=500, temperature=0.1, fixed random seed.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "DataSentinel is a fine-tuned detection model, not an agentic scaffold; no agentic scaffolding is used or evaluated.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "The sampling procedure is documented: 100 data points from each dataset test set for evaluation, 500 from Gigaword training set for fine-tuning, and contaminated data construction is formally specified.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "The paper states 'Our code and data are available at: https://github.com/liu00222/Open-Prompt-Injection', implying the generated contaminated datasets are released.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Data collection is described precisely: 100 test samples per task drawn from 7 standard benchmarks, 100 injected samples per combination, totaling 35,700 contaminated samples.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants; datasets are standard NLP benchmarks with no recruitment.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The pipeline from clean benchmark data → contaminated target data construction via each attack → evaluation of FPR/FNR is documented in Section 5.1 and Algorithm 1.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": false,
    295           "answer": false,
    296           "justification": "The paper evaluates detection of prompt injection attacks using binary FPR/FNR metrics, not LLM capability benchmarks susceptible to training contamination.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": false,
    301           "answer": false,
    302           "justification": "NA — the evaluation is not about LLM benchmark performance where training data contamination is a concern.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": false,
    307           "answer": false,
    308           "justification": "NA — contaminated target data is generated fresh by the attacks; this is not an LLM capability evaluation.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Section 5.2 reports average query time: 1.6 seconds for Mistral-7B detection LLM, 0.7 seconds for LLaMA3.2-1B alternative, and 15.3 seconds for backend LLM processing.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "Fine-tuning takes ~3 hours on one Quadro RTX 6000 GPU, costing ~$0.90 in cloud GPU rental, explicitly reported in Section 5.2.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "DataSentinel achieves FPR close to 0 across all 7 target tasks and 9 existing prompt injection attacks",
    375       "evidence": "Tables 1–2 show FPR of 0.00–0.01 across all target tasks for all 9 attacks (6 heuristic + 3 optimization-based)",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "DataSentinel achieves FNR at most 0.07 for all existing prompt injection attacks",
    380       "evidence": "Table 1 shows FNR ≤ 0.01 for all heuristic attacks and ≤ 0.07 for NeuralExec across all injected task types",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "DataSentinel significantly outperforms 6 baseline detectors including state-of-the-art KAD",
    385       "evidence": "Table 3 shows baselines have FPR up to 1.00 (PromptGuard) and FNR up to 0.21 (KAD under NeuralExec) vs DataSentinel's near-zero rates",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "The game-theoretic minimax formulation is essential for detecting adaptive attacks",
    390       "evidence": "Table 6 shows DataSentinel(Minimax) FNR ≤ 0.06 vs DataSentinel(Min) FNR up to 0.98 and KAD FNR up to 0.93 under adaptive attacks",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "DataSentinel generalizes across different detection and backend LLMs without retraining for each combination",
    395       "evidence": "Table 4 shows consistently low FPR/FNR across Mistral-7B, LLaMA2-7B, and LLaMA3-8B-Instruct; Table 5 shows cross-backend generalization",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "DataSentinel fails when injected and target tasks are the same type (adversarial examples case)",
    400       "evidence": "Table 6 reports FNR=0.87 for all methods including DataSentinel when both target and injected task are sentiment analysis under optimization-based adaptive attack",
    401       "supported": "strong"
    402     },
    403     {
    404       "claim": "Fine-tuning a smaller 1B detection LLM achieves comparable detection performance with lower latency",
    405       "evidence": "Section 5.2 reports LLaMA3.2-1B achieves FPR=0.00, FNR=0.01 at 0.7s/query vs Mistral-7B's FPR=0.00, FNR≈0.00 at 1.6s/query",
    406       "supported": "moderate"
    407     }
    408   ],
    409   "methodology_tags": [
    410     "benchmark-eval"
    411   ],
    412   "key_findings": "DataSentinel fine-tunes a detection LLM via minimax optimization that simulates an adversarial game between detector and attacker, achieving near-zero FPR and FNR (≤0.07) across 9 existing prompt injection attacks, 7 NLP tasks, and 6 LLMs. The game-theoretic approach substantially outperforms 6 baselines including the prior state-of-the-art known-answer detection, with the advantage most pronounced against adaptive attacks (FNR ≤0.06 vs up to 0.93 for KAD). DataSentinel is computationally practical: fine-tuning requires ~3 GPU-hours (~$0.90) and inference overhead is ~10% of backend LLM latency. The method has a clearly acknowledged failure mode: when target and injected tasks are the same type, attacks reduce to adversarial examples and detection largely fails (FNR=0.87), which the authors identify as open future work.",
    413   "red_flags": [
    414     {
    415       "flag": "No confidence intervals or significance tests",
    416       "detail": "All results are point estimates from single runs with fixed seed and temperature=0.1. With only 100 samples per task combination, small FPR/FNR differences could fall within noise, yet no statistical testing is done."
    417     },
    418     {
    419       "flag": "No evaluation on production LLMs",
    420       "detail": "All experiments use open-source LLMs ≤8B parameters (Mistral-7B, LLaMA2-7B, LLaMA3-8B). Generalization to closed-source production LLMs (GPT-4, Claude, Gemini) as backend LLMs is not addressed."
    421     },
    422     {
    423       "flag": "Sample size unjustified",
    424       "detail": "The choice of 100 samples per task combination (35,700 total contaminated samples) is not justified by power analysis or prior convention."
    425     },
    426     {
    427       "flag": "Task prompts not provided",
    428       "detail": "The 7 target and injected task instructions used in all experiments are referenced as 'consistent with [7]' but not reproduced; Appendix A provides no actual prompt text."
    429     },
    430     {
    431       "flag": "White-box threat model may not reflect real deployments",
    432       "detail": "The strongest baselines and DataSentinel's fine-tuning assume white-box access to detection LLM weights; real attackers targeting closed-source detection APIs would face a harder problem, potentially making the gains over KAD less relevant in practice."
    433     }
    434   ],
    435   "cited_papers": [
    436     {
    437       "title": "Formalizing and benchmarking prompt injection attacks and defenses",
    438       "relevance": "Key prior work (Liu et al., USENIX Security 2024) that introduced known-answer detection — the direct baseline DataSentinel improves upon"
    439     },
    440     {
    441       "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
    442       "relevance": "Seminal work (Greshake et al., AISec 2023) establishing the threat model for indirect prompt injection in deployed LLM applications"
    443     },
    444     {
    445       "title": "Neural exec: Learning (and learning from) execution triggers for prompt injection attacks",
    446       "relevance": "NeuralExec — the strongest optimization-based attack used as the primary evaluation adversary in this paper"
    447     },
    448     {
    449       "title": "Automatic and universal prompt injection attacks against large language models",
    450       "relevance": "Universal attack — another optimization-based attack evaluated, and source of GCG-based adversarial token optimization techniques"
    451     },
    452     {
    453       "title": "Struq: Defending against prompt injection with structured queries",
    454       "relevance": "Prevention-based defense compared to DataSentinel's detection approach; used as backend LLM in robustness experiments"
    455     },
    456     {
    457       "title": "Universal and transferable adversarial attacks on aligned language models",
    458       "relevance": "GCG (Greedy Coordinate Gradient) method used as the core discrete optimization algorithm for generating adaptive injected prompts"
    459     },
    460     {
    461       "title": "SecAlign: Defending against prompt injection with preference optimization",
    462       "relevance": "Another prevention baseline whose fine-tuned LLM is tested as both detection and backend LLM in Section 6"
    463     },
    464     {
    465       "title": "PLeak: Prompt leaking attacks against large language model applications",
    466       "relevance": "Optimization-based attack targeting instruction confidentiality, used as one of 9 evaluated attacks"
    467     }
    468   ],
    469   "engagement_factors": {
    470     "practical_relevance": {
    471       "score": 3,
    472       "justification": "Directly addresses a critical vulnerability in deployed LLM-integrated applications with a practical defense requiring only 3 GPU-hours of fine-tuning and released code."
    473     },
    474     "surprise_contrarian": {
    475       "score": 2,
    476       "justification": "The counterintuitive core insight — that making a detection LLM MORE vulnerable to prompt injection improves detection — is a genuine conceptual surprise."
    477     },
    478     "fear_safety": {
    479       "score": 3,
    480       "justification": "Prompt injection is a top OWASP LLM risk; the paper demonstrates attacks on Bing Copilot-style applications and shows existing defenses fail against adaptive attackers."
    481     },
    482     "drama_conflict": {
    483       "score": 2,
    484       "justification": "The arms race framing (attacker adapts to detector, detector trained against adaptive attacker) is compelling, and the meta-review note about potential future erosion adds tension."
    485     },
    486     "demo_ability": {
    487       "score": 2,
    488       "justification": "Code is available at GitHub and uses open-source LLMs, but requires a GPU and ~3 hours of fine-tuning — not a one-click demo."
    489     },
    490     "brand_recognition": {
    491       "score": 2,
    492       "justification": "Dawn Song (UC Berkeley) is a prominent AI security researcher; the IEEE S&P venue is highly prestigious in security."
    493     }
    494   },
    495   "hn_data": {
    496     "threads": [
    497       {
    498         "hn_id": "40115482",
    499         "title": "Survey Study on AI Agent Architectures (2024)",
    500         "points": 77,
    501         "comments": 16,
    502         "url": "https://news.ycombinator.com/item?id=40115482",
    503         "created_at": "2024-04-22T15:47:47Z"
    504       },
    505       {
    506         "hn_id": "44585492",
    507         "title": "How Many Instruction Can LLMs Follow at Once?",
    508         "points": 11,
    509         "comments": 0,
    510         "url": "https://news.ycombinator.com/item?id=44585492",
    511         "created_at": "2025-07-16T18:38:36Z"
    512       },
    513       {
    514         "hn_id": "23442899",
    515         "title": "Scientists demonstrate particle detector for dark matter",
    516         "points": 6,
    517         "comments": 2,
    518         "url": "https://news.ycombinator.com/item?id=23442899",
    519         "created_at": "2020-06-06T22:33:57Z"
    520       },
    521       {
    522         "hn_id": "45482380",
    523         "title": "Acoustic Eavesdropping via Mouse Sensors",
    524         "points": 4,
    525         "comments": 0,
    526         "url": "https://news.ycombinator.com/item?id=45482380",
    527         "created_at": "2025-10-05T15:40:37Z"
    528       },
    529       {
    530         "hn_id": "35695104",
    531         "title": "Emergent and Predictable Memorization in Large Language Models",
    532         "points": 3,
    533         "comments": 0,
    534         "url": "https://news.ycombinator.com/item?id=35695104",
    535         "created_at": "2023-04-25T00:31:12Z"
    536       },
    537       {
    538         "hn_id": "45461534",
    539         "title": "Comparing Quantum Annealing and BF-DCQO",
    540         "points": 2,
    541         "comments": 0,
    542         "url": "https://news.ycombinator.com/item?id=45461534",
    543         "created_at": "2025-10-03T11:13:53Z"
    544       },
    545       {
    546         "hn_id": "40106947",
    547         "title": "From r to Q∗: Your Language Model is a Q-Function",
    548         "points": 2,
    549         "comments": 0,
    550         "url": "https://news.ycombinator.com/item?id=40106947",
    551         "created_at": "2024-04-21T16:22:09Z"
    552       },
    553       {
    554         "hn_id": "23416215",
    555         "title": "Sensei: Direct-Detection Results on Sub-GeV Dark Matter from a New Skipper-CCD",
    556         "points": 2,
    557         "comments": 0,
    558         "url": "https://news.ycombinator.com/item?id=23416215",
    559         "created_at": "2020-06-04T13:23:14Z"
    560       },
    561       {
    562         "hn_id": "44191952",
    563         "title": "Questioning Representational Optimism in Deep Learning",
    564         "points": 1,
    565         "comments": 3,
    566         "url": "https://news.ycombinator.com/item?id=44191952",
    567         "created_at": "2025-06-05T14:17:23Z"
    568       },
    569       {
    570         "hn_id": "45934130",
    571         "title": "Questioning Representational Optimism in Deep Learning",
    572         "points": 1,
    573         "comments": 1,
    574         "url": "https://news.ycombinator.com/item?id=45934130",
    575         "created_at": "2025-11-15T01:07:24Z"
    576       }
    577     ],
    578     "top_points": 77,
    579     "total_points": 109,
    580     "total_comments": 22
    581   }
    582 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs