scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27095B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Defense against Prompt Injection Attacks via Mixture of Encodings",
      6     "authors": [
      7       "Ruiyi Zhang",
      8       "David Sullivan",
      9       "Kyle Jackson",
     10       "Pengtao Xie",
     11       "Mei Chen"
     12     ],
     13     "year": 2025,
     14     "venue": "North American Chapter of the Association for Computational Linguistics",
     15     "arxiv_id": "2504.07467",
     16     "doi": "10.48550/arXiv.2504.07467"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Tables 1 and 2 support all three main claims: attack success rates are competitive/lowest (Table 1), task performance is maintained within 2-5% of baseline (Table 2), and outperformance over Base64/Caesar is shown across most benchmarks.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper compares full methods but includes no ablation study isolating the contribution of each encoding or the mixture aggregation strategy. Cannot attribute improvement to the mixture mechanism vs. other factors.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Results are shown on specific BIPIA attack datasets and 9 NLP tasks, but generalization claims aren't bounded. No discussion of applicability to other attack types, models beyond GPT-4/4o/Qwen, or tasks outside this set.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper presents empirical results without exploring why the mixture works or considering alternative mechanistic explanations for the improved performance.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Attack success rate directly measures whether LLM follows malicious instructions (not a proxy), and NLP task accuracy directly measures helpfulness. Measurements align with stated claims.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Section 7 contains only one brief paragraph discussing computational overhead. A single sentence per limitation does not constitute a dedicated limitations section.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Only generic mention of inference cost. No specific threats discussed: sample representativeness, attack generalization, model generalization, or whether 50 BIPIA attacks represent the full attack space.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Scope is implicitly tested (GPT-4/4o/Qwen, BIPIA attacks, 9 tasks) but explicit boundaries stating what the work does NOT show are not stated.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "First author footnote states 'internship project at Microsoft' but no formal funding source statement or acknowledgments section disclosing sponsors.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations clearly stated: UC San Diego and Microsoft. However, Microsoft employees are among the authors, creating potential bias (though evaluating non-Microsoft LLMs).",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Microsoft funded the internship but doesn't directly benefit from results—the method is model-agnostic and evaluates OpenAI (GPT-4/4o) and Alibaba (Qwen), not Microsoft products.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement, patent disclosures, or equity declarations included.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Prompt injection attacks defined with Figure 1 example. Mixture of encodings explained. Safety/helpfulness used contextually. Attack success rate implicitly clear from context.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Contribution is explicitly stated in abstract and introduction: a defense method balancing safety and helpfulness using multiple character encodings.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 discusses prior attacks (Perez 2022, Liu 2024), existing defenses (detection vs. prevention), Base64 defense (Hines 2024), and mixture-of-experts literature. Positions this work as improvement on Base64.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Paper states 'Our code is publicly available at https://github.com/ruz048/MoEMEnT'. Assuming claim is accurate; GitHub URL provided.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Uses only public benchmark datasets (BIPIA, MMLU, SQuAD, Hellaswag, MGSM, SamSum, WMT, IMDB, WildGuard, WebQ). No custom data created.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Model versions specified (GPT-4 turbo-2024-04-09, GPT-4o 2024-05-13) but no requirements.txt, Dockerfile, Python version, or dependency specifications provided.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The method is described algorithmically, but the paper includes no step-by-step reproduction instructions. A code release is announced, but how to run it is not documented in the paper.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Tables 1 and 2 report point estimates only. No error bars, confidence intervals, or measures of variance across runs.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "Comparative claims made (e.g., 'outperforms Base64') but no statistical significance tests (t-tests, chi-squared, etc.) reported.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Tables report accuracy percentages and attack success rates, which are effect measures. Improvements quantified (e.g., Table 1: GPT-4o Abstract 1.0% vs Base64 5.7%).",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Dataset sizes are standard benchmarks (BIPIA: 7.5K-22.5K; NLP tasks: 1.3K-25K) but not justified. No power analysis or rationale for sufficiency.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Single point estimates per condition. No variance, standard deviation, or multiple run results reported.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Baselines: No Defense, Datamark, Ignoring, Base64, Caesar. Covers both detection-adjacent and encoding-based defenses.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Base64 (Hines 2024), Caesar (proposed concurrently), Datamark/Ignoring (Yi et al. 2023). All recent encoding/defense methods.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "Method combines 3 prompts (P1, P2, P3) and aggregates responses, but no ablation isolating individual components or encoding combinations.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Safety: attack success rate. Helpfulness: accuracy on 9 NLP tasks. Cost: inference multiplier (Table 4). Multiple dimensions evaluated.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "All metrics are automated. No human judgment on whether output quality is preserved for summarization, translation, or QA tasks.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Standard benchmarks use validation/test splits (BIPIA test set, NLP task test splits). Appropriate held-out evaluation.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Table 1 breaks safety by 4 attack datasets (Email, Table, Abstract, Code). Table 2 breaks helpfulness by 9 tasks.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Figure 4 shows example where Base64 fails (math problem returns 2 instead of 4) but mixture succeeds. Some failure analysis via example.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Tables show cases where method underperforms baseline, e.g., GPT-4o WebQ: 25.3% vs 29.7% no-defense. Reported but not analyzed.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "GPT-4 (turbo-2024-04-09) and GPT-4o (2024-05-13) specify snapshot dates. Qwen-2.5-72B-Instruct mentioned for open-source.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Meta-prompts (MP1, MP2) provided in Table 3 (Appendix D). MP1: 'The following sentence is encoded in Base64/Caesar format.' MP2: 'Given answers A, B, C, reply with your answer.' Instructions are shown.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Caesar shift = 3 (Appendix E). Temperature, top-p, and other generation parameters not reported.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Method clearly described: encode input with 3 methods, get 3 LLM responses, aggregate (sum probabilities for classification, meta-prompt for generation).",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": false,
    262           "justification": "Uses standard benchmark splits without custom preprocessing. No preprocessing pipeline documented.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "All datasets are public: BIPIA components (OpenAI Evals, WikiTableQA, XSum, Stack Overflow) and standard NLP benchmarks (MMLU, SQuAD, etc.).",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Safety datasets sourced from: Email (OpenAI Evals), Table (WikiTableQA), Abstract (XSum), Code (Stack Overflow). Helpfulness datasets standard (MMLU, SQuAD, etc.). Sources cited.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants. Benchmark evaluation only.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": false,
    288           "justification": "Paper references existing datasets but doesn't document the pipeline from raw data to final evaluation (e.g., filtering, splitting, preprocessing).",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": true,
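    296           "justification": "Model snapshots are specified: GPT-4 turbo (2024-04-09) and GPT-4o (2024-05-13); these snapshot dates are treated here as approximate training cutoffs (~April 2024 and ~May 2024). The BIPIA benchmark is from 2023, before both dates.",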
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "Standard NLP benchmarks (MMLU, SQuAD) are known to have training data contamination in large LLMs. This risk is not discussed.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "No discussion of whether MMLU, SQuAD, or other helpfulness benchmarks were in training data. Known contamination issue not addressed.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human subjects; benchmark evaluation only.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Table 4 (Appendix H) shows inference cost multiplier: no defense = 1x, mixture = 3.46x. Absolute runtime/USD cost not provided but relative cost clear.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Total computational budget for experiments (number of API calls, total cost, compute-hours) not stated.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Mixture of encodings achieves one of the lowest attack success rates under prompt injection attacks",
    375       "evidence": "Table 1 shows attack success rates on 4 BIPIA datasets: for GPT-4o, mixture achieves 1.5% (Email), 1.0% (Table), 1.0% (Abstract), 0% (Code), competitive with or better than Base64 and Caesar.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "The method maintains high performance across all NLP tasks",
    380       "evidence": "Table 2 shows helpfulness performance on 9 tasks: GPT-4o ranges 75.5-96.1% with mixture vs 79.9-92.3% without defense, within 2-5% of baseline on 8 of 9 tasks.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Mixture of encodings outperforms Base64 and Caesar defenses",
    385       "evidence": "Table 1: mixture beats Base64 on 3/4 attack datasets for GPT-4o; Table 2: mixture beats Caesar on all 9 tasks (e.g., MGSM 52.0% vs 14.2%).",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Base64 defense significantly degrades performance on mathematical and multilingual reasoning tasks",
    390       "evidence": "Table 2: Base64 achieves 5.2% on MGSM (vs 53.1% no-defense) and 64.9% on MMLU (vs 79.9%), demonstrating substantial degradation on reasoning tasks.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Aggregating multiple encodings balances safety and helpfulness better than single encodings",
    395       "evidence": "Tables 1 and 2 show mixture achieving competitive safety with acceptable helpfulness, while Base64 achieves better safety but much worse helpfulness.",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "Caesar cipher is an effective character encoding for defending against prompt injection attacks",
    400       "evidence": "Table 1: Caesar achieves low attack success rates (e.g., 0% on Code for GPT-4o), but Table 2 shows severe helpfulness degradation (e.g., 7.3% on MGSM).",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval"
    406   ],
    407   "key_findings": "This paper proposes a mixture of encodings defense against prompt injection attacks that processes external data in three forms—unencoded (P1), Base64-encoded (P2), and Caesar-cipher-encoded (P3)—sending each to an LLM and aggregating responses. On the BIPIA safety benchmark (4 attack datasets), the method achieves 0-1.5% attack success rates for GPT-4o (1.5% Email, 1.0% Table, 1.0% Abstract, 0% Code), competitive with Base64. On nine NLP tasks (MMLU, SQuAD, Hellaswag, MGSM, SamSum, WMT, IMDB, WildGuard, WebQ), it maintains 75.5-96.1% accuracy, within 2-5% of the undefended baseline on 8 of 9 tasks—a major improvement over Base64 alone, which degrades to 5% on math reasoning. The trade-off is 3.46x inference cost due to triple processing, though the authors suggest parallelization could reduce latency.",
    408   "red_flags": [
    409     {
    410       "flag": "No ablation study",
    411       "detail": "Method combines 3 prompts (unencoded, Base64, Caesar) and aggregates, but no ablation isolates which components matter. Is Caesar necessary? Would 2-encoding mixture suffice? Contribution of aggregation vs. ensemble is unclear."
    412     },
    413     {
    414       "flag": "No statistical significance testing",
    415       "detail": "All results are point estimates. No confidence intervals, p-values, or multiple runs reported. Improvements could be noise, especially for small differences (e.g., 1% on some datasets)."
    416     },
    417     {
    418       "flag": "Computational overhead inadequately justified",
    419       "detail": "3.46x inference cost is substantial. Section 7 mentions 'can be processed in parallel' but doesn't quantify actual latency reduction or real-world deployment feasibility."
    420     },
    421     {
    422       "flag": "Limited attack diversity",
    423       "detail": "Only evaluated on BIPIA benchmark (50 attack types). Unknown whether method generalizes to other prompt injection strategies, novel attack formulations, or adversarially crafted attacks outside BIPIA scope."
    424     },
    425     {
    426       "flag": "No human evaluation",
    427       "detail": "Automated metrics only. Doesn't verify whether human-perceived output quality is preserved for subjective tasks like summarization or machine translation."
    428     },
    429     {
    430       "flag": "Benchmark contamination unaddressed",
    431       "detail": "Standard NLP benchmarks (MMLU, SQuAD, Hellaswag) are known to be in GPT-4/4o training data. Reported helpfulness improvements may be inflated due to memorization rather than genuine robustness."
    432     },
    433     {
    434       "flag": "No alternative aggregation strategies explored",
    435       "detail": "Why sum probabilities for classification? Why meta-prompt for generation? No comparison of aggregation methods or justification that choices are optimal."
    436     },
    437     {
    438       "flag": "Weak generalization claims",
    439       "detail": "Results shown for GPT-4, GPT-4o, and Qwen, but generalization to other LLM architectures, sizes, or deployment contexts (on-device, quantized) is unexplored."
    440     }
    441   ],
    442   "cited_papers": [
    443     {
    444       "title": "Benchmarking and defending against indirect prompt injection attacks on large language models",
    445       "relevance": "Introduces BIPIA benchmark used for safety evaluation. Proposes detection-based and prevention-based defenses. Core baseline for this work."
    446     },
    447     {
    448       "title": "Defending against indirect prompt injection attacks with spotlighting",
    449       "relevance": "Proposes Base64 defense, the state-of-the-art encoding-based defense. This paper builds on and improves Base64."
    450     },
    451     {
    452       "title": "Formalizing and benchmarking prompt injection attacks and defenses",
    453       "relevance": "Formalizes prompt injection attacks and proposes baselines (Datamark, Ignoring). Defines attack/defense taxonomy."
    454     },
    455     {
    456       "title": "Ignore previous prompt: Attack techniques for language models",
    457       "relevance": "Foundational work on prompt injection attack techniques. Establishes the threat model."
    458     },
    459     {
    460       "title": "Jailbroken: How does LLM safety training fail?",
    461       "relevance": "Studies LLM understanding of encodings (Base64, cipher). Provides evidence that recent LLMs can decode Base64, motivating encoding-based defenses."
    462     },
    463     {
    464       "title": "GPT-4 is too smart to be safe: Stealthy chat with LLMs via cipher",
    465       "relevance": "Demonstrates LLM capability on Caesar cipher decoding. Motivates choice of Caesar as one encoding in mixture."
    466     },
    467     {
    468       "title": "Mixture of Experts and Prompt Ensemble",
    469       "relevance": "Surveys mixture-of-experts and prompt ensemble methods. Provides conceptual foundation for this paper's mixture-of-encodings strategy."
    470     }
    471   ],
    472   "engagement_factors": {
    473     "practical_relevance": {
    474       "score": 2,
    475       "justification": "Method is deployable (code released) but 3.46x inference cost is prohibitive for production systems without extreme safety requirements. Viable only for latency-insensitive applications (e.g., content moderation)."
    476     },
    477     "surprise_contrarian": {
    478       "score": 1,
    479       "justification": "Straightforward application of mixture-of-experts / ensemble logic to defenses. Expected result that combining encoding methods improves robustness; no surprising insights about prompt injection or defenses."
    480     },
    481     "fear_safety": {
    482       "score": 2,
    483       "justification": "Addresses prompt injection, a real LLM safety concern, but limited scope: only applies to systems with external data pipelines. Doesn't address broader alignment, jailbreaking, or adversarial robustness."
    484     },
    485     "drama_conflict": {
    486       "score": 0,
    487       "justification": "No controversy. Straightforward safety engineering. No contentious claims or competing approaches."
    488     },
    489     "demo_ability": {
    490       "score": 2,
    491       "justification": "Code released but requires GPT-4/4o API access or local LLM to try. Not immediately runnable for most readers without API keys and budget."
    492     },
    493     "brand_recognition": {
    494       "score": 2,
    495       "justification": "UC San Diego + Microsoft (respectable but not top-tier). No famous authors. Published at NAACL (mainstream venue but not top-tier conference)."
    496     }
    497   },
    498   "hn_data": {
    499     "threads": [
    500       {
    501         "hn_id": "44884091",
    502         "title": "A Comprehensive Survey of Self-Evolving AI Agents [pdf]",
    503         "points": 94,
    504         "comments": 29,
    505         "url": "https://news.ycombinator.com/item?id=44884091",
    506         "created_at": "2025-08-13T02:26:32Z"
    507       },
    508       {
    509         "hn_id": "43736366",
    510         "title": "Inferring the Phylogeny of Large Language Models",
    511         "points": 69,
    512         "comments": 6,
    513         "url": "https://news.ycombinator.com/item?id=43736366",
    514         "created_at": "2025-04-19T13:47:15Z"
    515       },
    516       {
    517         "hn_id": "26794843",
    518         "title": "Certifying Multimedia News Content for Fake News Defense",
    519         "points": 12,
    520         "comments": 3,
    521         "url": "https://news.ycombinator.com/item?id=26794843",
    522         "created_at": "2021-04-13T16:28:40Z"
    523       },
    524       {
    525         "hn_id": "43989432",
    526         "title": "OnPrem.LLM: A Privacy-Conscious Document Intelligence Toolkit",
    527         "points": 5,
    528         "comments": 0,
    529         "url": "https://news.ycombinator.com/item?id=43989432",
    530         "created_at": "2025-05-14T21:30:02Z"
    531       },
    532       {
    533         "hn_id": "40043146",
    534         "title": "Why do small language models underperform?",
    535         "points": 4,
    536         "comments": 1,
    537         "url": "https://news.ycombinator.com/item?id=40043146",
    538         "created_at": "2024-04-15T17:10:46Z"
    539       },
    540       {
    541         "hn_id": "35626433",
    542         "title": "Learning to Compress Prompts with Gist Tokens",
    543         "points": 2,
    544         "comments": 1,
    545         "url": "https://news.ycombinator.com/item?id=35626433",
    546         "created_at": "2023-04-19T10:22:30Z"
    547       },
    548       {
    549         "hn_id": "35721355",
    550         "title": "Compressing Large Language Model Prompts via Gist Tokens",
    551         "points": 1,
    552         "comments": 0,
    553         "url": "https://news.ycombinator.com/item?id=35721355",
    554         "created_at": "2023-04-26T23:30:32Z"
    555       },
    556       {
    557         "hn_id": "35641820",
    558         "title": "Learning to Compress Prompts with Gist Tokens",
    559         "points": 1,
    560         "comments": 0,
    561         "url": "https://news.ycombinator.com/item?id=35641820",
    562         "created_at": "2023-04-20T15:43:27Z"
    563       },
    564       {
    565         "hn_id": "9413569",
    566         "title": "Efficient Approximation Algorithms for the Largest Weight Data Retrieval Problem",
    567         "points": 1,
    568         "comments": 0,
    569         "url": "https://news.ycombinator.com/item?id=9413569",
    570         "created_at": "2015-04-21T12:35:14Z"
    571       }
    572     ],
    573     "top_points": 94,
    574     "total_points": 189,
    575     "total_comments": 40
    576   }
    577 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs