scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (29397B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Decoding Latent Attack Surfaces in LLMs: Prompt Injection via HTML in Web Summarization",
      6     "authors": [
      7       "Ishaan Verma",
      8       "Arsheya Yadav"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv.org",
     12     "arxiv_id": "2509.05831",
     13     "doi": "10.48550/arXiv.2509.05831"
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "Abstract claims of 'significant proportion of injected pages led to measurable semantic and stylistic shifts' are supported by ROUGE-L (0.301–0.327) and SBERT metrics (0.694–0.698), plus 15.71–29.29% injection success rates across both models.",
     21         "source": "haiku"
     22       },
     23       "causal_claims_justified": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Paper makes causal claims ('injections manipulate outputs'). Study design (282 pages, half clean/half injected, controlled comparison) is appropriate for demonstrating causal effect in security evaluation, though not a randomized controlled trial.",
     27         "source": "haiku"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Scope bounded to web summarization with HTML injections on 2 models and 8 attack vectors. Title and abstract appropriately scoped; conclusions limited to 'web-based LLM pipelines' without overgeneralizing to all LLM tasks.",
     33         "source": "haiku"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Paper demonstrates injections work (pirate example) but does not discuss alternative explanations: Is the model following explicit instructions, or simply sensitive to certain text patterns? Why Llama is more vulnerable than Gemma (architectural differences?) is noted but not explained.",
     39         "source": "haiku"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Measured outcomes (ROUGE-L divergence, SBERT similarity, manual annotation) are reasonable proxies for the claimed outcome (injection-induced output manipulation). The distinction between what is measured and what is claimed is clear.",
     45         "source": "haiku"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No dedicated 'Limitations' or 'Threats to Validity' section in the paper. A 'Future Work' section exists but does not address methodological constraints of the current study.",
     53         "source": "haiku"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "No specific threats discussed. Gaps not mentioned: manual annotation reliability (no inter-rater agreement reported), sample size justification (282 pages, 8 techniques, 2 models), generalization to other tasks/domains, or limitation of synthetic vs. real web content.",
     59         "source": "haiku"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "Paper does not explicitly state what it does NOT show. No discussion of: generalization to other LLM architectures, applicability to fine-tuned models, relevance to other tasks beyond summarization, or limits of the 8 injection techniques tested.",
     65         "source": "haiku"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No funding source mentioned anywhere in paper. No acknowledgments section or statement of funding/lack thereof. Funding disclosure missing.",
     73         "source": "haiku"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Author affiliations disclosed: both from Manipal University Jaipur, Department of Computer Science and Department of Data Science. No obvious financial conflict with Meta (Llama) or Google (Gemma).",
     79         "source": "haiku"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "No funder identified; cannot assess independence.",
     85         "source": "haiku"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests statement, no mention of patents, equity stakes, consulting arrangements, or financial relationships. Standard COI declaration missing.",
     91         "source": "haiku"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Key terms defined: 'Prompt injection' as 'specially crafted inputs designed to manipulate LLM behavior,' 'HTML-based prompt injection' with concrete examples (aria-labels, meta tags, alt-text). Terms sufficiently precise for context.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Core objective explicitly stated: 'empirically assess the susceptibility of state-of-the-art LLMs to prompt injection attacks delivered through web content.' Contribution is systematic evaluation on 282 pages with 8 techniques, addressing gap in HTML-based attack research.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Related work section engages with prior prompt injection research (Liu et al., OWASP), HTML adversarial attacks (Tao et al.), and LLM robustness evaluation (Yang et al.). Connection to this work's novelty (HTML-based attacks in web summarization) is established, though somewhat list-based rather than deeply integrated.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "empirical": {
    117       "artifacts": {
    118         "code_released": {
    119           "applies": true,
    120           "answer": true,
    121           "justification": "Code released on GitHub: evaluation.py, file_generation.py, and supporting scripts listed in appendix. Repository URL: https://github.com/ishaanv1206/Decoding-Latent-Attack-Surfaces-in-LLMs-Prompt-Injection-via-HTML-in-Web-Summarization",
    122           "source": "haiku"
    123         },
    124         "data_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "Dataset of 282 HTML pages (clean/ and injected/ directories) publicly available on GitHub Pages. Metadata in CSV format (metadata.csv, gemma.csv, llama.csv) included.",
    128           "source": "haiku"
    129         },
    130         "environment_specified": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "Specific tools named (Playwright, all-MiniLM-L6-v2 SentenceTransformer) and model versions identified with references. However, no requirements.txt, environment.yml, Python version, or comprehensive dependency list provided.",
    134           "source": "haiku"
    135         },
    136         "reproduction_instructions": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "Repository contents listed but no step-by-step reproduction instructions provided. Scripts mentioned (evaluation.py, file_generation.py) but not accompanied by explicit 'run these commands in this order' guidance.",
    140           "source": "haiku"
    141         }
    142       },
    143       "statistical_methodology": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "Only average metrics reported (ROUGE-L 0.301–0.327, SBERT 0.694–0.698, success rates 15.71–29.29%). No confidence intervals, significance tests, standard deviations, effect size metrics, or sample size justification provided.",
    147         "source": "haiku"
    148       },
    149       "confidence_intervals_or_error_bars": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Table 1 provides only means: average ROUGE-L and SBERT values with no confidence intervals, standard errors, or error bars. Success rates (29.29%, 15.71%) reported as point estimates.",
    153         "source": "haiku"
    154       },
    155       "significance_tests": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "Models compared (Llama 29.29% vs Gemma 15.71% success) but no statistical significance tests (t-test, chi-square, etc.) reported. Differences could be due to chance or dataset-specific factors.",
    159         "source": "haiku"
    160       },
    161       "effect_sizes_reported": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "Raw differences stated (e.g., 13.58 percentage point gap in success rates, ROUGE-L difference of 0.0259) but no formal effect size metrics (Cohen's d, odds ratio, etc.) reported.",
    165         "source": "haiku"
    166       },
    167       "sample_size_justified": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "282 pages chosen with no stated justification. No power analysis, minimum sample size calculation, or discussion of adequacy for the claims made.",
    171         "source": "haiku"
    172       },
    173       "variance_reported": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Only averages reported across all metrics. No standard deviations, ranges, quartiles, or per-page variance for ROUGE-L or SBERT scores.",
    177         "source": "haiku"
    178       },
    179       "evaluation_design": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Design includes baselines (clean pages), multiple metrics (ROUGE-L, SBERT, manual), 8 injection technique variants (ablation-like), and human evaluation of injection success. Missing: per-category breakdown and systematic reporting of failure cases.",
    183         "source": "haiku"
    184       },
    185       "baselines_included": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Clean version of each page serves as implicit baseline. Comparison of clean vs. injected summaries is the core evaluation design.",
    189         "source": "haiku"
    190       },
    191       "baselines_contemporary": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Baseline is the same page without injection—within-subject comparison, appropriate for security evaluation. No comparison to external baselines required.",
    195         "source": "haiku"
    196       },
    197       "ablation_study": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "8 distinct injection techniques tested (meta tag, comment, hidden div, base64-encoded attribute, ARIA label, opacity div, hidden script, alt text). Provides systematic comparison of attack vector effectiveness.",
    201         "source": "haiku"
    202       },
    203       "multiple_metrics": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Three evaluation approaches: ROUGE-L (lexical), SBERT cosine similarity (semantic), manual annotation (behavioral). Provides multifaceted assessment of injection impact.",
    207         "source": "haiku"
    208       },
    209       "human_evaluation": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Manual annotation of injection success: 'reviewing the LLM's summary output for evidence that the injected prompt had influenced or manipulated the model's response.' Humans evaluated system outputs for injection success.",
    213         "source": "haiku"
    214       },
    215       "held_out_test_set": {
    216         "applies": false,
    217         "answer": false,
    218         "justification": "Not a prediction task with train/test split. Security evaluation on synthetic pages with no train/test distinction.",
    219         "source": "haiku"
    220       },
    221       "per_category_breakdown": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "10 content categories tested (blogs, FAQs, news articles, docs, product listings, profiles, privacy policies, tutorials, reviews, careers) but results not broken down by category. No per-category success rates or per-category metric comparisons.",
    225         "source": "haiku"
    226       },
    227       "failure_cases_discussed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "Successful injections reported but no detailed analysis of failure cases. Hidden script tag had 0% success on Gemma and 2/140 (1.4%) on Llama, noted briefly but not analyzed.",
    231         "source": "haiku"
    232       },
    233       "negative_results_reported": {
    234         "applies": true,
    235         "answer": true,
    236         "justification": "Gemma's lower vulnerability (15.71% vs 29.29%) presented as finding. Hidden script tag failures noted. Some negative results included, though not systematically documented.",
    237         "source": "haiku"
    238       },
    239       "setup_transparency": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "Model versions identified with references (Llama 4 Scout-17B-16E, Gemma 2 9B IT), but actual summarization prompt text not provided—only high-level description. Hyperparameters (temperature, top_p, etc.) not specified. Data extraction method outlined but implementation not documented.",
    243         "source": "haiku"
    244       },
    245       "model_versions_specified": {
    246         "applies": true,
    247         "answer": true,
    248         "justification": "Specific model versions identified: Llama 4 Scout (ref. [19] shows '17B-16E'), Gemma 9B IT (ref. [18] shows 'Gemma-2-9b-IT'). Versions traceable via Hugging Face references.",
    249         "source": "haiku"
    250       },
    251       "prompts_provided": {
    252         "applies": true,
    253         "answer": false,
    254         "justification": "Only high-level description given: 'standardized prompt instructing the LLM to generate a one-paragraph summary.' The exact prompt wording is not provided, nor are the accompanying generation settings (temperature, stop tokens).",
    255         "source": "haiku"
    256       },
    257       "hyperparameters_reported": {
    258         "applies": true,
    259         "answer": false,
    260         "justification": "No temperature, top_p, max_tokens, frequency_penalty, presence_penalty, or other sampling hyperparameters specified. Critical for reproducibility.",
    261         "source": "haiku"
    262       },
    263       "scaffolding_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No agentic scaffolding (no tool use, no multi-step reasoning, no chain-of-thought). Direct LLM summarization with no intermediate steps.",
    267         "source": "haiku"
    268       },
    269       "data_preprocessing_documented": {
    270         "applies": true,
    271         "answer": false,
    272         "justification": "Playwright extraction method outlined ('Full HTML Source' and 'Rendered Visible Text') but implementation details missing: tokenization strategy, whitespace handling, text cleaning steps not documented.",
    273         "source": "haiku"
    274       },
    275       "data_integrity": {
    276         "applies": true,
    277         "answer": true,
    278         "justification": "Raw HTML pages available on GitHub for inspection. Data generation process documented (282 pages across 10 categories, 8 injection techniques). Full pipeline described from page generation through metric computation.",
    279         "source": "haiku"
    280       },
    281       "raw_data_available": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Raw HTML files (clean/ and injected/ directories) publicly available on GitHub. CSV metadata (metadata.csv, gemma.csv, llama.csv) and evaluation scripts provided for independent verification.",
    285         "source": "haiku"
    286       },
    287       "data_collection_described": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "Data generation process described: synthetic HTML pages across 10 realistic content categories, styled with 'authentic CSS to enhance realism.' 8 distinct injection techniques systematically applied.",
    291         "source": "haiku"
    292       },
    293       "recruitment_methods_described": {
    294         "applies": false,
    295         "answer": false,
    296         "justification": "No human participants or recruitment. Manual annotation of injection success is a post-hoc assessment, not participant recruitment.",
    297         "source": "haiku"
    298       },
    299       "data_pipeline_documented": {
    300         "applies": true,
    301         "answer": true,
    302         "justification": "Full pipeline documented: HTML generation → injection → hosting on GitHub Pages → Playwright extraction → LLM summarization → metric computation. Described as 'fully automated using Python scripts.' High-level logic clear; implementation details in code.",
    303         "source": "haiku"
    304       },
    305       "contamination": {
    306         "applies": false,
    307         "answer": false,
    308         "justification": "Custom synthetic dataset, not standard benchmarks. Train/test contamination risk not applicable. However, no discussion of whether injection techniques or attack strategies might exist in training data.",
    309         "source": "haiku"
    310       },
    311       "training_cutoff_stated": {
    312         "applies": false,
    313         "answer": false,
    314         "justification": "Synthetic custom data unlikely to be in training set. Not evaluating on published benchmarks; standard contamination risk analysis not applicable.",
    315         "source": "haiku"
    316       },
    317       "train_test_overlap_discussed": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "Custom synthetic dataset; standard train/test overlap risks not applicable.",
    321         "source": "haiku"
    322       },
    323       "benchmark_contamination_addressed": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "Not using published benchmarks; contamination risk not applicable.",
    327         "source": "haiku"
    328       },
    329       "human_studies": {
    330         "applies": false,
    331         "answer": false,
    332         "justification": "No human subjects beyond researchers. Manual annotation is post-hoc assessment of system outputs, not a human subject study requiring ethics approval or participant consent.",
    333         "source": "haiku"
    334       },
    335       "cost_and_practicality": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "No inference cost, API call costs, latency, or total computational budget reported. Methodology described as 'scalable' but no actual resource consumption quantified.",
    339         "source": "haiku"
    340       },
    341       "inference_cost_reported": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "No mention of inference cost, API pricing, latency per page, or total cost to run experiments on 282 pages × 2 models.",
    345         "source": "haiku"
    346       },
    347       "compute_budget_stated": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "No total computational budget, GPU hours, token usage, or cost metrics provided. Practical resource requirements for reproduction not stated.",
    351         "source": "haiku"
    352       }
    353     }
    354   },
    355   "claims": [
    356     {
    357       "claim": "HTML-based prompt injections can successfully manipulate LLM summarization outputs",
    358       "evidence": "29.29% of injections successful for Llama 4 Scout, 15.71% for Gemma 9B IT across 282 test pages. Pirate example shows injection instruction directly influences summary tone.",
    359       "supported": "strong"
    360     },
    361     {
    362       "claim": "Meta tags and opacity-zero divs are the most effective HTML injection vectors",
    363       "evidence": "Meta tag injections: 17/140 successful for Llama, 6/140 for Gemma. Opacity div: 10/140 and 9/140 respectively. Both techniques outperform hidden script (2/140 and 0/140).",
    364       "supported": "strong"
    365     },
    366     {
    367       "claim": "Llama 4 Scout is significantly more vulnerable to HTML-based prompt injections than Gemma 9B IT",
    368       "evidence": "Llama success rate 29.29% vs Gemma 15.71% (13.58pp gap). Meta tag vulnerabilities particularly divergent: 17 vs 6 successes.",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "Successful injections produce substantial lexical and semantic divergence in summaries",
    373       "evidence": "ROUGE-L scores 0.301–0.327 indicate moderate lexical divergence. SBERT cosine similarity 0.694–0.698 shows semantic shifts. Pirate example demonstrates stylistic transformation.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "Conventional input sanitization methods are insufficient to mitigate HTML-based prompt injections",
    378       "evidence": "Injections succeed using standard HTML techniques (aria-labels, meta tags, comments), but the paper makes no explicit comparison against sanitization approaches; the claim is only implicit from the results.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "The evaluation is reproducible and scalable",
    383       "evidence": "Code, data, and evaluation scripts released on GitHub. Described as 'fully automated using Python scripts.' However, no step-by-step reproduction instructions provided.",
    384       "supported": "moderate"
    385     }
    386   ],
    387   "methodology_tags": [
    388     "benchmark-eval"
    389   ],
    390   "key_findings": "This study demonstrates that HTML-based prompt injections represent a significant vulnerability in LLM-powered web summarization systems. Meta tags and opacity-zero divs proved most effective, with Llama 4 Scout showing 29.29% injection success compared to Gemma 9B IT's 15.71%. Successful injections induced substantial lexical and semantic shifts measured by ROUGE-L and SBERT metrics, with the pirate example illustrating how invisible HTML instructions can fundamentally alter model behavior. The finding that architecture significantly influences vulnerability suggests no single defense mechanism provides blanket protection against such attacks.",
    391   "red_flags": [
    392     {
    393       "flag": "No statistical significance testing",
    394       "detail": "Success rates compared (29.29% vs 15.71%) without confidence intervals or statistical hypothesis tests. Differences could reflect sampling variation or dataset composition rather than true architectural vulnerability."
    395     },
    396     {
    397       "flag": "Manual annotation lacks inter-rater reliability",
    398       "detail": "Injection success determined manually without reporting inter-rater agreement, number of annotators, or disagreement resolution procedure. Single annotator bias possible."
    399     },
    400     {
    401       "flag": "Limited model scope",
    402       "detail": "Only 2 models tested (Llama, Gemma). Generalization to GPT, Claude, other architectures unknown. Findings may be model-specific."
    403     },
    404     {
    405       "flag": "No per-category analysis",
    406       "detail": "Results not broken down by content type (blogs, FAQs, docs, etc.), so differential vulnerability patterns across content categories unknown."
    407     },
    408     {
    409       "flag": "Missing hyperparameters",
    410       "detail": "Temperature, top_p, max_tokens, and other sampling parameters not specified. Small parameter changes can drastically affect injection susceptibility, limiting reproducibility."
    411     },
    412     {
    413       "flag": "No defense mechanism evaluation",
    414       "detail": "Paper identifies vulnerabilities but provides no empirical testing of proposed defenses (input sanitization, prompt engineering, etc.). Only future work mentioned."
    415     },
    416     {
    417       "flag": "Sample size unjustified",
    418       "detail": "282 pages with no power analysis, minimum sample size justification, or discussion of adequacy for the claims. Relatively small for broad vulnerability assessment."
    419     },
    420     {
    421       "flag": "Actual summarization prompt not provided",
    422       "detail": "Only high-level description given; exact prompt text, temperature, and stop tokens not disclosed, preventing exact reproduction."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "Automatic and Universal Prompt Injection Attacks against Large Language Models",
    428       "authors": "Liu et al.",
    429       "year": 2024,
    430       "relevance": "Foundational prompt injection attack framework demonstrating vulnerabilities in defense-equipped LLMs"
    431     },
    432     {
    433       "title": "Prompt Injection attack against LLM-integrated Applications",
    434       "authors": "Liu et al.",
    435       "year": 2023,
    436       "relevance": "Goal-guided generative methods for amplifying divergence between clean and adversarial outputs"
    437     },
    438     {
    439       "title": "LLM01:2025 Prompt Injection - OWASP Gen AI Security Project",
    440       "relevance": "Industry security framework cataloguing real-world prompt injection attack scenarios and threat models"
    441     },
    442     {
    443       "title": "Adversarial Examples in Cybersecurity: A Survey",
    444       "authors": "Li, S.",
    445       "year": 2020,
    446       "relevance": "Adversarial attack techniques from cybersecurity domain applicable to HTML-based manipulations"
    447     },
    448     {
    449       "title": "Raze to the Ground: Query-Efficient Adversarial HTML Attacks on Machine-Learning Phishing Webpage Detectors",
    450       "authors": "Tao et al.",
    451       "year": 2023,
    452       "relevance": "Empirical study demonstrating HTML-based adversarial manipulations can evade conventional sanitization and bias ML-based content detection"
    453     },
    454     {
    455       "title": "Prompt Injection Attacks on Large Language Models in Realistic Settings",
    456       "authors": "Clusmann et al.",
    457       "year": 2024,
    458       "relevance": "Real-world examples of prompt injection vulnerabilities in deployed LLM systems (e.g., Bing Chat system prompt leakage)"
    459     },
    460     {
    461       "title": "Evaluating and Improving Robustness in Large Language Models: A Survey",
    462       "authors": "Yang et al.",
    463       "year": 2024,
    464       "relevance": "Survey of LLM robustness evaluation methodologies and metrics for adversarial assessment"
    465     },
    466     {
    467       "title": "Retrieval-Augmented In-Context Learning Attacks and Defenses",
    468       "authors": "Yu et al.",
    469       "year": 2024,
    470       "relevance": "Vulnerabilities of RAG systems to embedded adversarial prompts in retrieved context"
    471     }
    472   ],
    473   "engagement_factors": {
    474     "practical_relevance": {
    475       "score": 2,
    476       "justification": "Web summarization deployed in real systems (search, content aggregators), but attack requires raw HTML access—moderate applicability to web-integrated LLM pipelines"
    477     },
    478     "surprise_contrarian": {
    479       "score": 1,
    480       "justification": "HTML-based injection is natural once considered; novelty lies in systematic evaluation rather than discovering the vulnerability class itself"
    481     },
    482     "fear_safety": {
    483       "score": 2,
    484       "justification": "Raises legitimate concerns about untrusted web content and difficulty of HTML sanitization in production LLM systems"
    485     },
    486     "drama_conflict": {
    487       "score": 1,
    488       "justification": "Technical security evaluation with no particular controversy, drama, or conflict narrative angle"
    489     },
    490     "demo_ability": {
    491       "score": 3,
    492       "justification": "GitHub repository with HTML pages and evaluation code immediately reproducible; easy to test on other models or injection vectors"
    493     },
    494     "brand_recognition": {
    495       "score": 0,
    496       "justification": "Authors from Manipal University Jaipur (non-prominent AI institution); no affiliation with major tech companies or well-known research labs"
    497     }
    498   },
    499   "hn_data": {
    500     "threads": [
    501       {
    502         "hn_id": "44171652",
    503         "title": "Oh fuck! How do people feel about robots that leverage profanity?",
    504         "points": 18,
    505         "comments": 50,
    506         "url": "https://news.ycombinator.com/item?id=44171652"
    507       },
    508       {
    509         "hn_id": "41597663",
    510         "title": "Breaking ReCAPTCHAv2",
    511         "points": 5,
    512         "comments": 0,
    513         "url": "https://news.ycombinator.com/item?id=41597663"
    514       },
    515       {
    516         "hn_id": "44211549",
    517         "title": "Oracular Programming: A Modular Foundation for Building LLM-Enabled Software",
    518         "points": 4,
    519         "comments": 1,
    520         "url": "https://news.ycombinator.com/item?id=44211549"
    521       },
    522       {
    523         "hn_id": "41571318",
    524         "title": "Breaking ReCAPTCHAv2",
    525         "points": 3,
    526         "comments": 2,
    527         "url": "https://news.ycombinator.com/item?id=41571318"
    528       },
    529       {
    530         "hn_id": "42708072",
    531         "title": "MiniMax-01: Scaling Foundation Models with Lightning Attention",
    532         "points": 3,
    533         "comments": 1,
    534         "url": "https://news.ycombinator.com/item?id=42708072"
    535       },
    536       {
    537         "hn_id": "42680545",
    538         "title": "Mlkaps: Machine Learning and Adaptive Sampling for HPC Kernel Auto-Tuning",
    539         "points": 3,
    540         "comments": 0,
    541         "url": "https://news.ycombinator.com/item?id=42680545"
    542       },
    543       {
    544         "hn_id": "41604215",
    545         "title": "Radio Technosignature Search of Trappist-1 with the Allen Telescope Array",
    546         "points": 3,
    547         "comments": 0,
    548         "url": "https://news.ycombinator.com/item?id=41604215"
    549       },
    550       {
    551         "hn_id": "37569675",
    552         "title": "RL for Supply Chain Attacks Against Frequency and Voltage Control",
    553         "points": 3,
    554         "comments": 0,
    555         "url": "https://news.ycombinator.com/item?id=37569675"
    556       },
    557       {
    558         "hn_id": "45274922",
    559         "title": "Candidates evoke identity and issues on TikTok",
    560         "points": 2,
    561         "comments": 0,
    562         "url": "https://news.ycombinator.com/item?id=45274922"
    563       },
    564       {
    565         "hn_id": "44847789",
    566         "title": "SortBench: Benchmarking LLMs based on their ability to sort lists",
    567         "points": 2,
    568         "comments": 1,
    569         "url": "https://news.ycombinator.com/item?id=44847789"
    570       }
    571     ],
    572     "top_points": 18,
    573     "total_points": 46,
    574     "total_comments": 55
    575   }
    576 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs