scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27154B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "EVA: Red-Teaming GUI Agents via Evolving Indirect Prompt Injection",
      6     "authors": [
      7       "Yijie Lu",
      8       "Tianjie Ju",
      9       "Manman Zhao",
     10       "Xinbei Ma",
     11       "Yuan Guo"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2505.14289",
     16     "doi": "10.48550/arXiv.2505.14289"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Claims of substantially higher ASR, better transferability, and goal-agnostic effectiveness are all backed by Tables 2, 4, and 7 respectively with specific numerical evidence.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "EVA uses up to 10 iterative feedback rounds while the static baseline generates 50 independent one-shot samples; the computational budget disparity is not controlled, so the improvement cannot be attributed solely to the adaptive mechanism.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Claims about 'shared behavioral biases in GUI agents' and 'common vulnerabilities in multimodal decision-making' generalize beyond the 6 specific agents and 4 synthetic scenarios tested.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not discuss that EVA's gains may partly reflect greater inference compute (iterative rounds vs. one-shot) rather than the feedback-driven evolution mechanism specifically.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "ASR is formally defined as the fraction of trials where the agent interacts with the injected element, and claims stay at the level of behavioral manipulation rather than making broader downstream claims.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Appendix A contains a dedicated Limitations section discussing EVA's reliance on surface behavioral feedback, synthetic environment constraints, and inability to explain why injections succeed.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Limitations identify specific constraints: black-box operation without access to internal grounding mechanisms, synthetic environments that ignore real-world co-evolution, and inability to model fine-grained multimodal interplay.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state what the results do NOT show; claims about shared behavioral biases are not bounded to the 6 tested agents in 4 specific scenarios.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment section or grant information appears anywhere in the paper.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations with Wuhan University and Shanghai Jiao Tong University are clearly stated on the first page.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding source is disclosed, so funder independence cannot be assessed.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is present anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms including 'indirect prompt injection,' 'environmental injection attack,' 'attack success rate,' and the formal threat model objective function (Equation 3) are explicitly defined.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three contributions are explicitly enumerated: (i) the EVA framework, (ii) a reproducible evaluation pipeline, and (iii) the first large-scale study of adaptive injections across six GUI agents.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 actively contrasts EVA with prior static approaches (Ma et al., Zhang et al., WASP, AdvWeb) and explains specifically how EVA differs by using feedback-driven evolution versus one-shot generation.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "The paper claims to 'build and release a reproducible evaluation pipeline' as contribution (ii), but no URL, repository link, or access instructions are provided anywhere in the paper.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "The four injection scenarios are custom-built HTML environments; no scenario files, injection datasets, or agent interaction logs are released or linked.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Table 6 lists hyperparameters but no requirements file, Dockerfile, or software environment specification (Python version, package versions, OS) is provided.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Prompt templates are detailed in Appendix B, but without code, scenario data, or environment setup instructions, the pipeline cannot be reproduced from the paper alone.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results in Tables 2–4 and 7 are reported as raw percentages with no confidence intervals, standard errors, or error bars.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied to any comparative claims about EVA vs. baseline performance across models and scenarios.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Delta values (e.g., +32%, +26%) showing improvement over the static baseline are reported alongside absolute ASR values in Tables 2 and 4.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "50 samples per agent per scenario are used but no power analysis or justification for this sample size is provided.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or spread across runs is reported; only point-estimate percentages are presented throughout.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "A static one-shot LLM-generated baseline using GLM-4v-Plus with fixed temperature=0.7 is included and compared against EVA across all scenarios.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "Only a single static baseline type is used; contemporary adaptive attack methods (e.g., Zhan et al. 2025 adaptive attacks) are cited but not included as baselines.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Table 7 presents a goal-prompt ablation comparing w/ Goal vs. w/o Goal injection variants across all six models in the pop-up scenario.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Results are decomposed into three outcome categories (success, failure, invalid) across scenarios and models, and persuasion strategy distributions are additionally analyzed.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "The paper evaluates AI agent behavior under adversarial conditions; human evaluation of system outputs is not relevant to this attack framework.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": false,
    211           "answer": false,
    212           "justification": "This is an adversarial attack evaluation, not a prediction task requiring held-out test sets.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down per scenario (pop-up, chat link, chat payment, email) and per model (six agents) in Tables 2–4, and per persuasion strategy in Table 5 and Figures 7–9.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section 4.2 specifically analyzes payment and email scenarios where all agents score 0% ASR and discusses why high-risk contexts are more resistant to injection.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Table 3 shows zero success rates for payment and email scenarios across all agents, and Table 4 includes negative delta values for some cross-agent transfer configurations (e.g., -2%, -4%).",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Some models are identified with version strings (UI-TARS-7B-DPO, OS-Atlas-base, Qwen2.5-VL-32B) but GPT-4V and GPT-4o are referenced only by marketing names without API snapshot dates.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Appendix B provides six complete prompt templates including popup generation, reject button rewriting, action summarization, action evaluation, and attack classification prompts with full examples.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Table 6 explicitly reports all key hyperparameters: temperature=0.7, top_p=1.0, top_k=32, max_tokens=512, max_iter_steps=10, num_evals=10, success_threshold=7.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Section 3 fully describes the EVA optimization loop including keyword lexicon initialization, injection construction with weighted sampling, feedback update rules (Equation 5), lexicon evolution, and termination criteria.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "The injection construction pipeline from template selection and keyword sampling through HTML rendering and agent interaction capture is described in Section 3.3 with supporting figures.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "No raw evaluation data (agent responses, interaction logs, generated injection HTML files) is released or made available for independent verification.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 4.1 describes the data collection procedure: 50 samples per agent per scenario, repeated evaluation rounds, and three-category outcome labeling with examples in Table 1.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants; standard GUI agents are used as test subjects.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The full pipeline from scenario construction through agent interaction, outcome classification via LLM judge, and keyword weight updates is documented in Section 3 and Appendix B.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No training data cutoffs are stated for any of the six evaluated GUI agents despite evaluating their behavioral susceptibility to specific injection patterns.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "The paper does not discuss whether any injection scenarios or attack patterns might resemble content in the agents' training data.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": false,
    307           "answer": false,
    308           "justification": "Custom attack scenarios are constructed for this work rather than drawn from standard benchmarks, making benchmark contamination not applicable.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in this study.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in this study.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in this study.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in this study.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in this study.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in this study.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference cost, API call counts, or latency information is reported for the iterative EVA optimization process across six commercial and open models.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No total computational budget, hardware specifications, or wall-clock time estimates are stated anywhere in the paper.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "EVA achieves substantially higher attack success rates than static baselines, with up to +32% improvement in pop-up scenarios",
    375       "evidence": "Table 2: GLM-4v-Plus baseline 48% vs EVA 80% for pop-up; improvements across all 6 agents and all tested scenarios",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "EVA-evolved injection prompts transfer effectively across GUI agents with cross-model gains up to +46%",
    380       "evidence": "Table 4 cross-agent transferability: Qwen2.5-VL→GPT-4V baseline 2% vs EVA 48%; consistent gains across most source-target pairs",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "High-risk scenarios (payment, email) are inherently more resistant to indirect injection attacks",
    385       "evidence": "Table 3 shows 0% ASR for all 6 models in payment and email scenarios; paper attributes this to overtly malicious content being detectable",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "Goal-conditioned injections significantly increase attack success rates compared to goal-agnostic versions",
    390       "evidence": "Table 7: success drops by 24% (GLM-4v-Plus), 20% (GPT-4o), 14% (GPT-4V) when goal text is removed from injections",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Persuasive (49.8%) and urgency (40.0%) strategies dominate successful injections with model-specific susceptibility variations",
    395       "evidence": "Table 5 and Figures 7–9: UI-TARS-7B-DPO shows higher urgency sensitivity (50.8%) while GPT-4V is more susceptible to persuasive content (51.6%)",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "GUI agents share common behavioral biases, revealed by transferable injection patterns, suggesting that visual attention drives susceptibility more than semantic content",
    400       "evidence": "Section 5 and Figure 4: attention concentration on confirm buttons in pop-ups vs. dispersed attention in chat-based links explains differential success rates",
    401       "supported": "weak"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "case-study"
    407   ],
    408   "key_findings": "EVA is a feedback-driven red-teaming framework that iteratively refines indirect prompt injections for GUI agents, achieving substantially higher attack success rates (up to +32%) than static one-shot baselines across six diverse GUI agents. Evolved injection patterns transfer well across heterogeneous model architectures (up to +46% cross-model ASR gains), suggesting shared perceptual vulnerabilities driven by visual attention concentration rather than semantic content alone. High-risk scenarios (payment, email) show 0% attack success in Table 3, likely because overtly malicious content triggers existing safety mechanisms, although the paper leaves unclear whether EVA itself was run on these scenarios. Goal-conditioned injections are significantly more effective, but EVA demonstrates practical attack capability even in goal-agnostic settings.",
    409   "red_flags": [
    410     {
    411       "flag": "Computational budget imbalance",
    412       "detail": "EVA runs up to 10 iterative rounds with 10 evaluations per sample while the static baseline generates 50 independent one-shot samples; improvement could partly reflect more inference compute rather than adaptive evolution."
    413     },
    414     {
    415       "flag": "Code not actually released",
    416       "detail": "Paper claims to 'build and release a reproducible evaluation pipeline' as a named contribution, but no URL, repository link, or access instructions appear anywhere in the paper."
    417     },
    418     {
    419       "flag": "No statistical rigor",
    420       "detail": "All comparative claims are based on raw percentage differences with no confidence intervals, significance tests, or variance measures reported across any results."
    421     },
    422     {
    423       "flag": "High-risk scenarios not tested with EVA",
    424       "detail": "Table 3 only shows static baseline results for payment and email scenarios — it is unclear whether EVA was also run and failed, or these scenarios were excluded from adaptive evaluation entirely."
    425     },
    426     {
    427       "flag": "Single baseline type",
    428       "detail": "Only one static baseline design (LLM one-shot generation) is evaluated; existing adaptive attack methods cited in the paper (Zhan et al. 2025) are not used as comparison baselines."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "Caution for the Environment: Multimodal Agents are Susceptible to Environmental Distractions",
    434       "relevance": "Core prior work empirically confirming GUI agent vulnerability to visual distractions; directly motivates EVA's attack surface"
    435     },
    436     {
    437       "title": "Attacking Vision-Language Computer Agents via Pop-ups",
    438       "relevance": "Directly related work on adversarial pop-up injections against GUI agents; EVA extends this with adaptive evolution"
    439     },
    440     {
    441       "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection",
    442       "relevance": "Foundational work formalizing indirect prompt injection risks in real-world LLM toolchains"
    443     },
    444     {
    445       "title": "EIA: Environmental Injection Attack on Generalist Web Agents for Privacy Leakage",
    446       "relevance": "Related attack framework targeting web agents for privacy leakage; EVA extends the attack paradigm to adaptive GUI red-teaming"
    447     },
    448     {
    449       "title": "Adaptive Attacks Break Defenses Against Indirect Prompt Injection Attacks on LLM Agents",
    450       "relevance": "Contemporary work on adaptive prompt injection; closely related to EVA's core claim about adaptive vs. static attacks"
    451     },
    452     {
    453       "title": "WASP: Benchmarking Web Agent Security Against Prompt Injection Attacks",
    454       "relevance": "Standardized testbed for prompt injection evaluation that EVA explicitly positions against as addressing WASP's static design limitation"
    455     },
    456     {
    457       "title": "AdvWeb: Controllable Black-Box Attacks on VLM-Powered Web Agents",
    458       "relevance": "Related DOM-level black-box attack framework for web agents; EVA extends adaptive black-box attack methodology to GUI agents"
    459     }
    460   ],
    461   "engagement_factors": {
    462     "practical_relevance": {
    463       "score": 3,
    464       "justification": "GUI agents are rapidly deployed for real tasks (web browsing, email, payments); demonstrating they can be hijacked via visual injection is directly actionable for practitioners building or deploying such systems."
    465     },
    466     "surprise_contrarian": {
    467       "score": 1,
    468       "justification": "Adaptive attacks outperforming static ones is expected; the finding that high-risk scenarios are more resistant is somewhat counter-intuitive but not dramatically surprising."
    469     },
    470     "fear_safety": {
    471       "score": 3,
    472       "justification": "Demonstrates that widely used commercial GUI agents (GPT-4o, GPT-4V) can be hijacked into clicking injected pop-ups and phishing links through visual injection, raising concrete real-world safety concerns."
    473     },
    474     "drama_conflict": {
    475       "score": 2,
    476       "justification": "Testing whether GPT-4o can be tricked into completing phishing attacks creates a security arms race narrative involving commercial AI products from OpenAI and Alibaba."
    477     },
    478     "demo_ability": {
    479       "score": 2,
    480       "justification": "The attack concept is demonstrable with custom HTML injection scenarios, but no code is released, limiting hands-on replication by readers."
    481     },
    482     "brand_recognition": {
    483       "score": 2,
    484       "justification": "Tests GPT-4V and GPT-4o (OpenAI) and Qwen2.5-VL (Alibaba) among others; Shanghai Jiao Tong University is a well-recognized research institution in the field."
    485     }
    486   },
    487   "hn_data": {
    488     "threads": [
    489       {
    490         "hn_id": "44475634",
    491         "title": "Techno-feudalism and the rise of AGI: A future without economic rights?",
    492         "points": 239,
    493         "comments": 244,
    494         "url": "https://news.ycombinator.com/item?id=44475634"
    495       },
    496       {
    497         "hn_id": "45338086",
    498         "title": "Tech Report: Winning CRS from Team Atlanta (DARPA AIxCC)",
    499         "points": 23,
    500         "comments": 0,
    501         "url": "https://news.ycombinator.com/item?id=45338086"
    502       },
    503       {
    504         "hn_id": "44694507",
    505         "title": "Market-Derived Financial Sentiment Analysis: Context-Aware Language Models",
    506         "points": 5,
    507         "comments": 0,
    508         "url": "https://news.ycombinator.com/item?id=44694507"
    509       },
    510       {
    511         "hn_id": "32381602",
    512         "title": "Cryptocurrency Giveaway Scam with YouTube Live Stream",
    513         "points": 3,
    514         "comments": 1,
    515         "url": "https://news.ycombinator.com/item?id=32381602"
    516       },
    517       {
    518         "hn_id": "45955957",
    519         "title": "Official LIGO-Virgo-Kagra Benchmark Shows KFR Outperforming FFTW in CERN Root",
    520         "points": 3,
    521         "comments": 0,
    522         "url": "https://news.ycombinator.com/item?id=45955957"
    523       },
    524       {
    525         "hn_id": "44318076",
    526         "title": "The Impact of Generative AI on Social Media: An Experimental Study",
    527         "points": 3,
    528         "comments": 0,
    529         "url": "https://news.ycombinator.com/item?id=44318076"
    530       },
    531       {
    532         "hn_id": "42839070",
    533         "title": "Analyzing and Exploiting Branch Mispredictions in Microcode [pdf]",
    534         "points": 3,
    535         "comments": 0,
    536         "url": "https://news.ycombinator.com/item?id=42839070"
    537       },
    538       {
    539         "hn_id": "44696287",
    540         "title": "Cryptocurrency Giveaway Scam with YouTube Live Stream (2022)",
    541         "points": 2,
    542         "comments": 0,
    543         "url": "https://news.ycombinator.com/item?id=44696287"
    544       },
    545       {
    546         "hn_id": "44689864",
    547         "title": "A Fact-Grounded Multimodal Writing Assistant Based on Offline Knowledge Base",
    548         "points": 2,
    549         "comments": 0,
    550         "url": "https://news.ycombinator.com/item?id=44689864"
    551       },
    552       {
    553         "hn_id": "43424107",
    554         "title": "Tapered Off-Policy Reinforce: Stable and Efficient RL for LLMs",
    555         "points": 2,
    556         "comments": 0,
    557         "url": "https://news.ycombinator.com/item?id=43424107"
    558       }
    559     ],
    560     "top_points": 239,
    561     "total_points": 285,
    562     "total_comments": 245
    563   }
    564 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs