ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (28685B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "DRIFT: Dynamic Rule-Based Defense with Injection Isolation for Securing LLM Agents",
      6     "authors": [
      7       "Hao Li",
      8       "Xiaogeng Liu",
      9       "Hung-Chun Chiu",
     10       "Dianqi Li",
     11       "Ning Zhang"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2506.12104",
     16     "doi": "10.48550/arXiv.2506.12104"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims ASR reduction from 30.7% to 1.3% and 20.1% utility improvement over CaMeL; Figure 3 confirms these exact figures (CaMeL=38.4%, DRIFT=58.5%, delta=20.1%). Minor inconsistency in Section 3.2 body text ('21.8%') vs figure, but abstract matches figures.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Ablation study (Table 1) incrementally adds Secure Planner, Dynamic Validator, and Injection Isolator to isolate each component's contribution to both ASR and utility, providing a valid design for causal component claims.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper repeatedly makes claims about 'real-world agentic systems' and 'broad adaptability' despite evaluation being limited to 4 simulated AgentDojo scenarios and 10 ASB scenarios; the limitations section acknowledges scope limitations only vaguely.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not discuss whether performance gains could stem from the additional LLM calls (acting as sanity checks) rather than the specific DRIFT design, nor whether a simpler multi-LLM verification approach would achieve similar results.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "ASR (attack success rate), Benign Utility, and Utility Under Attack are defined clearly and measure exactly what is claimed; no conflation between proxy and target metrics.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Appendix A is a dedicated Limitations section that goes beyond a single sentence, noting that benchmark domains are limited and do not fully cover real-world diversity.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The limitations section only states that 'benchmark domains are limited and do not fully cover diverse tasks'; no specific threats such as single-run variance, benchmark contamination, or model-version sensitivity are identified.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state what the results do NOT show; no explicit boundary on attack types, task complexity levels, or deployment environments beyond a vague acknowledgment of benchmark limitations.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "'This project is partially supported by Schmidt Science AI2050 Early Career Fellow and Open philanthropy' is disclosed in the Acknowledgments and Disclosure of Funding section.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are listed on the title page: Washington University in St. Louis, Johns Hopkins University, and Independent Researcher.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Schmidt Science and Open Philanthropy have no financial stake in GPT-4o-mini, Claude, or the AgentDojo/ASB benchmarks being evaluated.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "There is no competing interests statement, no declaration of patents or equity interests; the acknowledgments section only discloses funding sources.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Prompt injection attacks are defined with concrete examples, LLM agents are characterized, ASR/utility metrics are defined in Section 3.1, and control/data constraints are explained in Section 2.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper explicitly lists two contributions: the DRIFT framework itself and extensive experiments demonstrating effectiveness and adaptability; the three components (Secure Planner, Dynamic Validator, Injection Isolator) are clearly described.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Appendix B provides a structured related work section contrasting model-level vs system-level defenses; the paper explicitly positions DRIFT against CaMeL (static policy) and Progent (dynamic policy without memory isolation) and explains what each lacks.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Code is released at https://github.com/SaFoLab-WISC/DRIFT as stated in the abstract.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Evaluation uses AgentDojo and ASB, both publicly available benchmarks used unmodified; the custom ToolBench-derived training dataset is only promised ('We will release') in the NeurIPS checklist.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No requirements.txt, Dockerfile, or explicit dependency list is provided in the paper; only training hyperparameters (batch size, learning rate) are mentioned, not the software environment.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The paper references code in supplementary materials and a GitHub repo, but no step-by-step reproduction instructions appear in the paper itself; the NeurIPS checklist says 'code in supplementary' but no commands or workflow are described.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "The NeurIPS checklist Q7 explicitly answers [No] for error bars; all results are single point estimates with no confidence intervals reported.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are performed for any comparative claims; all comparisons are raw percentage point differences without significance testing.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Percentage improvements are reported consistently throughout (e.g., ASR from 30.7% to 1.3%, 20.1% utility improvement over CaMeL), providing interpretable effect sizes in the benchmark context.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The paper uses 97 user tasks and 629 injection tasks from AgentDojo as fixed benchmark sizes without any justification of whether these sizes are sufficient to detect meaningful differences.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or results from multiple runs are reported; the NeurIPS checklist explicitly acknowledges no error bars are provided.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Seven baselines are included in AgentDojo comparisons: undefended agent, repeat_user_prompt, spotlighting, tool_filter, pi_detector, CaMeL, and Progent; five in ASB.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "CaMeL (arXiv 2503.18813, 2025) and Progent (arXiv 2504.11703, 2025) are concurrent or recent system-level defenses; the paper explicitly compares against the most advanced available methods.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Table 1 provides a proper ablation study incrementally adding Secure Planner, Dynamic Validator, and Injection Isolator, with both utility and security metrics reported at each step.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Three metrics are reported: Benign Utility (no-attack task completion), Utility Under Attack, and Targeted Attack Success Rate (ASR), covering both security and utility dimensions.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "The evaluation uses fully automated benchmarks (AgentDojo, ASB) for measuring attack success and task completion rates; human evaluation is not relevant for this type of system defense evaluation.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "LoRA fine-tuning is done on ToolBench-derived training data while evaluation is performed on AgentDojo and ASB test benchmarks, maintaining separation between training and test distributions.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Tables 6-8 in Appendix D provide per-scenario breakdowns across Banking, Slack, Travel, and Workspace for all models on AgentDojo.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Appendix C.1 analyzes 6 open-ended tasks where DRIFT underperforms (17.6% vs 25.7% for base agent); adaptive attack experiments in Section 3.6 also reveal partial failures under curated attacks.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The ablation shows that using only the Secure Planner causes severe utility loss (25.84% drop); open-ended task analysis shows DRIFT achieves only 70% of base agent capability on those tasks.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Only GPT-4o-mini is specified with a version date ('GPT-4o-mini-2024-07-18'); Claude-3.5-sonnet, Claude-3-haiku, and GPT-4o are referenced only by marketing names without snapshot dates.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "All six system prompts used by DRIFT components (constraint generation, privilege assignment, intent alignment, injection detection, planning sampling, injection sampling) are provided in Appendix E as Figures 8-13.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "LoRA fine-tuning hyperparameters are reported (batch size 4, 3 epochs, lr 2e-5, Adam optimizer), but inference hyperparameters for all LLM calls (temperature, top-p, max tokens) are not mentioned anywhere in the paper.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Section 2 provides detailed descriptions of all three DRIFT components with accompanying workflow diagrams (Figures 1-2) showing data flow between Secure Planner, Dynamic Validator, and Injection Isolator.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 2.5.1 describes the full training data pipeline: ToolBench conversation rewriting via GPT-4o-mini, planner data sampling (1,000 samples), isolator data sampling with synthetic injection generation (1,000 samples), and tool environment reconstruction (10,000+ tools).",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Raw evaluation results (individual task outcomes, per-attack-instance decisions) are not released; only aggregate metrics are reported, and the custom training dataset is only promised for future release.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 2.5.1 describes the ToolBench-to-training-data pipeline in detail, including conversation rewriting procedures, injection simulation methods, and tool list construction from 5,000 samples.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "Evaluation uses standard public benchmarks (AgentDojo, ASB) with no human participant recruitment involved.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Section 2.5.1 documents the complete pipeline from ToolBench source data through GPT-4o-mini rewriting, injection simulation, and training sample construction with sample counts and turn ranges specified.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "Training data cutoffs are not stated for any of the evaluated models; GPT-4o-mini-2024-07-18's training cutoff is not mentioned, and Claude/GPT-4o cutoffs are absent entirely.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No discussion of whether AgentDojo or ASB benchmark tasks were in the training data of the evaluated closed-source models (GPT-4o-mini, GPT-4o, Claude); AgentDojo was published at NeurIPS 2024.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "AgentDojo was published at NeurIPS 2024 and ASB at ICLR 2025, both potentially within the training window of GPT-4o-mini-2024-07-18 and Claude models; this is not addressed anywhere in the paper.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants involved; NeurIPS checklist Q14 explicitly confirms this.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants involved; NeurIPS checklist Q15 explicitly confirms no human research.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants involved.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants involved.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants involved.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants involved.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants involved.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Table 3 provides total token usage for all defense methods on AgentDojo (DRIFT=2.37M tokens vs undefended=0.82M), along with an efficiency metric combining utility, ASR, and token cost.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Fine-tuning details (batch size, epochs, optimizer) are provided but GPU type, memory, training time, and total compute budget are not stated in the paper.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "DRIFT reduces ASR from 30.7% to 1.3% with GPT-4o-mini on the AgentDojo benchmark.",
    375       "evidence": "Figure 3 shows undefended agent ASR=30.7% vs DRIFT ASR=1.3% on GPT-4o-mini.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "DRIFT outperforms CaMeL in utility by 20.1 percentage points under no-attack conditions while achieving comparable security.",
    380       "evidence": "Figure 3: CaMeL utility=38.4%, DRIFT utility=58.5%, delta=20.1%; ASR: CaMeL=0.0% vs DRIFT=1.3%.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Dynamic policies significantly outperform static policies for tasks with trajectory length ≥ 3.",
    385       "evidence": "Figure 6b shows static and dynamic policies diverge sharply at trajectory length 3+, with dynamic maintaining stable success rate while static drops.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "DRIFT maintains robustness under adaptive attacks, with only a 0.81 percentage-point ASR increase under the combined isolator+validator adaptive attack.",
    390       "evidence": "Table 2 shows combined IAA+VAA results in ASR of 2.10% vs baseline 1.29%, a 0.81pp increase.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Fine-tuned Qwen2.5-7B DRIFT achieves 0% ASR while improving utility by 5.6 percentage points in safe conditions.",
    395       "evidence": "Figure 5 shows Qwen2.5-7B+DRIFT: ASR=0.0% vs ReAct=15.1%, utility 32.2% vs 26.6%.",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "DRIFT significantly outperforms Progent on weaker models (GPT-4o-mini), demonstrating better design for lower-capability models.",
    400       "evidence": "Appendix Table 5: DRIFT achieves 1.64% ASR vs Progent 9.39% on AgentDojo with GPT-4o-mini; attributed to DRIFT's simpler subtask decomposition.",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval"
    406   ],
    407   "key_findings": "DRIFT is a three-component system-level defense (Secure Planner, Dynamic Validator, Injection Isolator) that reduces prompt injection attack success rate from 30.7% to 1.3% on GPT-4o-mini on AgentDojo while outperforming static policy baseline CaMeL in utility by 20.1%. The Injection Isolator addresses a previously underserved threat: injection content embedded in tool results that does not alter the tool-call trajectory but corrupts the final response. Dynamic constraint updating is shown to be necessary for complex multi-step tasks, where static policies cause severe utility degradation at trajectory lengths ≥ 3. The framework generalizes across five diverse models including a locally fine-tuned Qwen2.5-7B that achieves 0% ASR after policy tuning.",
    408   "red_flags": [
    409     {
    410       "flag": "No error bars or significance tests",
    411       "detail": "All results are single point estimates with no variance reported across runs; the NeurIPS checklist explicitly acknowledges no error bars (Q7: [No]). Security benchmarks with 97 user tasks and 629 injection tasks, evaluated through stochastic LLM calls, warrant uncertainty quantification."
    412     },
    413     {
    414       "flag": "Benchmark contamination unaddressed",
    415       "detail": "AgentDojo was published at NeurIPS 2024 and may appear in training data of GPT-4o-mini-2024-07-18 and Claude models; no discussion of whether model familiarity with benchmark scenarios inflates defense or utility metrics."
    416     },
    417     {
    418       "flag": "Circular training data generation",
    419       "detail": "GPT-4o-mini is used to generate training labels (rewrite ToolBench conversations to match DRIFT policy) and is also the primary evaluation model; the model may be biased toward outputs that match its own labeled training format."
    420     },
    421     {
    422       "flag": "Minor numerical inconsistencies between abstract and body",
    423       "detail": "Abstract states '20.1% under no attack and 12.5% under attack' vs CaMeL; Section 3.2 body states '21.8% in the no-attack setting and 10.9% under attack'. The abstract matches Figure 3 values but the main text does not."
    424     },
    425     {
    426       "flag": "Inference hyperparameters unreported",
    427       "detail": "Temperature, top-p, and max tokens for all LLM calls (Secure Planner, Dynamic Validator, Injection Isolator, base agent) are not reported, making exact reproduction of LLM behavior impossible."
    428     }
    429   ],
    430   "cited_papers": [
    431     {
    432       "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents",
    433       "relevance": "Primary evaluation benchmark; provides the 97 user tasks and 629 injection tasks used for all main results."
    434     },
    435     {
    436       "title": "Defeating Prompt Injections by Design (CaMeL)",
    437       "relevance": "Key static policy-based baseline that DRIFT explicitly improves upon; demonstrates the utility-security tradeoff of static approaches."
    438     },
    439     {
    440       "title": "Progent: Programmable Privilege Control for LLM Agents",
    441       "relevance": "Concurrent dynamic policy baseline; detailed comparison in Appendix C.2 reveals DRIFT's advantage on weaker models due to simpler subtask decomposition."
    442     },
    443     {
    444       "title": "IsolateGPT: An Execution Isolation Architecture for LLM-Based Agentic Systems",
    445       "relevance": "Related system-level defense using application isolation; DRIFT's Injection Isolator addresses its limitation of residual in-memory injection content."
    446     },
    447     {
    448       "title": "ToolLLM: Facilitating Large Language Models to Master 16000+ Real-World APIs (ToolBench)",
    449       "relevance": "Source of training data for DRIFT's policy fine-tuning; 2,000 samples derived from ToolBench conversations."
    450     },
    451     {
    452       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    453       "relevance": "Baseline agent framework that DRIFT is applied on top of; used as the comparison point for all adaptation experiments."
    454     },
    455     {
    456       "title": "Agent Security Bench (ASB): Formalizing and Benchmarking Attacks and Defenses in LLM-Based Agents",
    457       "relevance": "Second evaluation benchmark providing 10 diverse scenarios for security assessment."
    458     },
    459     {
    460       "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents",
    461       "relevance": "Foundational work on prompt injection attack types and risk categories (economic loss, privacy leakage)."
    462     }
    463   ],
    464   "engagement_factors": {
    465     "practical_relevance": {
    466       "score": 3,
    467       "justification": "Code is released on GitHub, the framework plugs into existing LLM agents without model modification, and it addresses a real security threat for deployed agentic systems."
    468     },
    469     "surprise_contrarian": {
    470       "score": 1,
    471       "justification": "The finding that dynamic policies outperform static ones and that memory isolation is needed is intuitive; no surprising inversions of conventional wisdom."
    472     },
    473     "fear_safety": {
    474       "score": 2,
    475       "justification": "Demonstrates that even GPT-4o (the most capable model tested) has 51.7% ASR under prompt injection without defense, raising legitimate concern about production LLM agent deployments."
    476     },
    477     "drama_conflict": {
    478       "score": 1,
    479       "justification": "Standard academic positioning against CaMeL and Progent; no controversial claims or community-dividing arguments."
    480     },
    481     "demo_ability": {
    482       "score": 2,
    483       "justification": "Code released on GitHub and evaluation uses public AgentDojo benchmark, enabling researchers to reproduce and test DRIFT on the same tasks."
    484     },
    485     "brand_recognition": {
    486       "score": 1,
    487       "justification": "Washington University in St. Louis and Johns Hopkins are reputable but not AI-famous labs; NeurIPS 2025 venue adds some recognition."
    488     }
    489   },
    490   "hn_data": {
    491     "threads": [
    492       {
    493         "hn_id": "44770561",
    494         "title": "B-Splines and Fourier-Best Friends for Spatial-Temporal Video Super-Resolution",
    495         "points": 4,
    496         "comments": 0,
    497         "url": "https://news.ycombinator.com/item?id=44770561"
    498       },
    499       {
    500         "hn_id": "47002668",
    501         "title": "LLMs exceed physicians on complex text-based differential diagnosis",
    502         "points": 3,
    503         "comments": 2,
    504         "url": "https://news.ycombinator.com/item?id=47002668"
    505       },
    506       {
    507         "hn_id": "45534337",
    508         "title": "Advancing medical artificial intelligence using a century of cases",
    509         "points": 3,
    510         "comments": 1,
    511         "url": "https://news.ycombinator.com/item?id=45534337"
    512       },
    513       {
    514         "hn_id": "43401539",
    515         "title": "CriteoPrivateAd: RealWorld Bidding Dataset to Design Private Advertising Systems",
    516         "points": 2,
    517         "comments": 1,
    518         "url": "https://news.ycombinator.com/item?id=43401539"
    519       },
    520       {
    521         "hn_id": "31894669",
    522         "title": "Protecting President Zelenskyy Against Deep Fakes",
    523         "points": 2,
    524         "comments": 0,
    525         "url": "https://news.ycombinator.com/item?id=31894669"
    526       },
    527       {
    528         "hn_id": "27612994",
    529         "title": "LegoFormer: Transformers for Block-by-Block Multi-View 3D Reconstruction",
    530         "points": 2,
    531         "comments": 0,
    532         "url": "https://news.ycombinator.com/item?id=27612994"
    533       },
    534       {
    535         "hn_id": "44971660",
    536         "title": "Scaling laws found in large generative medical event models",
    537         "points": 1,
    538         "comments": 0,
    539         "url": "https://news.ycombinator.com/item?id=44971660"
    540       },
    541       {
    542         "hn_id": "41227450",
    543         "title": "Τ-Bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains",
    544         "points": 1,
    545         "comments": 0,
    546         "url": "https://news.ycombinator.com/item?id=41227450"
    547       },
    548       {
    549         "hn_id": "40782080",
    550         "title": "Should AI optimize your code? A studio",
    551         "points": 1,
    552         "comments": 0,
    553         "url": "https://news.ycombinator.com/item?id=40782080"
    554       },
    555       {
    556         "hn_id": "28895006",
    557         "title": "IQ-Learn: Inverse Soft-Q Learning for Imitation",
    558         "points": 1,
    559         "comments": 0,
    560         "url": "https://news.ycombinator.com/item?id=28895006"
    561       }
    562     ],
    563     "top_points": 4,
    564     "total_points": 20,
    565     "total_comments": 4
    566   }
    567 }

Impressum · Datenschutz