ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27502B)


      1 {
      2   "paper": {
      3     "title": "How Do LLMs Fail In Agentic Scenarios? A Qualitative Analysis",
      4     "authors": ["JV Roig"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2512.07497"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "checklist": {
     12     "artifacts": {
     13       "code_released": {
     14         "applies": true,
     15         "answer": false,
     16         "justification": "No source code repository is linked. The agentic evaluation framework (KAMI v0.1) is described but no code URL is provided."
     17       },
     18       "data_released": {
     19         "applies": true,
     20         "answer": false,
     21         "justification": "Section 'Data Availability' states 'The raw experiment data... will be made available at https://docs.kamiwaza.ai/research/datasets.' This is a promise of future release, not current availability."
     22       },
     23       "environment_specified": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "Execution parameters are listed (temperature 0.4, context window 32K, max tokens 8K, max 20 rounds) but no environment specifications, dependencies, or software versions for the agentic framework are provided."
     27       },
     28       "reproduction_instructions": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No step-by-step reproduction instructions are included. Appendices A and B provide test definitions and tool descriptions, but not instructions for running the benchmark."
     32       }
     33     },
     34     "statistical_methodology": {
     35       "confidence_intervals_or_error_bars": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "Table 1 reports 95% t-CI for pooled accuracy of each model (e.g., DeepSeek V3.1: '(91.2%, 93.2%)')."
     39       },
     40       "significance_tests": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No statistical significance tests are performed. Comparisons between models (e.g., Maverick vs. Granite on Q402) rely on raw success counts (2/30 vs 1/30) without any formal test."
     44       },
     45       "effect_sizes_reported": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Results are reported as raw success counts (e.g., 13/30, 29/30) and pooled accuracy percentages. No formal effect sizes (Cohen's d, odds ratios) are computed for model comparisons."
     49       },
     50       "sample_size_justified": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The sample of 30 trials per model per scenario (12.5% of 240) is stated but not formally justified. No power analysis or rationale is given for why 30 is sufficient for the qualitative claims made."
     54       },
     55       "variance_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Table 1 reports standard deviation and RSE for each model's pooled accuracy. Table 2 shows run-by-run variation for DeepSeek V3.1 across 8 runs on Q502 and Q602."
     59       }
     60     },
     61     "evaluation_design": {
     62       "baselines_included": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Three models spanning different performance tiers are compared: Granite 4 Small (58.5%), Llama 4 Maverick (74.6%), DeepSeek V3.1 (92.2%). DeepSeek V3 (59.4%) is also included for architecture-matched comparison."
     66       },
     67       "baselines_contemporary": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "All models are from 2025: Granite 4 Small (IBM, Oct 2025), Llama 4 Maverick (Meta, Apr 2025), DeepSeek V3.1 (DeepSeek, Aug 2025)."
     71       },
     72       "ablation_study": {
     73         "applies": false,
     74         "answer": false,
     75         "justification": "This paper analyzes existing models as-is; there is no system with components to ablate. The V3 vs V3.1 comparison functions as a natural quasi-ablation of post-training."
     76       },
     77       "multiple_metrics": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Results are reported as pooled accuracy with CIs (Table 1), per-scenario success rates (e.g., 29/30, 13/30), and qualitative behavioral pattern taxonomies."
     81       },
     82       "human_evaluation": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The entire study is manual human analysis of 900 execution traces. Section 2.4: 'We performed manual analysis of all 900 sampled execution traces.'"
     86       },
     87       "held_out_test_set": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "PICARD framework generates fresh randomized data for each trial (randomized file names, text content, CSV data, database records), making each trial inherently novel and preventing memorization."
     91       },
     92       "per_category_breakdown": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Detailed per-scenario breakdowns are provided for each model across all 10 scenarios (Q201-Q503), with success/failure counts and pattern taxonomies per scenario."
     96       },
     97       "failure_cases_discussed": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Failure case analysis is the core contribution. Section 4 provides extensive trace-level analysis of failure patterns with verbatim execution traces (e.g., malformed JSON, generation loops, schema guessing)."
    101       },
    102       "negative_results_reported": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Many negative results are reported: Granite 4 Small failing all 30 Q401 trials, Maverick achieving only 2/30 on Q402, and all models failing on context pollution in Q503."
    106       }
    107     },
    108     "claims_and_evidence": {
    109       "abstract_claims_supported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Abstract claims about scale not predicting agentic robustness are supported by per-scenario data (Maverick 2/30 vs Granite 1/30 on Q402). Four failure archetypes are extensively documented in Section 4. V3 vs V3.1 comparison supports the post-training RL claim."
    113       },
    114       "causal_claims_justified": {
    115         "applies": true,
    116         "answer": false,
    117         "justification": "The paper claims DeepSeek V3.1's 'superior reliability derives primarily from post-training reinforcement learning rather than architecture or size.' While the V3/V3.1 comparison (same architecture, different scores) is suggestive, the authors cannot isolate RL from other post-training changes. This is acknowledged only partially in Section 6."
    118       },
    119       "generalization_bounded": {
    120         "applies": true,
    121         "answer": false,
    122         "justification": "The title 'How Do LLMs Fail In Agentic Scenarios?' is very broad, but results are from only 3 models on 10 scenarios from a single benchmark. Section 6 acknowledges model sample scope and scenario coverage limitations, but the title and abstract frame findings as general LLM phenomena."
    123       },
    124       "alternative_explanations_discussed": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Section 6 discusses specific threats: temperature effects, single-tool-per-round constraint influencing Maverick's strategy, scenario coverage bias, and training secrecy preventing attribution. The Chekhov's gun explanation (Section 4.2.2, Q503) discusses Anthropic research on model inclination to use all provided information."
    128       },
    129       "proxy_outcome_distinction": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Section 1.1 explicitly discusses construct validity — that benchmark scores may not reflect real-world capability. The paper distinguishes between task success rates (proxy) and enterprise reliability (outcome), and Section 5 synthesizes findings into deployment principles that acknowledge this gap."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Specific model names with parameter counts are given: 'Granite 4 Small (32B parameters, dense)', 'Llama 4 Maverick (400B total / 17B active, MoE)', 'DeepSeek V3.1 (671B total / 37B active, MoE)'. These are specific open-weight releases with unambiguous identities."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "Appendix A provides the complete PICARD test definitions (instruction templates with all randomized fields). Appendix B provides full tool descriptions as loaded into the system prompt. Appendix C shows a complete verbatim conversation sample."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Section 2.3 reports: temperature 0.4, context window 32K tokens for non-thinking models, maximum output tokens 8K per round, maximum 20 inference rounds per trial, single-tool-per-round constraint."
    150       },
    151       "scaffolding_described": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "The agentic framework is described: 29 tools across 5 categories (Section 3.2), single-tool-per-round constraint, error feedback mechanism, maximum rounds cap. Tool specifications are fully detailed in Appendix B."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 2 describes model selection criteria (3 models from different performance bands), scenario selection (10 of 19, Section 2.2), trial sampling (30 of 240 per scenario, 12.5%), and the emergent coding analysis approach (Section 2.4)."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 6 'Threats to Validity' provides a dedicated section discussing five specific validity limitations."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 6 lists specific threats: only 3 models analyzed, KAMI scenarios may not generalize to long-horizon planning, proprietary post-training prevents behavioral attribution, single-tool-per-round constraint influences Maverick, temperature 0.4 effects unexplored."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 6 states specific scope boundaries: 'Model sample scope. Only three models were analyzed', 'Scenario coverage. KAMI v0.1 tasks emphasize tool-grounded data correctness and may not generalize to long-horizon planning.' Mitigations for v0.2 are stated."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "Raw execution traces are promised for future release ('will be made available at https://docs.kamiwaza.ai/research/datasets') but are not currently available."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 2 describes the KAMI v0.1 benchmark process: PICARD framework for randomized trial generation, model execution with specific parameters, trace collection with full conversation histories."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "Section 2.1 describes model selection rationale (different performance bands, architecture diversity). Section 2.2 explains scenario selection criteria (direct agentic capability evaluation). Section 2.3 describes the random sampling of 30 trials per model per scenario."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The pipeline is documented: PICARD generates randomized sandbox environments → models execute with 29 tools → traces recorded → 30/240 randomly sampled per model per scenario → manual emergent coding analysis (Section 2.4)."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "No funding source is disclosed. The author is affiliated with Kamiwaza AI, which created and promotes the KAMI benchmark, but no funding statement appears."
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Author affiliation 'Kamiwaza AI' is disclosed in the paper header. Kamiwaza AI is the creator of the KAMI benchmark being promoted."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "The author works at Kamiwaza AI, which created the KAMI benchmark. The company has a commercial interest in KAMI being seen as a valuable and insightful benchmark. The paper promotes KAMI's design philosophy and argues for its superiority over traditional benchmarks."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests or financial disclosure statement is present. The author's commercial interest in the KAMI benchmark through Kamiwaza AI is not explicitly acknowledged as a conflict."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No training data cutoff dates are stated for any of the three models evaluated."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": true,
    231         "answer": true,
    232         "justification": "The PICARD framework is explicitly designed to prevent contamination through randomized task parameters: 'randomized task parameters (file names, text content, CSV data, and database records) to account for LLM stochasticity and to probe real-world capability rather than memorized responses' (Section 2.3)."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": true,
    236         "answer": true,
    237         "justification": "Section 1.1 extensively discusses benchmark contamination as a motivation. The PICARD framework addresses this by generating fresh randomized data for each trial, making memorization irrelevant. Referenced studies [11,13,15,17,22,26,28-31] document contamination in existing benchmarks."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No human participants. The study analyzes LLM execution traces, not human behavior."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants in the study."
    250       },
    251       "demographics_reported": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in the study."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in the study."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in the study."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in the study."
    270       },
    271       "attrition_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in the study."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": false,
    281         "justification": "No inference costs, API spend, or per-trial compute costs are reported for the 900 analyzed trials."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "No total computational budget is stated. The referenced KAMI v0.1 paper mentions '5.5 billion tokens' but this paper does not quantify its own compute usage."
    287       }
    288     },
    289     "experimental_rigor": {
    290       "seed_sensitivity_reported": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "The PICARD framework runs each scenario with randomized parameters multiple times (240 trials per model per scenario). Table 1 reports standard deviation across runs. Table 2 shows 8-run variation for DeepSeek V3.1 on Q502/Q602."
    294       },
    295       "number_of_runs_stated": {
    296         "applies": true,
    297         "answer": true,
    298         "justification": "Explicitly stated: 30 trials sampled per model per scenario, 900 total for qualitative analysis. Full dataset contains ~240 trials per model per scenario (Section 2.3)."
    299       },
    300       "hyperparameter_search_budget": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "Temperature 0.4 was chosen as 'a setting between 0.0 and the typical 0.7 default' (Section 2.3) but no systematic search was conducted. The paper acknowledges 'the effects of alternative temperature settings remain unexplored' (Section 6)."
    304       },
    305       "best_config_selection_justified": {
    306         "applies": false,
    307         "answer": false,
    308         "justification": "No configuration selection or tuning was performed. A single fixed configuration was used for all models."
    309       },
    310       "multiple_comparison_correction": {
    311         "applies": false,
    312         "answer": false,
    313         "justification": "No statistical hypothesis tests are performed, so multiple comparison correction is not applicable."
    314       },
    315       "self_comparison_bias_addressed": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "The author created the KAMI benchmark and is evaluating models on it. The bias of evaluating one's own benchmark — including scenario design choices that may favor certain behavioral patterns — is not discussed."
    319       },
    320       "compute_budget_vs_performance": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "The paper discusses scale (32B vs 400B vs 671B parameters) but does not formally analyze compute budget vs. performance. No compute-matched comparisons are made despite comparing models of vastly different sizes."
    324       },
    325       "benchmark_construct_validity": {
    326         "applies": true,
    327         "answer": true,
    328         "justification": "Section 1.1 extensively discusses construct validity failures in traditional benchmarks. The paper argues KAMI's interactive, multi-step design better measures agentic capability than static Q&A formats. Section 5 discusses why interactive resilience is a better construct for agentic evaluation."
    329       },
    330       "scaffold_confound_addressed": {
    331         "applies": true,
    332         "answer": true,
    333         "justification": "All models use the identical agentic framework with the same 29 tools, same single-tool-per-round constraint, same error feedback mechanism. Section 2.3 specifies consistent execution parameters across all trials."
    334       }
    335     },
    336     "data_leakage": {
    337       "temporal_leakage_addressed": {
    338         "applies": true,
    339         "answer": true,
    340         "justification": "PICARD generates fresh randomized data for each trial, making temporal leakage impossible — there are no pre-existing test problems that could appear in training data."
    341       },
    342       "feature_leakage_addressed": {
    343         "applies": true,
    344         "answer": false,
    345         "justification": "No explicit discussion of whether the evaluation setup leaks information through context (e.g., whether tool descriptions or error messages provide hints beyond what a real deployment would offer)."
    346       },
    347       "non_independence_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "No discussion of whether the 30 sampled trials per scenario are independent or share structural similarities through the PICARD randomization scheme."
    351       },
    352       "leakage_detection_method": {
    353         "applies": true,
    354         "answer": true,
    355         "justification": "PICARD's randomized data generation serves as a concrete contamination prevention method. Section 2.3: 'randomized task parameters... to probe real-world capability rather than memorized responses, following PICARD principles.'"
    356       }
    357     }
    358   },
    359   "claims": [
    360     {
    361       "claim": "Model scale alone does not predict agentic robustness: Llama 4 Maverick (400B) performs only marginally better than Granite 4 Small (32B) on uncertainty-driven tasks.",
    362       "evidence": "Q402: Maverick 2/30 vs Granite 1/30. Overall KAMI scores: Granite 58.5% vs Maverick 74.6%, but per-scenario gaps are small on hard tasks. Table 1 and Section 4 per-scenario results.",
    363       "supported": "strong"
    364     },
    365     {
    366       "claim": "DeepSeek V3.1's superior reliability derives primarily from post-training reinforcement learning rather than architecture or size.",
    367       "evidence": "V3 (59.4%) vs V3.1 (92.2%) with identical architecture (Table 1). However, the authors cannot isolate RL from other post-training changes due to training secrecy (Section 6).",
    368       "supported": "moderate"
    369     },
    370     {
    371       "claim": "Four recurring failure archetypes cut across model families: premature action without grounding, over-helpfulness, context pollution vulnerability, and fragile execution under load.",
    372       "evidence": "Documented across all three models with verbatim trace examples in Section 4. Premature schema guessing in Q501-Q503 (all models), over-helpful substitution in Q502 (DeepSeek, Maverick), context pollution in Q503 (all models), generation loops in Q401-Q403 (Granite, Maverick).",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "Recovery capability, not initial correctness, best predicts overall success.",
    377       "evidence": "DeepSeek V3.1 dominates not by avoiding errors but by consistently recovering. This is shown in the Section 5 synthesis and in Table 3, which compares error-recovery traits across models. DeepSeek V3.1 scores 'Very High' on all recovery dimensions.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "Explicit prompting interventions significantly improve agentic performance (Q502 vs Q602 comparison).",
    382       "evidence": "Table 2: DeepSeek V3.1 Q502 avg 52.92% vs Q602 avg 87.50% with only a hint added about missing company data and schema inspection.",
    383       "supported": "strong"
    384     }
    385   ],
    386   "methodology_tags": ["qualitative", "benchmark-eval"],
    387   "key_findings": "Qualitative analysis of 900 agentic execution traces across three LLMs reveals four recurring failure archetypes: premature action without grounding, over-helpfulness that substitutes missing entities, context pollution vulnerability from distractor data, and fragile execution under cognitive load. Model scale does not predict agentic robustness — Llama 4 Maverick (400B) achieves only 2/30 on multi-CSV analysis, barely above Granite 4 Small's 1/30. DeepSeek V3.1's dominance stems from superior error recovery, likely attributable to post-training reinforcement learning. Simple prompt interventions (e.g., instructing schema inspection) dramatically improve performance (53% → 88% on SQL tasks).",
    388   "red_flags": [
    389     {
    390       "flag": "Author evaluates own benchmark",
    391       "detail": "JV Roig of Kamiwaza AI created the KAMI benchmark and is evaluating models on it. The paper promotes KAMI's design philosophy and argues for its superiority over traditional benchmarks, without disclosing this as a conflict of interest."
    392     },
    393     {
    394       "flag": "No competing interests statement",
    395       "detail": "No financial disclosure or competing interests statement despite the author's commercial interest in the KAMI benchmark through Kamiwaza AI."
    396     },
    397     {
    398       "flag": "Small model sample",
    399       "detail": "Only 3 of ~60 tested models are analyzed qualitatively, yet findings are presented as general LLM failure patterns. The title 'How Do LLMs Fail' overstates the generalizability."
    400     },
    401     {
    402       "flag": "Data not yet released",
    403       "detail": "The 900 execution traces are promised for future release but not currently available for verification, making the qualitative findings unverifiable."
    404     },
    405     {
    406       "flag": "Single-author qualitative coding",
    407       "detail": "All 900 traces were manually analyzed by what appears to be a single researcher with no inter-rater reliability assessment. The emergent coding approach (Section 2.4) lacks the multi-coder verification that is standard in qualitative research."
    408     }
    409   ],
    410   "cited_papers": [
    411     {
    412       "title": "Can we trust ai benchmarks? an interdisciplinary review of current issues in ai evaluation",
    413       "authors": ["Maria Eriksson", "Erasmo Purificato", "Arman Noroozian"],
    414       "year": 2025,
    415       "relevance": "Interdisciplinary review of AI benchmark limitations directly relevant to evaluation methodology quality."
    416     },
    417     {
    418       "title": "Leakage in data mining: Formulation, detection, and avoidance",
    419       "authors": ["Shachar Kaufman", "Saharon Rosset", "Claudia Perlich", "Ori Stitelman"],
    420       "year": 2012,
    421       "relevance": "Foundational work on data leakage formalization relevant to benchmark contamination concerns."
    422     },
    423     {
    424       "title": "AI and the everything in the whole wide world benchmark",
    425       "authors": ["Inioluwa Deborah Raji", "Emily Denton", "Emily M Bender"],
    426       "year": 2021,
    427       "relevance": "Critique of AI benchmarks and construct validity failures in ML evaluation."
    428     },
    429     {
    430       "title": "Safetywashing: Do ai safety benchmarks actually measure safety progress?",
    431       "authors": ["Richard Ren", "Steven Basart", "Adam Khoja"],
    432       "year": 2024,
    433       "relevance": "Analysis of whether safety benchmarks have construct validity, directly relevant to benchmark methodology quality."
    434     },
    435     {
    436       "title": "Data contamination through the lens of time",
    437       "authors": ["Manley Roberts", "Himanshu Thakur", "Christine Herlihy"],
    438       "year": 2023,
    439       "relevance": "Studies temporal aspects of benchmark data contamination in LLM evaluation."
    440     },
    441     {
    442       "title": "Benchmark data contamination of large language models: A survey",
    443       "authors": ["Cheng Xu", "Shuhao Guan", "Derek Greene"],
    444       "year": 2024,
    445       "relevance": "Survey of benchmark contamination in LLMs, core concern for evaluation methodology."
    446     },
    447     {
    448       "title": "Benchmarking benchmark leakage in large language models",
    449       "authors": ["Ruijie Xu", "Zengzhi Wang", "Run-Ze Fan", "Pengfei Liu"],
    450       "year": 2024,
    451       "relevance": "Systematic study of benchmark leakage mechanisms in LLM evaluation."
    452     },
    453     {
    454       "title": "Agentic misalignment: How llms could be an insider threat",
    455       "authors": ["Aengus Lynch", "Benjamin Wright", "Caleb Larson"],
    456       "year": 2025,
    457       "relevance": "Anthropic research on agentic LLM failure modes including the 'Chekhov's gun' effect cited in this paper."
    458     },
    459     {
    460       "title": "Testing what models can do, not what they've seen: PICARD: Probing intelligent capabilities via artificial randomized data",
    461       "authors": ["JV Roig"],
    462       "year": 2025,
    463       "relevance": "The PICARD evaluation framework underlying KAMI, proposing randomized data generation to prevent benchmark contamination."
    464     },
    465     {
    466       "title": "Towards a standard, enterprise-relevant agentic ai benchmark: Lessons from 5.5 billion tokens' worth of agentic ai evaluations",
    467       "authors": ["JV Roig"],
    468       "year": 2025,
    469       "relevance": "The KAMI v0.1 benchmark paper describing the full evaluation framework and initial results across ~60 models."
    470     }
    471   ]
    472 }

Impressum · Datenschutz