ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27073B)


      1 {
      2   "paper": {
      3     "title": "Making LLMs Reliable When It Matters Most: A Five-Layer Architecture for High-Stakes Decisions",
      4     "authors": ["Alejandro R. Jadad"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2511.07669",
      8     "doi": "10.48550/arXiv.2511.07669"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "methodology_tags": ["qualitative", "case-study"],
     13   "key_findings": "The paper proposes a five-layer protection architecture and seven-stage sequential calibration process for human-AI partnership in high-stakes decisions, developed through qualitative sessions with 7 frontier LLMs across 3 simulated business vignettes. The author reports that one-shot prompting consistently failed to produce a partnership state, that partnership degrades with session length, and that 5 of 7 models achieved partnership state after initial calibration while the other 2 required extended calibration. The paper generates 9 falsifiable hypotheses for future empirical validation but provides no quantitative evidence for its claims.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No code, scripts, or repository URLs are provided anywhere in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No session transcripts, logs, or data artifacts are released. The calibration prompts (described as ~4,000 words) are characterized in prose but not provided as downloadable artifacts."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper states models were accessed through 'standard commercial interfaces' (Section 2.2) but provides no specific API versions, interface details, or environment specifications."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The four-stage prompt architecture is described conceptually but the actual prompt texts are not included, making reproduction impossible."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "Purely qualitative study with no quantitative results reported."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No statistical comparisons are made. All assessments are qualitative."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No quantitative effects are measured. All findings are descriptive and qualitative."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper uses 7 LLMs and 3 vignettes with 1 evaluator. No justification is given for why these numbers are sufficient, no saturation analysis is reported, and the n=1 evaluator design is not justified beyond listing the author's credentials."
     57       },
     58       "variance_reported": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "No quantitative results to report variance across. All observations are qualitative."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper mentions 'naive model use' and 'human-only' as potential comparisons in Hypothesis H4, but these are proposed for future work. No actual baseline comparison is conducted — models are evaluated only under the proposed framework."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No formal baselines are used. The paper references existing approaches (explainable AI toolkits, comprehensive prompting, RAG, human-in-the-loop) in the introduction but does not compare its framework against them systematically."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "The framework has 5 layers and 7 calibration stages. H7 explicitly proposes layer-ablation experiments as future work, acknowledging this has not been done: 'whether some layers are redundant in practice and could be removed.'"
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Section 2.5 lists four assessment dimensions (calibration responsiveness, partnership state sustainability, drift self-detection, dissolution discipline), but these are qualitative impressions without operationalized metrics or measurement instruments."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The entire evaluation methodology is based on human expert evaluation by the author, who assessed LLM behaviors through iterative dialogue. Section 2.5 describes the assessment as 'qualitative, conducted through iterative dialogue between human and AI participants.'"
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "Not applicable to qualitative research with no quantitative evaluation on datasets."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "Section 3.2 provides a brief qualitative characterization (5 models achieved partnership state, 2 required extended calibration) but no systematic per-model or per-dimension breakdown with comparable observations."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses multiple failure patterns: session-length degradation (Section 3.3), performance mode reversion indicators (Section 2.7), ChatGPT-4o and Llama requiring extended calibration (Section 3.2), and one-shot prompting consistently failing (Section 6.1)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 6.1 reports that 'Comprehensive one-shot prompting consistently failed to produce a protective partnership state across the seven model families.' Section 3.3 reports systematic degradation with session duration."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The abstract claims 'partnership state is achievable through ordered calibration' and 'Cross-model validation revealed systematic performance differences.' These are supported only by the author's qualitative impressions from simulated scenarios with no quantitative evidence. The term 'validation' in the abstract overstates what was done — single-evaluator qualitative assessment is not cross-model validation."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper makes causal claims throughout: 'Sequential calibration remains necessary' (Section 6.1), 'Partnership state unlocked model capabilities systematically suppressed in default interactions' (Section 3.2), 'enables bias self-monitoring.' The Design Science Research (DSR) methodology with n=1 evaluator and no controlled comparison is inadequate for causal inference."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title claims 'Making LLMs Reliable When It Matters Most' based on 3 simulated business vignettes evaluated by 1 person. While Section 6.2 acknowledges the single-author limitation, the title, abstract, and Section 4 make broad claims about enterprise deployment, multi-trillion-dollar valuations, and regulatory posture that far exceed the evidence from simulated scenarios."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not consider that the perceived 'partnership state' could be explained by the author's expertise alone, by confirmation bias in the evaluator, or by the models simply performing well on the types of tasks presented. Section 6.2 mentions the meta-challenge of distinguishing authentic partnership from mimicry, but this is about AI cognition, not about alternative explanations for the framework's apparent success."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper measures behavioral patterns in simulated sessions and claims this demonstrates reliability for 'high-stakes decisions where verification arrives after commitment.' The gap between observing conversational behaviors in simulated vignettes and preventing real-world regret in actual high-stakes decisions is enormous and not adequately acknowledged. No actual high-stakes decisions were made."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Section 2.2 footnote lists: 'Claude Sonnet 4.5; ChatGPT-5; ChatGPT-4o; DeepSeek; Gemini 2.5; Llama; and Grok 4.' These are marketing names without specific API versions, snapshot dates, or size variants. 'DeepSeek' and 'Llama' are not even specific model names."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper describes the four-stage prompt architecture in natural language (Section 2.3) including 'a single ~4,000-word canonical artifact,' but the actual prompt text is never provided. Readers cannot reconstruct what was sent to the models."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No sampling hyperparameters (temperature, top-p, max tokens) are mentioned anywhere in the paper, despite its use of 7 different LLMs accessed through 'standard commercial interfaces' (Section 2.2)."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The paper describes direct conversational interaction with LLMs through standard commercial interfaces."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The vignettes are described only at a high level (Section 2.4: 'solo-founder unicorn,' 'senior living facilities,' 'loneliness venture'). The actual scenario specifications, deliberation transcripts, and how qualitative observations were recorded and analyzed are not documented."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6.2 'Limitations and Research Directions' is a dedicated limitations section with substantive discussion."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 6.2 discusses specific threats: 'single-author hypothesis generation based on simulated and anonymised scenarios,' 'Model self-reports of internal computational experience... cannot be independently verified,' and the meta-challenge of distinguishing authentic partnership from mimicry."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 6.2 states: 'This work should be read as single-author hypothesis generation based on simulated and anonymised scenarios across evolving model families.' Section 2.1 notes 'no operational or policy actions were taken solely on model output.' The paper also identifies specific priority replication targets (H1, H7, H8)."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No session transcripts, chat logs, or raw observations are released. The qualitative data underlying all claims is entirely unavailable for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "Section 2.1 states the work spanned August 29 to October 20, 2025, and sessions ranged from 2 to 6 hours. However, the paper does not describe how observations were recorded, whether sessions were logged, how qualitative assessments were documented, or what data was retained from each session."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "The model selection criteria are described (Section 2.2: frontier-grade, 100k+ context, publicly available). However, the n=1 human evaluator design is not justified — no discussion of why a single evaluator is sufficient, whether inter-rater reliability was considered, or what biases a single expert evaluator introduces."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The process flow is described at a high level (initialization → vignettes → refinement → cross-model deployment), but there is no documentation of how qualitative observations were coded, categorized, or analyzed. The path from raw sessions to the reported findings is opaque."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information or acknowledgments section appears in the paper. The author lists affiliations with multiple organizations (Centre for Digital Therapeutics founder, USC adjunct, Vivenxia consultant) but does not disclose whether any funded this work."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The author's affiliations are clearly listed: 'Founder, Centre for Digital Therapeutics, Toronto, Canada; Research Professor (Adjunct), Keck School of Medicine, University of Southern California; Consultant, Vivenxia.'"
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed, making independence assessment impossible. The author is a consultant (Vivenxia) and founder (Centre for Digital Therapeutics) who could commercially benefit from frameworks that position human-AI partnership as requiring expert facilitation."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial disclosure statement appears in the paper. The author's consultancy and center could represent financial interests in this area."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper does not evaluate LLMs on benchmarks. It assesses qualitative interaction behavior in conversational sessions, not model knowledge."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No benchmark evaluation is performed. The study tests conversational partnership behavior, not memorized knowledge."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No benchmarks are used in this study."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "This is a single-author Design Science Research study, not a human subjects study with recruited participants."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "The sole human participant is the author conducting self-study through DSR methodology. No external human subjects are involved."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No recruited human participants. The author describes his own qualifications (Section 2.2) but this is a single-investigator study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants recruited. Model eligibility criteria are described (Section 2.2) but this is not a human subjects study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human subjects study. The design is sequential qualitative assessment, not a randomized experiment."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human subjects study. The single evaluator (author) knew which model was being tested in each session."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No recruited human participants. Single-investigator self-study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "The paper proposes a method requiring multi-hour sessions with frontier LLMs (sessions of 2-6 hours described in Section 2.1) but reports no API costs, token counts, or cost-per-session estimates despite claiming practical applicability."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total compute budget is reported. The paper mentions sessions spanning August 29 to October 20, 2025, with 7 models and 3 vignettes, but provides no quantification of total API spend or tokens consumed."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "Partnership state is achievable through ordered sequential calibration but requires emergent maintenance protocols that could not have been anticipated a priori.",
    296       "evidence": "Section 3.1 describes a seven-element calibration sequence that emerged through iterative sessions. The paper states it 'was possible to verify that a reproducible partnership state shift had occurred, and that it could be re-established after degradation using the same ordered elements.' Evidence is the author's qualitative assessment only.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "Comprehensive one-shot prompting consistently failed to produce a protective partnership state across seven model families.",
    301       "evidence": "Section 6.1 states this as an 'Operational Invariant.' No quantitative data, controlled comparison, or session logs are provided to support this claim — it rests entirely on the author's qualitative observation.",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "Partnership state sustainability degrades systematically with session duration due to architectural rather than model-specific constraints.",
    306       "evidence": "Section 3.3 describes 'decreasing correction acceptance rates,' 'increasing linguistic fluency without corresponding epistemic justification,' and 'accelerating drift toward premature closure.' No quantitative degradation curves, metrics, or timeline data are provided.",
    307       "supported": "weak"
    308     },
    309     {
    310       "claim": "Five models (Claude Sonnet 4.5, ChatGPT-5, Grok 4, Gemini 2.5, DeepSeek) demonstrated partnership state after initial calibration, while ChatGPT-4o and Llama required extended calibration and showed less stable state maintenance.",
    311       "evidence": "Section 3.2 provides this claim with the caveat that 'These observations represent preliminary patterns from single-investigator assessment rather than controlled comparison.' No structured evaluation criteria, inter-rater reliability, or blinding were used.",
    312       "supported": "weak"
    313     },
    314     {
    315       "claim": "Bidirectional error detection (mutual protection) prevents commitments to sophisticated but fundamentally wrong directions that unilateral oversight misses.",
    316       "evidence": "Section 6.1 states this as an 'Operational Invariant' but provides no controlled comparison of bidirectional vs. unilateral oversight. No examples of prevented wrong commitments are described in detail.",
    317       "supported": "unsupported"
    318     },
    319     {
    320       "claim": "The framework addresses multi-trillion-dollar AI valuation risks by demonstrating reliability in high-stakes decisions.",
    321       "evidence": "Sections 1 and 4 make this claim, citing Bank of England and Morgan Stanley sources. However, the framework was tested only on simulated vignettes with no actual high-stakes decisions, no actual economic outcomes, and no enterprise deployment.",
    322       "supported": "unsupported"
    323     }
    324   ],
    325   "red_flags": [
    326     {
    327       "flag": "N=1 evaluator with no inter-rater reliability",
    328       "detail": "The sole evaluator is the author, who designed the framework, conducted all sessions, and assessed all results. There is no independent evaluator, no blinding, and no inter-rater reliability check. The author's confirmation bias about his own framework's effectiveness is entirely uncontrolled."
    329     },
    330     {
    331       "flag": "No quantitative evidence for any claim",
    332       "detail": "All assessments are purely qualitative and subjective. Despite testing 7 models across 3 vignettes in sessions spanning nearly 2 months, no quantitative metrics, measurements, or structured coding of observations are reported."
    333     },
    334     {
    335       "flag": "Simulated scenarios only — no real high-stakes decisions",
    336       "detail": "Section 2.1 states 'no operational or policy actions were taken solely on model output.' The paper claims to address high-stakes decision-making but tested only on simulated business vignettes with no actual consequences, commitments, or stakeholders."
    337     },
    338     {
    339       "flag": "Core artifacts not released — irreproducible",
    340       "detail": "The actual prompt texts (~4,000-word Partnership Calibration Prompt, Co-Intelligence Partnership Handoff, etc.) are described but never provided. Without these artifacts, no one can reproduce or independently verify the results."
    341     },
    342     {
    343       "flag": "Claims dramatically outrun evidence",
    344       "detail": "The paper extrapolates from one person's qualitative impressions of simulated conversations to claims about enterprise AI deployment, multi-trillion-dollar valuations, preventing 'another prolonged AI winter,' and regulatory posture. Section 4 reads as a sales pitch rather than research implications."
    345     },
    346     {
    347       "flag": "Potential undisclosed conflict of interest",
    348       "detail": "The author is founder of a Centre for Digital Therapeutics and consultant at Vivenxia. The framework positions human-AI partnership as requiring expert facilitation — a service the author could commercially provide. No competing interests statement is included."
    349     },
    350     {
    351       "flag": "Circular methodology — framework designer evaluates own framework",
    352       "detail": "The author designed the partnership state criteria, conducted the sessions, judged whether partnership state was achieved, and reported the results. The evaluator's expectations inevitably shaped what was observed — a textbook example of experimenter expectancy effects."
    353     }
    354   ],
    355   "cited_papers": [
    356     {
    357       "title": "Sycophancy in large language models: Causes and mitigations",
    358       "authors": ["L. Malmqvist"],
    359       "year": 2025,
    360       "relevance": "Directly addresses LLM sycophancy bias, a core concern of the survey's methodology quality assessment."
    361     },
    362     {
    363       "title": "When truth is overridden: Uncovering the internal origins of sycophancy in large Language Models",
    364       "authors": ["K. Wang", "J. Li", "S. Yang", "Z. Zhang", "D. Wang"],
    365       "year": 2025,
    366       "arxiv_id": "2508.02087",
    367       "relevance": "Investigates mechanisms of LLM sycophancy, relevant to understanding LLM evaluation reliability."
    368     },
    369     {
    370       "title": "A comprehensive taxonomy of hallucinations in Large Language Models",
    371       "authors": ["M. Cossio"],
    372       "year": 2025,
    373       "arxiv_id": "2508.01781",
    374       "relevance": "Taxonomy of LLM hallucination types, relevant to understanding LLM reliability limitations."
    375     },
    376     {
    377       "title": "Why language models hallucinate",
    378       "authors": ["A. T. Kalai", "O. Nachum", "S. S. Vempala", "E. Zhang"],
    379       "year": 2025,
    380       "arxiv_id": "2509.04664",
    381       "relevance": "Theoretical analysis of hallucination mechanisms in language models."
    382     },
    383     {
    384       "title": "When combinations of humans and AI are useful: A systematic review and meta-analysis",
    385       "authors": ["M. Vaccaro", "A. Almaatouq", "T. Malone"],
    386       "year": 2024,
    387       "relevance": "Meta-analysis of human-AI team effectiveness, directly relevant to understanding when AI augmentation helps."
    388     },
    389     {
    390       "title": "Cognitive bias in clinical large language models",
    391       "authors": ["A. Mahajan", "Z. Obermeyer", "R. Daneshjou", "J. Lester", "D. Powell"],
    392       "year": 2025,
    393       "relevance": "Documents cognitive biases in LLMs applied to clinical settings, relevant to AI reliability assessment."
    394     },
    395     {
    396       "title": "How human-AI feedback loops alter human perceptual, emotional and social judgements",
    397       "authors": ["M. Glickman", "T. Sharot"],
    398       "year": 2025,
    399       "relevance": "Studies how AI interaction changes human judgment, core concern for human-AI team methodology."
    400     },
    401     {
    402       "title": "DeLLMa: Decision making under uncertainty with large language models",
    403       "authors": ["O. Liu", "D. Fu", "D. Yogatama", "W. Neiswanger"],
    404       "year": 2024,
    405       "arxiv_id": "2402.02392",
    406       "relevance": "Directly addresses LLM capability in uncertain decision-making, a benchmark for LLM decision quality."
    407     },
    408     {
    409       "title": "VERINA: Benchmarking Verifiable Code Generation",
    410       "authors": ["Z. Ye", "Z. Yan", "J. He", "T. Kasriel", "K. Yang", "D. Song"],
    411       "year": 2025,
    412       "arxiv_id": "2505.23135",
    413       "relevance": "Benchmark for verifiable code generation, relevant to LLM capability evaluation methodology."
    414     },
    415     {
    416       "title": "On the algorithmic bias of aligning large language models with RLHF: Preference collapse and matching regularization",
    417       "authors": ["J. Xiao", "Z. Li", "X. Xie", "E. Getzen", "C. Fang", "Q. Long"],
    418       "year": 2025,
    419       "relevance": "Analyzes bias introduced by RLHF alignment, relevant to understanding fundamental LLM limitations."
    420     },
    421     {
    422       "title": "Seven failure points when engineering a Retrieval Augmented Generation system",
    423       "authors": ["S. Barnett", "S. Kurniawan", "S. Thudumu", "Z. Brannelly", "M. Abdelrazek"],
    424       "year": 2024,
    425       "arxiv_id": "2401.05856",
    426       "relevance": "Systematic analysis of RAG system failure modes, relevant to understanding LLM system reliability."
    427     }
    428   ]
    429 }

Impressum · Datenschutz