ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (21331B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "position",
      4   "paper": {
      5     "title": "Epistemic Alignment: A Mediating Framework for User-LLM Knowledge Delivery",
      6     "authors": [
      7       "Nicholas Clark",
      8       "Hua Shen",
      9       "Bill Howe",
     10       "Tanushree Mitra"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2504.01205",
     15     "doi": "10.48550/arXiv.2504.01205"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The abstract's core claims — that users lack structured mechanisms to specify epistemic preferences, that prompt sharing exists as folklore, and that providers have only partially addressed these challenges — are each substantiated by the Reddit thematic analysis (Section 5) and the provider content analysis (Section 6).",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": false,
     27         "answer": false,
     28         "justification": "The paper is primarily a framework proposal and descriptive analysis; it does not make causal claims about what produces what outcome in an experimental sense.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper generalizes findings from 128 Reddit custom instructions drawn from tech-savvy LLM subreddits to 'users' broadly, and from two providers' policy documents to 'current systems' generally, without bounding these generalizations to the sampled populations.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The finding that 92.1% of custom instructions address at least one challenge is presented as framework validation without considering that a broad 10-category framework mapped onto a targeted corpus could achieve high coverage through category breadth rather than genuine capture of user concerns.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Reddit custom instructions from four LLM-adjacent subreddits are treated as representative of 'user knowledge preferences in practice' without acknowledging that this population is systematically unrepresentative of general LLM users.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "There is no dedicated limitations or threats-to-validity section; the paper moves directly from Section 6 (platform evaluation) to Section 7 (discussion and conclusion) with no systematic treatment of limitations.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No specific threats are named — the Reddit sample bias, potential circularity in framework validation, and use of LLMs to label LLM-related data are all unaddressed.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The conclusion claims the framework 'avoids domain-specific problems' and is 'versatile for evaluation across contexts,' asserting broad applicability without stating what settings or user populations it does not address.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No acknowledgments or funding disclosure appears anywhere in the paper.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All four authors are identified as affiliated with the University of Washington on the title page.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funding is disclosed, making this criterion not applicable.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement or financial disclosure is present in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key constructs are formally defined, including 'epistemic profile' as a three-component mathematical vector and the three framework dimensions (epistemic responsibility, personalization, testimonial reliability) defined in Section 3.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The introduction explicitly lists four contributions: the framework itself, its validation via thematic analysis, assessment of current systems, and interface design implications.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper situates itself within epistemology (Goldman, Hookway, de Ridder), epistemic cognition (Chinn & Rinehart AIR framework), and LLM literature (sycophancy, hallucination, uncertainty expression), distinguishing its contribution as a structured intermediary framework.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "position": {
    119       "argument_quality": {
    120         "argument_internally_consistent": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "The paper's logical flow is coherent: it derives a framework from established epistemology, validates that Reddit users encounter the identified challenges, and applies the framework to assess provider policies — each step follows from the prior.",
    124           "source": "haiku"
    125         },
    126         "counterarguments_addressed": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "The paper does not engage with the strongest counterarguments — e.g., that natural language may be adequate for most users, that structured epistemic interfaces could impose undue cognitive overhead, or that the 10 challenges are too abstract to be actionable.",
    130           "source": "haiku"
    131         },
    132         "analogies_appropriate": {
    133           "applies": true,
    134           "answer": true,
    135           "justification": "Analogies to Wikipedia's neutral-point-of-view policy, library science epistemic virtues (Fallis 2008), and hypothesis-testing Type I/II errors are appropriate and grounded in cited literature.",
    136           "source": "haiku"
    137         },
    138         "prescriptions_proportional": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "The four-component interface redesign proposed in Section 7 (structured preference specification, transparency annotations, adaptive personalization, contextual guidance) exceeds what a Reddit thematic analysis and policy document review can support.",
    142           "source": "haiku"
    143         },
    144         "evidence_for_claims_cited": {
    145           "applies": true,
    146           "answer": true,
    147           "justification": "Factual claims are consistently backed by citations — over-abstention cites Varshney et al. (2023) and Cheng et al. (2024), sycophancy cites Sharma et al. (2023), citations inflating trust cites Ding et al. (2025).",
    148           "source": "haiku"
    149         },
    150         "alternatives_discussed": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "The paper proposes its framework and interface redesign without discussing alternative approaches to the epistemic alignment problem, such as preference-tuned models, RLHF-based personalization, or structured prompting templates.",
    154           "source": "haiku"
    155         },
    156         "historical_context_accurate": {
    157           "applies": true,
    158           "answer": true,
    159           "justification": "References to the epistemology tradition (Goldman 1991, Hookway 1994, 2003), Wikipedia's editorial policies, and library science epistemic virtues appear accurate and properly cited.",
    160           "source": "haiku"
    161         }
    162       },
    163       "clarity_and_scope": {
    164         "key_terms_defined_precisely": {
    165           "applies": true,
    166           "answer": true,
    167           "justification": "'Epistemic alignment problem' is defined formally as d(Eu, Es) > θ where epistemic profiles are expressed as mathematical vectors; all three framework dimensions are defined with specific components in Section 3.",
    168           "source": "haiku"
    169         },
    170         "engages_with_existing_literature": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "The paper substantively engages with epistemology (de Ridder, Hookway), epistemic cognition (Chinn & Rinehart AIR framework), social epistemology (Goldman, Lackey), and recent LLM research, showing how each informs a distinct part of the framework.",
    174           "source": "haiku"
    175         },
    176         "intended_audience_clear": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "The abstract and conclusion explicitly identify AI developers and users as the target audience, stating the framework 'offers concrete guidance for supporting diverse approaches to knowledge' for developers and 'works toward information delivery' for users.",
    180           "source": "haiku"
    181         },
    182         "assumptions_stated": {
    183           "applies": true,
    184           "answer": false,
    185           "justification": "The paper assumes epistemological frameworks from academic philosophy map cleanly onto LLM interactions, that Reddit users proxy for LLM users generally, and that policy documents reflect actual system behavior — none of these assumptions are explicitly stated or defended.",
    186           "source": "haiku"
    187         },
    188         "scope_of_applicability_discussed": {
    189           "applies": true,
    190           "answer": false,
    191           "justification": "The paper claims the framework avoids 'domain-specific problems' and is a 'versatile tool for evaluation across contexts' without specifying where it would not apply or what its boundary conditions are.",
    192           "source": "haiku"
    193         }
    194       }
    195     }
    196   },
    197   "claims": [
    198     {
    199       "claim": "Users cannot effectively specify epistemic preferences to LLMs using current natural language interfaces.",
    200       "evidence": "Supported by thematic analysis of 128 Reddit custom instructions showing elaborate workarounds, and by content analysis finding neither OpenAI nor Anthropic provides structured controls for epistemic dimensions.",
    201       "supported": "moderate"
    202     },
    203     {
    204       "claim": "92.1% of analyzed custom instructions address at least one epistemic alignment challenge.",
    205       "evidence": "Based on GPT-4o classification of 128 Reddit custom instructions with human validation (inter-rater reliability κ=0.8875); however, the framework's breadth makes high coverage unsurprising.",
    206       "supported": "moderate"
    207     },
    208     {
    209       "claim": "OpenAI's Model Spec addresses all ten epistemic challenges at the policy specification level.",
    210       "evidence": "Content analysis found explicit references to all ten challenges in the Model Spec, though specific methodology for hedging and viewpoints was noted to be limited.",
    211       "supported": "moderate"
    212     },
    213     {
    214       "claim": "Both OpenAI and Anthropic lack structured interface mechanisms for users to specify or verify epistemic preferences.",
    215       "evidence": "Content analysis of model cards, changelogs, and blog posts found no structured controls for citation standards, uncertainty expression, or perspective balance in either platform's interface.",
    216       "supported": "moderate"
    217     },
    218     {
    219       "claim": "A 'prompt sharing folklore' pattern exists where community-specific prompts are shared through trust relationships without measured efficacy.",
    220       "evidence": "Observational claim based on the existence of Reddit sharing behavior; no baseline comparison to systematic sharing or efficacy measurement is provided.",
    221       "supported": "weak"
    222     },
    223     {
    224       "claim": "The Reddit custom instruction analysis validates that the Epistemic Alignment Framework captures challenges users actually face.",
    225       "evidence": "Finding instances of all 10 challenges in Reddit instructions is presented as validation, but the framework was derived prior to, and then mapped onto, the data — potential circularity is not addressed.",
    226       "supported": "weak"
    227     }
    228   ],
    229   "methodology_tags": [
    230     "qualitative",
    231     "theoretical"
    232   ],
    233   "key_findings": "The paper proposes a 10-challenge Epistemic Alignment Framework derived from academic epistemology, covering uncertainty expression, perspective diversity, source reliability, and knowledge personalization in LLM interactions. A thematic analysis of 128 Reddit custom instructions found 92.1% addressed at least one framework challenge, and content analysis of OpenAI and Anthropic policy documents found partial coverage of all 10 challenges at the policy level but no structured interface mechanisms for preference specification or verification. The paper identifies three categories of user folk theories (Suppressing Default Behavior, Expert Persona, Parameter Configuration) that emerge as workarounds in the absence of structured epistemic controls. The authors call for redesigned interfaces with structured preference controls, transparency annotations, adaptive personalization, and contextual guidance.",
    234   "red_flags": [
    235     {
    236       "flag": "Biased sample",
    237       "detail": "The Reddit custom instructions sample (128 comments from r/ChatGPT, r/ChatGPTPro, r/OpenAI, r/Anthropic) is drawn from self-selected, technically sophisticated users and generalized to 'users' broadly without acknowledgment of this limitation."
    238     },
    239     {
    240       "flag": "Circular validation",
    241       "detail": "The framework was derived from epistemology literature, then 'validated' by finding instances of each challenge in Reddit data. A 10-category framework applied to a targeted corpus will almost always find coverage for each category."
    242     },
    243     {
    244       "flag": "LLM-labeled LLM data",
    245       "detail": "GPT-4o-mini and GPT-4o were used to extract and label custom instructions about LLM behavior; the potential bias of using the evaluated system type to assess user experiences of that system type is not discussed."
    246     },
    247     {
    248       "flag": "No limitations section",
    249       "detail": "The paper contains no dedicated limitations or threats-to-validity section, omitting discussion of sample bias, measurement validity, and generalizability constraints entirely."
    250     },
    251     {
    252       "flag": "No funding disclosure",
    253       "detail": "The paper contains no acknowledgments or funding statement, providing no information about potential financial conflicts of interest."
    254     },
    255     {
    256       "flag": "Prescriptions exceed evidence",
    257       "detail": "The four-component interface redesign proposed in Section 7 is not grounded in user studies or design validation — it is presented as concrete design guidance based solely on a Reddit analysis and policy document review."
    258     }
    259   ],
    260   "cited_papers": [
    261     {
    262       "title": "Survey of Hallucination in Natural Language Generation",
    263       "relevance": "Foundational reference on LLM hallucination that motivates the epistemic alignment problem"
    264     },
    265     {
    266       "title": "Towards Understanding Sycophancy in Language Models",
    267       "relevance": "Core empirical reference for the sycophancy challenge within the framework"
    268     },
    269     {
    270       "title": "A Roadmap to Pluralistic Alignment",
    271       "relevance": "Framework for pluralistic LLM responses directly used for the range-of-viewpoints challenge"
    272     },
    273     {
    274       "title": "Towards Bidirectional Human-AI Alignment: A Systematic Review for Clarifications, Framework, and Future Directions",
    275       "relevance": "Cited for the bidirectional alignment framing that contextualizes the epistemic alignment problem"
    276     },
    277     {
    278       "title": "Citations and Trust in LLM Generated Responses",
    279       "relevance": "Key empirical finding that citations increase user trust even when randomly generated — central evidence for citation verification challenge"
    280     },
    281     {
    282       "title": "Enabling Large Language Models to Generate Text with Citations",
    283       "relevance": "Technical approaches to grounded citation generation in LLMs"
    284     },
    285     {
    286       "title": "Can Large Language Models Faithfully Express Their Intrinsic Uncertainty in Words?",
    287       "relevance": "Empirical study on LLM uncertainty expression relevant to calibration and hedging challenges"
    288     },
    289     {
    290       "title": "Online Illusions of Understanding",
    291       "relevance": "Core epistemological concept motivating the framework's concern about LLMs masking shallow inquiry"
    292     },
    293     {
    294       "title": "Personalization of Large Language Models: A Survey",
    295       "relevance": "Survey on LLM personalization techniques relevant to the epistemic personalization dimension"
    296     },
    297     {
    298       "title": "Toward an Epistemology of Wikipedia",
    299       "relevance": "Establishes epistemic virtues (reliability, power, speed, fecundity) used to compare LLMs against legacy knowledge institutions"
    300     }
    301   ],
    302   "engagement_factors": {
    303     "practical_relevance": {
    304       "score": 2,
    305       "justification": "Directly relevant to LLM users and developers seeking to improve how knowledge preferences are specified in AI interfaces."
    306     },
    307     "surprise_contrarian": {
    308       "score": 1,
    309       "justification": "The 'prompt sharing as folklore' framing is mildly novel but the core argument — that LLMs lack structured preference mechanisms — is not surprising to practitioners."
    310     },
    311     "fear_safety": {
    312       "score": 1,
    313       "justification": "Touches on sycophancy and misinformation risks but does not emphasize dramatic safety consequences."
    314     },
    315     "drama_conflict": {
    316       "score": 1,
    317       "justification": "Critiques OpenAI and Anthropic for gaps in epistemic support, but the critique is measured and academic rather than confrontational."
    318     },
    319     "demo_ability": {
    320       "score": 0,
    321       "justification": "The framework is purely conceptual; no tool, prototype, or interactive demo is provided or referenced."
    322     },
    323     "brand_recognition": {
    324       "score": 1,
    325       "justification": "University of Washington affiliation and analysis of OpenAI/Anthropic products provide moderate recognition, but no famous lab or product is being introduced."
    326     }
    327   },
    328   "hn_data": {
    329     "threads": [
    330       {
    331         "hn_id": "47634936",
    332         "title": "Reasoning models encode tool choices before they start reasoning",
    333         "points": 3,
    334         "comments": 0,
    335         "url": "https://news.ycombinator.com/item?id=47634936"
    336       },
    337       {
    338         "hn_id": "45116073",
    339         "title": "Towards Agentic OS: An LLM Agent Framework for Linux Schedulers",
    340         "points": 3,
    341         "comments": 0,
    342         "url": "https://news.ycombinator.com/item?id=45116073"
    343       },
    344       {
    345         "hn_id": "43729852",
    346         "title": "MageSQL: Enhancing In-Context Learning for Text-to-SQL Applications with LLMs",
    347         "points": 2,
    348         "comments": 0,
    349         "url": "https://news.ycombinator.com/item?id=43729852"
    350       },
    351       {
    352         "hn_id": "42724278",
    353         "title": "Abundant Water from Early Supernovae at Cosmic Dawn",
    354         "points": 2,
    355         "comments": 0,
    356         "url": "https://news.ycombinator.com/item?id=42724278"
    357       },
    358       {
    359         "hn_id": "44756018",
    360         "title": "Ask HN: Is manually discovering and configuring MCP servers the only way?",
    361         "points": 1,
    362         "comments": 3,
    363         "url": "https://news.ycombinator.com/item?id=44756018"
    364       },
    365       {
    366         "hn_id": "47622971",
    367         "title": "When a reasoning LLM chooses, which comes first: thought or decision?",
    368         "points": 1,
    369         "comments": 0,
    370         "url": "https://news.ycombinator.com/item?id=47622971"
    371       },
    372       {
    373         "hn_id": "44605627",
    374         "title": "Long-Sequence Memory with Temporal Kernels and Dense Hopfield Functionals",
    375         "points": 1,
    376         "comments": 0,
    377         "url": "https://news.ycombinator.com/item?id=44605627"
    378       },
    379       {
    380         "hn_id": "44276405",
    381         "title": "Relic: Evaluating Compositional Instruction Following via Language Recognition",
    382         "points": 1,
    383         "comments": 0,
    384         "url": "https://news.ycombinator.com/item?id=44276405"
    385       },
    386       {
    387         "hn_id": "43358918",
    388         "title": "The Countable Reals (2024)",
    389         "points": 1,
    390         "comments": 0,
    391         "url": "https://news.ycombinator.com/item?id=43358918"
    392       },
    393       {
    394         "hn_id": "40220945",
    395         "title": "Search for gravitationally lensed interstellar transmissions",
    396         "points": 1,
    397         "comments": 0,
    398         "url": "https://news.ycombinator.com/item?id=40220945"
    399       }
    400     ],
    401     "top_points": 3,
    402     "total_points": 16,
    403     "total_comments": 3
    404   }
    405 }

Impressum · Datenschutz