scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21300B)
      1 {
      2   "paper": {
      3     "title": "Audit Trails for Accountability in Large Language Models",
      4     "authors": [
      5       "Victor Ojewale",
      6       "Harini Suresh",
      7       "Suresh Venkatasubramanian"
      8     ],
      9     "year": 2026,
     10     "venue": "Manuscript submitted to ACM",
     11     "arxiv_id": "2601.20727",
     12     "doi": null
     13   },
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper provides a GitHub link for the proof-of-concept library: https://github.com/victorojewale/audit-trail-PoC (footnote 1, Section 6)."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No dataset is released. The paper includes an illustrative audit trail excerpt in Appendix A but this is just a small example, not a research dataset. The PoC repository reportedly includes a small illustrative audit_trail.jsonl, but no research data per se is released."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications are provided. The paper describes a Python library but does not mention requirements.txt, Dockerfile, dependency versions, or setup instructions beyond what is in the README."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper mentions the PoC repository includes a README walkthrough (Section 6.4), but the paper itself does not contain step-by-step reproduction instructions. The README is referenced but its contents are not provided in the paper."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "This is a theoretical/design paper with no empirical experiments that produce quantitative results requiring confidence intervals."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No comparative empirical claims are made; the paper proposes a framework and proof-of-concept implementation without quantitative comparisons."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No empirical experiments are conducted; there are no effects to report sizes for."
     52       },
     53       "sample_size_justified": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No empirical study with samples is conducted. This is a theoretical/design paper."
     57       },
     58       "variance_reported": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "No experimental runs are conducted that would produce variance across runs."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper discusses related work (model cards, datasheets, MLOps tooling, HELM) but does not formally compare the proposed audit trail framework against these alternatives on any evaluation criteria."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No formal baseline comparison is conducted. Related work is discussed qualitatively but not evaluated against the proposed system."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "The system is presented as a conceptual framework with a proof-of-concept; there is no multi-component empirical evaluation where ablations would apply."
     79       },
     80       "multiple_metrics": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No quantitative evaluation is performed; there are no metrics to report."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "The paper proposes a system for auditors and developers but includes no user study, expert evaluation, or usability assessment of the tool or framework. The paper itself acknowledges the need for future 'empirical studies [that] could examine how developers, auditors, and regulators actually engage with audit trail data' (Section 9)."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "No dataset-based evaluation is performed; this criterion is structurally inapplicable."
     94       },
     95       "per_category_breakdown": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "No quantitative evaluation is performed that could be broken down by categories."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 8 (Discussion) discusses epistemic limits ('audit trails provide chronological and structural traceability but not causal explanation'), scalability challenges, ethical tensions (surveillance risks, privacy concerns), and conditions under which audit trails would be insufficient."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "No experiments are conducted, so there are no negative experimental results. While limitations are discussed, no empirical findings show what did not work."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims three contributions: (1) a lifecycle framework, (2) a reference architecture, and (3) a Python implementation. All three are presented in Sections 4, 5, and 6 respectively. The abstract does not overclaim empirical validation."
    116       },
    117       "causal_claims_justified": {
    118         "applies": false,
    119         "answer": false,
    120         "justification": "The paper does not make causal claims. It proposes a framework and describes what audit trails 'can support' and 'would' enable, using conditional and future-oriented language throughout."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper is clear that the PoC 'implements a representative subset of the reference architecture' (Section 6), is 'intentionally not a full governance platform' (Section 6), and future work is needed for empirical evaluation and adoption studies (Section 9)."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": false,
    129         "answer": false,
    130         "justification": "The paper presents no empirical results for which alternative explanations would need to be considered. It is a design/theoretical contribution."
    131       }
    132     },
    133     "setup_transparency": {
    134       "model_versions_specified": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "The paper does not evaluate any LLM. It proposes a framework for auditing LLM systems but does not itself use or test specific models."
    138       },
    139       "prompts_provided": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No prompting is used in this paper. It is a framework/tool design paper."
    143       },
    144       "hyperparameters_reported": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No model training or evaluation experiments are conducted that would involve hyperparameters."
    148       },
    149       "scaffolding_described": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "No agentic scaffolding is used or evaluated in this paper."
    153       },
    154       "data_preprocessing_documented": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No data is collected or preprocessed for empirical analysis. This is a theoretical/design paper."
    158       }
    159     },
    160     "limitations_and_scope": {
    161       "limitations_section_present": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 8 (Discussion) contains substantive discussion of limitations including epistemic limits, scalability challenges, complex system composition issues, and ethical/institutional tensions."
    165       },
    166       "threats_to_validity_specific": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 8 discusses specific limitations: audit trails cannot establish causation (epistemic limits), scaling to millions of records poses practical challenges, multi-component architectures require synchronized schemas, and logs could be misused for surveillance."
    170       },
    171       "scope_boundaries_stated": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The paper explicitly states the PoC 'is intentionally not a full governance platform' and 'implements a representative subset of the reference architecture' (Section 6). Section 8 notes audit trails provide traceability 'but not causal explanation.' Section 9 explicitly calls out what the paper does not show: no empirical user studies, no field experiments, no differential privacy implementation."
    175       }
    176     },
    177     "data_integrity": {
    178       "raw_data_available": {
    179         "applies": false,
    180         "answer": false,
    181         "justification": "No empirical data is collected. This is a theoretical/design paper with a proof-of-concept implementation."
    182       },
    183       "data_collection_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No data collection is performed. The paper proposes a framework and implements a PoC library."
    187       },
    188       "recruitment_methods_described": {
    189         "applies": false,
    190         "answer": false,
    191         "justification": "No participants or data subjects are involved. This is a design paper."
    192       },
    193       "data_pipeline_documented": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No data pipeline exists for this paper's own methodology. The paper describes how audit trail data pipelines would work in practice, but this is the contribution itself, not a research methodology."
    197       }
    198     },
    199     "conflicts_of_interest": {
    200       "funding_disclosed": {
    201         "applies": true,
    202         "answer": false,
    203         "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section mentioning grants or sponsors."
    204       },
    205       "affiliations_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "All three authors are identified with their affiliation at the Department of Computer Science, Brown University, USA."
    209       },
    210       "funder_independent_of_outcome": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of any funding disclosure means this criterion cannot be satisfied."
    214       },
    215       "financial_interests_declared": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No competing interests statement is present in the paper."
    219       }
    220     },
    221     "contamination": {
    222       "training_cutoff_stated": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "The paper does not evaluate any pre-trained model on a benchmark. It proposes a framework and tool for audit trails."
    226       },
    227       "train_test_overlap_discussed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No pre-trained model is evaluated on any benchmark."
    231       },
    232       "benchmark_contamination_addressed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No benchmark evaluation is conducted."
    236       }
    237     },
    238     "human_studies": {
    239       "pre_registered": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved in this study."
    243       },
    244       "irb_or_ethics_approval": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this study."
    248       },
    249       "demographics_reported": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this study."
    253       },
    254       "inclusion_exclusion_criteria": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this study."
    258       },
    259       "randomization_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants or experimental conditions are involved."
    263       },
    264       "blinding_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants or experimental conditions are involved."
    268       },
    269       "attrition_reported": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants are involved in this study."
    273       }
    274     },
    275     "cost_and_practicality": {
    276       "inference_cost_reported": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "This is a theoretical/design paper proposing a framework. It does not run experiments with LLM inference costs."
    280       },
    281       "compute_budget_stated": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "This is a theoretical/design paper. No significant computation was performed beyond implementing a small Python library."
    285       }
    286     }
    287   },
    288   "claims": [
    289     {
    290       "claim": "LLM audit trails can serve as a sociotechnical mechanism for continuous accountability by linking technical provenance with governance records in a chronological, tamper-evident ledger.",
    291       "evidence": "The paper presents a conceptual framework (Section 4), reference architecture (Section 5), and proof-of-concept implementation (Section 6) to demonstrate feasibility, but provides no empirical validation of effectiveness.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "The proposed audit trail framework can be integrated into existing LLM workflows with minimal code changes.",
    296       "evidence": "Section 6 describes a Python library (llm-audit-trail) with a HuggingFace TrainerCallback, FastAPI middleware, and governance CLI. Code snippets are shown in Figures 2-4. However, 'minimal' is not quantified and no integration cost study is conducted.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Existing documentation approaches (model cards, datasheets, MLOps tooling) leave an accountability gap because they are static snapshots or lack governance context.",
    301       "evidence": "Section 3 argues that model cards are 'curated and static' and MLOps tools are 'oriented toward developer needs, not governance.' The argument is based on analysis of existing tools rather than empirical measurement of the gap.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Hash-chain integrity verification makes tampering or deletion detectable in the audit trail.",
    306       "evidence": "Section 6.2 describes SHA-256 hash chaining where each event's curr_hash covers the serialized payload plus the previous hash. Section 6.3 describes a verify_log function. This is a standard cryptographic technique and the claim is well-grounded.",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": [
    311     "theoretical"
    312   ],
    313   "key_findings": "The paper proposes LLM audit trails as a reusable accountability layer consisting of three components: a lifecycle framework specifying what to log across pretraining, adaptation, deployment, and monitoring; a three-layer reference architecture (Capture, Store, Use) with emitter-based event collection, append-only tamper-evident storage, and auditor-facing query tools; and a proof-of-concept Python library demonstrating integration with HuggingFace Trainer, FastAPI, and a governance CLI. The work is entirely theoretical and design-oriented, with no empirical evaluation of the framework's effectiveness, adoption costs, or impact on accountability outcomes.",
    314   "red_flags": [
    315     {
    316       "flag": "No empirical evaluation",
    317       "detail": "The paper proposes a framework and implements a proof-of-concept but conducts no empirical evaluation whatsoever -- no user studies, no case deployments, no measurement of adoption costs, integration overhead, or actual accountability improvements. The motivating scenarios in Section 2 are hypothetical, and Section 7 revisits them only to describe how the audit trail 'would' work, not how it actually performed."
    318     },
    319     {
    320       "flag": "Feasibility claims without evidence",
    321       "detail": "The paper claims the audit trail can be integrated with 'minimal code changes' and 'modest overhead' (Section 6.5, Section 9) but provides no measurements of integration effort, runtime overhead, storage costs, or developer experience. These are qualitative assertions about the PoC, not empirical findings."
    322     }
    323   ],
    324   "cited_papers": [
    325     {
    326       "title": "AI auditing: The Broken Bus on the Road to AI Accountability",
    327       "authors": ["Abeba Birhane", "Ryan Steed", "Victor Ojewale", "Briana Vecchione", "Inioluwa Deborah Raji"],
    328       "year": 2024,
    329       "doi": "10.1109/SaTML59370.2024.00037",
    330       "relevance": "Directly addresses AI auditing gaps and accountability infrastructure, relevant to evaluating methodology of AI safety/governance research."
    331     },
    332     {
    333       "title": "Towards AI Accountability Infrastructure: Gaps and Opportunities in AI Audit Tooling",
    334       "authors": ["Victor Ojewale", "Ryan Steed", "Briana Vecchione", "Abeba Birhane", "Inioluwa Deborah Raji"],
    335       "year": 2025,
    336       "doi": "10.1145/3706598.3713301",
    337       "relevance": "Surveys AI audit tooling landscape, directly relevant to the survey's scope on AI governance and accountability tooling."
    338     },
    339     {
    340       "title": "Auditing large language models: a three-layered approach",
    341       "authors": ["Jakob Mökander", "Jonas Schuett", "Hannah Rose Kirk", "Luciano Floridi"],
    342       "year": 2024,
    343       "doi": "10.1007/s43681-023-00289-2",
    344       "relevance": "Proposes a layered LLM audit framework (governance, model, application levels), directly relevant to AI safety and evaluation methodology."
    345     },
    346     {
    347       "title": "Closing the AI Accountability Gap: Defining an End-to-End Framework for Internal Algorithmic Auditing",
    348       "authors": ["Inioluwa Deborah Raji", "Andrew Smart", "Rebecca N. White", "Margaret Mitchell", "Timnit Gebru", "Ben Hutchinson", "Jamila Smith-Loud", "Daniel Theron", "Parker Barnes"],
    349       "year": 2020,
    350       "doi": "10.1145/3351095.3372873",
    351       "relevance": "Defines an end-to-end internal algorithmic auditing framework, foundational work for AI accountability research."
    352     },
    353     {
    354       "title": "Holistic Evaluation of Language Models",
    355       "authors": ["Percy Liang", "Rishi Bommasani", "Tony Lee"],
    356       "year": 2023,
    357       "relevance": "HELM benchmark and evaluation methodology for LLMs, directly relevant to the survey's scope on evaluation methodology and rigor."
    358     },
    359     {
    360       "title": "On the Opportunities and Risks of Foundation Models",
    361       "authors": ["Rishi Bommasani", "Drew A. Hudson"],
    362       "year": 2021,
    363       "arxiv_id": "2108.07258",
    364       "relevance": "Foundational survey on foundation model risks and opportunities, relevant to the survey's scope on AI safety and capability assessment."
    365     },
    366     {
    367       "title": "Ecosystem Graphs: Documenting the Foundation Model Supply Chain",
    368       "authors": ["Rishi Bommasani", "Dilara Soylu", "Thomas I. Liao", "Kathleen A. Creel", "Percy Liang"],
    369       "year": 2024,
    370       "doi": "10.1609/aies.v7i1.31629",
    371       "relevance": "Maps the foundation model supply chain ecosystem, relevant to understanding AI development accountability and methodology."
    372     },
    373     {
    374       "title": "AI Supply Chains: An Emerging Ecosystem of AI Actors, Products, and Services",
    375       "authors": ["Aspen Hopkins", "Sarah H. Cen", "Andrew Ilyas", "Isabella Struckman", "Luis Videgaray", "Aleksander Mądry"],
    376       "year": 2025,
    377       "arxiv_id": "2504.20185",
    378       "relevance": "Characterizes the AI supply chain ecosystem, relevant to understanding distributed accountability in AI development."
    379     },
    380     {
    381       "title": "On the Dangers of Stochastic Parrots: Can Language Models Be Too Big?",
    382       "authors": ["Emily M. Bender", "Timnit Gebru", "Angelina McMillan-Major", "Shmargaret Shmitchell"],
    383       "year": 2021,
    384       "doi": "10.1145/3442188.3445922",
    385       "relevance": "Influential paper on LLM risks and limitations, relevant to the survey's AI safety scope."
    386     },
    387     {
    388       "title": "OLMo: Accelerating the Science of Language Models",
    389       "authors": ["Dirk Groeneveld", "Iz Beltagy", "Pete Walsh"],
    390       "year": 2024,
    391       "arxiv_id": "2402.00838",
    392       "relevance": "Open language model with documented training provenance, relevant as an example of transparency practices in LLM development."
    393     },
    394     {
    395       "title": "Are emergent abilities of large language models a mirage?",
    396       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    397       "year": 2023,
    398       "relevance": "Challenges claims about emergent LLM abilities, relevant to evaluation methodology and claims about LLM capabilities."
    399     }
    400   ]
    401 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs