scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19120B)
      1 {
      2   "paper": {
      3     "title": "Sensorium Arc: AI Agent System for Oceanic Data Exploration and Interactive Eco-Art",
      4     "authors": ["Noah Bissell", "Ethan Paley", "Joshua Harrison", "Juliano Calil", "Myungin Lee"],
      5     "year": 2025,
      6     "venue": "NeurIPS 2025 Creative AI Track",
      7     "arxiv_id": "2511.15997",
      8     "doi": "10.48550/arXiv.2511.15997"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": false,
     17         "justification": "The paper references the LLMUnity framework [7] and a video demo [10], but does not provide a repository URL for the Sensorium Arc system code itself."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The conversational corpus used for RAG is released on Zenodo (ref [23], doi: 10.5281/ZENODO.16777228). NASA EarthData datasets used are publicly available."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The paper lists hardware (i9-13900HX, RTX 4090, 64GB RAM) and Unity version (6000.0.24f1) but provides no dependency specifications, requirements files, or library versions beyond naming the models used."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions are provided. The methodology section describes the system architecture but not how to set it up or run it."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": false,
     38         "answer": false,
     39         "justification": "This is a system/art paper that does not report quantitative experimental results requiring statistical analysis."
     40       },
     41       "significance_tests": {
     42         "applies": false,
     43         "answer": false,
     44         "justification": "No comparative claims are made that would require significance tests."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": false,
     48         "answer": false,
     49         "justification": "No quantitative comparisons are reported."
     50       },
     51       "sample_size_justified": {
     52         "applies": false,
     53         "answer": false,
     54         "justification": "No empirical study with sample sizes is conducted."
     55       },
     56       "variance_reported": {
     57         "applies": false,
     58         "answer": false,
     59         "justification": "No experimental runs with variance to report."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No baselines or comparisons with other systems are provided. The paper only describes Sensorium Arc without comparing it to alternative approaches."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No baselines are included at all."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "The system has multiple components (Visualization Decider, RAG, Query Rewriter, Responder) but no ablation study examines their individual contributions."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "No formal evaluation metrics are reported. The paper mentions 'average response latencies under 4 seconds' but provides no other quantitative evaluation."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No human evaluation of system outputs is reported. The paper mentions exhibitions but provides no user study data or systematic human assessment."
     87       },
     88       "held_out_test_set": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "No dataset-based evaluation is performed; the system is an interactive installation, not a benchmark evaluation."
     92       },
     93       "per_category_breakdown": {
     94         "applies": false,
     95         "answer": false,
     96         "justification": "No quantitative evaluation is performed that could be broken down by category."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 4 (Discussion) describes limitations including RAG reliability issues, prompt interference in single-model designs, context window filling with too many retrieved paragraphs, and latency trade-offs with larger models."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper reports that single-model designs 'suffered from issues such as prompt interference and opaque errors' and that retrieving more paragraphs (higher k) 'began to fill up the final LLM step's context window and muddle the response generation.'"
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "The abstract claims the system 'demonstrates the potential of conversational AI agents to mediate affective, intuitive access to high-dimensional environmental data' but no empirical evidence (user studies, metrics) supports this claim in the paper."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": false,
    118         "justification": "The paper claims earlier Sensorium iterations 'reveal[ed] that while sensory engagement was high, audiences sought direct dialogue with the ocean' and that 'this insight motivated' the AI narrator. No evidence is provided for this causal claim about audience preferences."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The abstract proposes 'a new paradigm for human-machine-ecosystem' interaction based on a single installation with no formal evaluation. The conclusion claims a 'generalizable framework' without evidence of generalization."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The paper does not consider alternative explanations for why participants engage with the system (novelty effect, exhibition context, etc.) or whether the AI component specifically drives engagement versus the immersive hardware."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper claims the system enables 'affective, intuitive access' to data and can 'shift environmental data from static representation to participatory narrative' but does not measure these outcomes or acknowledge the gap between the system description and these claimed effects."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Specific models are named with sizes: Llama-3.2-3B, Gemma 3 12B, Qwen 8B (ref [8] specifies Qwen3-8B), all-MiniLM-L12-v2 for embeddings, Whisper Tiny for STT, Jets for TTS."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "Prompts are described in natural language (e.g., 'few-shot examples embedded in the system prompt', 'primed with general knowledge') but the actual prompt text is never provided."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "Only k=2 for retrieval and the 50cm proximity threshold are stated. No LLM hyperparameters (temperature, top-p, max tokens) are reported for any of the three agents."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "The multi-agent architecture is described in detail in Section 3 with a system diagram (Figure 2), including the Visualization Decider Agent, RAG/Query Rewriter Agent, and Responder Agent, with their communication flow and GBNF grammar constraints."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 3 describes preprocessing: 'unstructured resources were pre-processed into paragraphs, further split into sentences, embedded into a 384-dimensional vector space using the local embedding model all-MiniLM-L12-v2, and indexed in a lightweight local database.'"
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 4 (Discussion) contains substantive discussion of limitations organized into four directions: RAG reliability, LLM agent performance, artistic advancement, and scientific/social impact."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Specific limitations are discussed: context window filling with retrieved paragraphs, GPU resource competition between LLM inference and Unity rendering, prompt interference in single-model designs, and the need for controlled user studies."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The paper does not explicitly state what the results do NOT show. The Discussion identifies future work directions but does not bound the current claims to the tested setting."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No raw data from system usage or exhibitions is released. The RAG corpus is on Zenodo but no interaction data or evaluation data is available."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "The data sources are described: NASA EarthData for scientific datasets, Harrison archive materials for the RAG corpus, with specific documents named (The Time of the Force Majeure, Apologia Mediterraneo, Peninsula Europe I catalog)."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human-participant study was conducted. Exhibition visitors are mentioned but not studied as participants."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The pipeline from raw text to embedded vectors is documented: unstructured resources → paragraphs → sentences → 384-dim embeddings via all-MiniLM-L12-v2 → indexed in a local database with approximate nearest-neighbor (ANN) search via usearch."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Funding is disclosed in the Acknowledgments: Kuali Research Award (314284-00001), Arts for All, UMD Immersive Media Design, AlloSphere, Getty Foundation PST2024, Metabolic Studio of the Annenberg Foundation, and Joan and Irwin Jacobs Family Fund."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "All author affiliations are listed: University of Maryland College Park (3 authors), UC Santa Cruz Center for the Study of the Force Majeure, and Virtual Planet Technologies."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Funders are academic institutions and art foundations (UMD, Getty Foundation, Annenberg Foundation) with no apparent financial stake in the system's evaluation outcomes."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement is included. One author is affiliated with Virtual Planet Technologies, a company in the immersive media space, but no financial interests disclosure is made."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "The paper does not evaluate a pre-trained model's capability on any benchmark. The LLMs are used as components of an interactive system, not evaluated for their knowledge."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "No benchmark evaluation is conducted."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No benchmark evaluation is conducted."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human subjects study is conducted. The paper describes an art installation system without formal user studies."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human subjects study is conducted."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human subjects study is conducted."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human subjects study is conducted."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human subjects study is conducted."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human subjects study is conducted."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human subjects study is conducted."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": true,
    282         "justification": "Latency is reported: 'average response latencies under 4 seconds after user input' with CUDA acceleration on RTX 4090."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "Hardware is specified: Intel Core i9-13900HX, 64GB RAM, NVIDIA RTX 4090 Laptop GPU, with CUDA acceleration and configurable GPU layer offloading. Unity version 6000.0.24f1 on Windows 11."
    288       }
    289     }
    290   },
    291   "claims": [
    292     {
    293       "claim": "Sensorium Arc demonstrates the potential of conversational AI agents to mediate affective, intuitive access to high-dimensional environmental data",
    294       "evidence": "System description in Section 3 and exhibition mentions in Section 4, but no empirical evaluation of user affect or intuition is provided.",
    295       "supported": "unsupported"
    296     },
    297     {
    298       "claim": "Earlier Sensorium iterations revealed that audiences sought direct dialogue with the ocean, motivating the AI narrator",
    299       "evidence": "Mentioned in Section 2 (Related Works) but no data from these prior exhibitions is presented.",
    300       "supported": "weak"
    301     },
    302     {
    303       "claim": "The modular multi-agent architecture improved control and debuggability over a single-model approach",
    304       "evidence": "Section 3 states single-model designs 'suffered from issues such as prompt interference and opaque errors' and cites multi-agent literature [31], but no comparative evaluation is provided.",
    305       "supported": "weak"
    306     },
    307     {
    308       "claim": "The system achieves average response latencies under 4 seconds",
    309       "evidence": "Stated in Section 3 (Hardware and Runtime Environment) with specific hardware configuration.",
    310       "supported": "moderate"
    311     }
    312   ],
    313   "methodology_tags": ["case-study"],
    314   "key_findings": "Sensorium Arc is a multimodal interactive AI installation that personifies the ocean as a poetic speaker using a multi-agent LLM architecture with RAG grounding. The system uses Llama 3.2 3B, Gemma 3 12B, and Qwen 8B models in specialized roles (visualization selection, query rewriting, response generation) with a curated eco-art corpus. The paper is primarily a system description for an art installation presented at the NeurIPS Creative AI Track, with no formal evaluation of its claimed effects on user engagement or understanding.",
    315   "red_flags": [
    316     {
    317       "flag": "No formal evaluation",
    318       "detail": "The paper makes claims about the system's potential to mediate 'affective, intuitive access' to environmental data and proposes a 'new paradigm,' but provides no user study, no metrics, and no empirical evidence. The Discussion acknowledges the need for 'controlled user studies on retention, empathy, and behavioral change' as future work."
    319     },
    320     {
    321       "flag": "Claims significantly outrun evidence",
    322       "detail": "The abstract claims a 'new paradigm for human-machine-ecosystem' interaction and the conclusion claims a 'generalizable framework,' but the evidence is a single art installation with no formal evaluation. These are aspirational claims presented as findings."
    323     }
    324   ],
    325   "cited_papers": [
    326     {
    327       "title": "Why Do Multi-Agent LLM Systems Fail?",
    328       "authors": ["M. Cemri", "M. Z. Pan", "S. Yang"],
    329       "year": 2025,
    330       "arxiv_id": "2503.13657",
    331       "doi": "10.48550/arXiv.2503.13657",
    332       "relevance": "Directly addresses failure modes in multi-agent LLM systems, relevant to agentic AI reliability."
    333     },
    334     {
    335       "title": "Small LLMs Are Weak Tool Learners: A Multi-LLM Agent",
    336       "authors": ["W. Shen", "C. Li", "H. Chen"],
    337       "year": 2024,
    338       "doi": "10.48550/ARXIV.2401.07324",
    339       "relevance": "Multi-agent LLM architecture for tool use, relevant to agentic workflow design."
    340     },
    341     {
    342       "title": "Query Rewriting for Retrieval-Augmented Large Language Models",
    343       "authors": ["X. Ma", "Y. Gong", "P. He"],
    344       "year": 2023,
    345       "arxiv_id": "2305.14283",
    346       "doi": "10.48550/arXiv.2305.14283",
    347       "relevance": "RAG query rewriting technique used in the system, relevant to LLM retrieval methodology."
    348     },
    349     {
    350       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    351       "authors": ["J. Wei", "X. Wang", "D. Schuurmans"],
    352       "year": 2023,
    353       "arxiv_id": "2201.11903",
    354       "doi": "10.48550/arXiv.2201.11903",
    355       "relevance": "Foundational prompting technique used in the system's query rewriter agent."
    356     },
    357     {
    358       "title": "A Survey on RAG with LLMs",
    359       "authors": ["M. Arslan", "H. Ghanem", "S. Munawar", "C. Cruz"],
    360       "year": 2024,
    361       "doi": "10.1016/j.procs.2024.09.178",
    362       "relevance": "Survey of retrieval-augmented generation techniques relevant to LLM capability evaluation."
    363     },
    364     {
    365       "title": "Reasoning LLMs for User-Aware Multimodal Conversational Agents",
    366       "authors": ["H. Rahimi", "J. Cattoni", "M. Beghili"],
    367       "year": 2025,
    368       "doi": "10.48550/ARXIV.2504.01700",
    369       "relevance": "User-aware multimodal LLM agent design, relevant to agentic AI interaction patterns."
    370     }
    371   ]
    372 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs