ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (16639B)


      1 {
      2   "paper": {
      3     "title": "Multi-Stakeholder Alignment in LLM-Powered Collaborative AI Systems: A Multi-Agent Framework for Intelligent Tutoring",
      4     "authors": ["Alexandre P. Uchoa", "Carlo E. T. Oliveira", "Cláudia L. R. Motta", "Daniel Schneider"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2510.23245",
      8     "doi": "10.48550/arXiv.2510.23245"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "methodology_tags": ["theoretical"],
     13   "key_findings": "The paper proposes the Advisory Governance Layer (AGL), a non-intrusive multi-agent framework for governing LLM-powered Intelligent Tutoring Systems across multiple stakeholders (students, parents, teachers, regulators). It introduces a policy taxonomy (hard constraints, soft preferences, temporal rules, hierarchical rules) and a four-agent architecture (Stakeholder, Negotiation, Audit, Oversight). The work is entirely conceptual with no implementation or empirical validation, which the authors explicitly acknowledge as a limitation.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No code or repository is provided. The paper is a conceptual architecture with no implementation."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No data is released. The paper presents a theoretical framework with no empirical data."
     25       },
     26       "environment_specified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "No software was built or run; this is a purely theoretical/architectural paper."
     30       },
     31       "reproduction_instructions": {
     32         "applies": false,
     33         "answer": false,
     34         "justification": "No experiments to reproduce; the paper proposes a conceptual architecture only."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "No experiments or quantitative results are reported; this is a theoretical paper."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No comparative claims based on data. Theoretical paper with no quantitative evaluation."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No quantitative results. Theoretical paper."
     52       },
     53       "sample_size_justified": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No data collection or samples. Theoretical paper."
     57       },
     58       "variance_reported": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "No experimental runs. Theoretical paper."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper discusses related work in Section 2 but does not systematically compare the AGL against existing governance approaches or alternative architectures."
     69       },
     70       "baselines_contemporary": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "No empirical baselines to evaluate for recency. Theoretical paper."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No implemented system to ablate. The architecture is conceptual."
     79       },
     80       "multiple_metrics": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No evaluation metrics; the paper is a theoretical framework proposal."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No system outputs to evaluate. Theoretical paper."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "No data or experiments. Theoretical paper."
     94       },
     95       "per_category_breakdown": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "No results to break down. Theoretical paper."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 4 (Open Research Challenges) discusses potential failure modes: collusion among agents sharing the same LLM, alert fatigue, privacy violations, and manipulation of negotiation protocols. The limitations section acknowledges the framework has not been validated."
    104       },
    105       "negative_results_reported": {
    106         "applies": false,
    107         "answer": false,
    108         "justification": "No experiments that could produce negative results. Theoretical paper."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims are appropriately hedged for a theoretical paper: it 'introduces' the AGL and 'contributes a reference architecture and technical specifications.' These are descriptive claims about what the paper contains, and the paper does present both the architecture and the specifications."
    116       },
    117       "causal_claims_justified": {
    118         "applies": false,
    119         "answer": false,
    120         "justification": "The paper makes no empirical causal claims. It proposes an architecture and uses language like 'designed to enable' and 'provides,' which are descriptive of intent rather than empirical causation."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper bounds its scope to ITS governance and explicitly acknowledges in Section 6 that claims of effectiveness 'have not yet been empirically validated through user studies, simulations, or prototype implementation.'"
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": false,
    129         "answer": false,
    130         "justification": "No empirical results to explain alternatively. This is a theoretical/architectural paper."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": false,
    134         "answer": false,
    135         "justification": "No measurements are taken. Theoretical paper."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No models are used. The paper proposes a conceptual architecture."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No prompting is performed. The paper discusses LLM-driven techniques conceptually but does not implement them."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "No experiments run, no hyperparameters to report."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The multi-agent architecture is described in detail in Section 3: four agent types (SH, MSN, AG, SO), their roles, the governance lifecycle, integration hooks, and data flows are all specified."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No data to preprocess. Theoretical paper."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6 (Discussion and Conclusion) contains a dedicated 'Limitations' subsection with substantive discussion of the theoretical nature of the work, cultural considerations, and the transparency/cognitive-overload tradeoff."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The limitations are specific to this work: 'The AGL is presented as a conceptual architecture, and its core claims of effectiveness and non-intrusiveness have not yet been empirically validated.' It also discusses the risk of cognitive overload from excessive alerts and dependence on 'robust underlying knowledge infrastructure.'"
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper explicitly states scope boundaries: it addresses ITS governance specifically, acknowledges no empirical validation, and outlines a staged validation plan (simulations → prototype → user studies → scalability) in Section 6."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No data collected. Theoretical paper."
    187       },
    188       "data_collection_described": {
    189         "applies": false,
    190         "answer": false,
    191         "justification": "No data collection. Theoretical paper."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No participants or data sources. Theoretical paper."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No data pipeline. Theoretical paper."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is mentioned anywhere in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are listed: all four authors are from Universidade Federal do Rio de Janeiro (UFRJ), NCE."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding disclosure, so independence of funder cannot be assessed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No pre-trained model is evaluated on any benchmark. This is a theoretical architecture paper."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No model evaluation. Theoretical paper."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No benchmarks used. Theoretical paper."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants. Theoretical architecture paper."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "Purely theoretical paper with no implementation or experiments."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "Purely theoretical paper."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "The AGL provides non-intrusive governance oversight without altering the ITS's core pedagogical logic.",
    296       "evidence": "Architectural description in Section 3 shows the AGL as an advisory overlay that sends recommendations but does not control the ITS decision engine. The ITS 'retaining full pedagogical autonomy, makes the final selection' (Section 3.2, Phase 3).",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "The privacy-preserving distributed evaluation model protects stakeholder autonomy by exchanging only structured votes, not underlying policies.",
    301       "evidence": "Section 3.4 describes federated evaluation where each SH agent evaluates locally and returns only vote outcomes, confidence scores, and opaque justification IDs. Techniques like differential privacy are mentioned as applicable.",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "The policy taxonomy enables reliable translation of natural-language stakeholder policies into machine-executable logic.",
    306       "evidence": "Section 3.3 and Table 1 present four policy types (Hard Constraints, Soft Preferences, Temporal Rules, Hierarchical Rules) with LLM processing strategies for each. However, no implementation or evaluation demonstrates this actually works.",
    307       "supported": "unsupported"
    308     }
    309   ],
    310   "red_flags": [
    311     {
    312       "flag": "No empirical validation whatsoever",
    313       "detail": "The paper presents a conceptual architecture with no prototype, no simulation, no user study, and no empirical evidence that the framework works as described. The authors acknowledge this in the limitations section, but the claims about 'enabling distributed stakeholder participation' and 'providing structured, auditable governance advice' are entirely speculative."
    314     },
    315     {
    316       "flag": "Running vignette substitutes for evaluation",
    317       "detail": "Section 3.6 presents a hypothetical scenario as the sole 'demonstration' of the framework. The scenario is author-constructed and cherry-picked to show the framework working ideally, with no adversarial testing or edge cases explored."
    318     },
    319     {
    320       "flag": "Hand-waving about LLM capabilities",
    321       "detail": "The paper assumes LLMs can reliably classify policies, extract semantic content, perform ontology-based reasoning, and apply differential privacy — all complex unsolved problems — without providing evidence these capabilities exist at the required reliability level for educational governance."
    322     }
    323   ],
    324   "cited_papers": [
    325     {
    326       "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
    327       "authors": ["E. Hubinger"],
    328       "year": 2024,
    329       "relevance": "Directly relevant to AI safety — demonstrates that deceptive behavior in LLMs can persist through safety training."
    330     },
    331     {
    332       "title": "AI Control: Improving Safety Despite Intentional Subversion",
    333       "authors": ["R. Greenblatt"],
    334       "year": 2024,
    335       "relevance": "Addresses control strategies for AI systems that may be intentionally subversive."
    336     },
    337     {
    338       "title": "A survey on large language model based autonomous agents",
    339       "authors": ["L. Wang"],
    340       "year": 2024,
    341       "relevance": "Comprehensive survey of LLM-based agent architectures and capabilities."
    342     },
    343     {
    344       "title": "MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework",
    345       "authors": ["S. Hong"],
    346       "year": 2024,
    347       "relevance": "Multi-agent collaboration framework relevant to agentic AI workflows."
    348     },
    349     {
    350       "title": "Exploring Collaboration Mechanisms for LLM Agents: A Social Psychology View",
    351       "authors": ["J. Zhang"],
    352       "year": 2024,
    353       "relevance": "Studies collaboration dynamics in LLM agent systems."
    354     },
    355     {
    356       "title": "AgentSpec: Customizable Runtime Enforcement for Safe and Reliable LLM Agents",
    357       "authors": ["H. Wang"],
    358       "year": 2025,
    359       "relevance": "Runtime safety enforcement for LLM agents — directly related to agent governance."
    360     },
    361     {
    362       "title": "Enforcement Agents: Enhancing Accountability and Resilience in Multi-Agent AI Frameworks",
    363       "authors": ["S. Tamang", "D.J. Bora"],
    364       "year": 2025,
    365       "relevance": "Agent accountability frameworks relevant to AI governance and safety."
    366     },
    367     {
    368       "title": "Open Challenges in Multi-Agent Security: Towards Secure Systems of Interacting AI Agents",
    369       "authors": ["C.S. de Witt"],
    370       "year": 2025,
    371       "relevance": "Security challenges in multi-agent systems relevant to AI safety research."
    372     }
    373   ]
    374 }

Impressum · Datenschutz