scan.json (20063B)
1 { 2 "paper": { 3 "title": "Towards autonomous normative multi-agent systems for Human-AI software engineering teams", 4 "authors": ["Hoa Khanh Dam", "Geeta Mahala", "Rashina Hoda", "Xi Zheng", "Cristina Conati"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2512.02329", 8 "doi": "10.48550/arXiv.2512.02329" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": false, 15 "justification": "No source code repository or download link is provided. The paper mentions implementing core components of BDIM-SE using AgentSpeak but does not release the code." 16 }, 17 "data_released": { 18 "applies": false, 19 "answer": false, 20 "justification": "This is a theoretical/vision paper with no experiments or datasets. There is no data to release." 21 }, 22 "environment_specified": { 23 "applies": false, 24 "answer": false, 25 "justification": "No experiments are conducted, so environment specifications are not applicable." 26 }, 27 "reproduction_instructions": { 28 "applies": false, 29 "answer": false, 30 "justification": "No experiments or empirical results to reproduce. The paper is a vision/position paper proposing a conceptual framework." 31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": false, 36 "answer": false, 37 "justification": "No experiments or quantitative results are presented. This is a theoretical vision paper." 38 }, 39 "significance_tests": { 40 "applies": false, 41 "answer": false, 42 "justification": "No comparative experiments or quantitative claims are made. This is a theoretical vision paper." 43 }, 44 "effect_sizes_reported": { 45 "applies": false, 46 "answer": false, 47 "justification": "No experiments are conducted, so effect sizes are not applicable." 48 }, 49 "sample_size_justified": { 50 "applies": false, 51 "answer": false, 52 "justification": "No experiments involving samples are conducted. This is a theoretical paper."
53 }, 54 "variance_reported": { 55 "applies": false, 56 "answer": false, 57 "justification": "No experimental runs or quantitative results are presented." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": false, 63 "answer": false, 64 "justification": "No empirical evaluation is conducted. The paper is a vision/position paper with no experiments to compare against baselines." 65 }, 66 "baselines_contemporary": { 67 "applies": false, 68 "answer": false, 69 "justification": "No empirical evaluation is conducted. No baselines are needed for a vision paper." 70 }, 71 "ablation_study": { 72 "applies": false, 73 "answer": false, 74 "justification": "No empirical evaluation is conducted. There is nothing to ablate without experiments." 75 }, 76 "multiple_metrics": { 77 "applies": false, 78 "answer": false, 79 "justification": "No empirical evaluation is conducted. No metrics are reported." 80 }, 81 "human_evaluation": { 82 "applies": false, 83 "answer": false, 84 "justification": "No evaluation of any kind is conducted in this vision paper. Human evaluation is planned for future work (Section 4) but not present." 85 }, 86 "held_out_test_set": { 87 "applies": false, 88 "answer": false, 89 "justification": "No experiments involving datasets are conducted." 90 }, 91 "per_category_breakdown": { 92 "applies": false, 93 "answer": false, 94 "justification": "No empirical results are presented to break down." 95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": false, 99 "justification": "Although this is a vision paper, discussing potential failure modes of the proposed framework would be valuable. The paper does not discuss where BDIM-SE might fail or what its limitations are." 100 }, 101 "negative_results_reported": { 102 "applies": false, 103 "answer": false, 104 "justification": "No experiments are conducted, so there are no negative results to report." 
105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": false, 111 "justification": "The abstract claims agents will operate 'with a level of speed, reliability, and adaptability far beyond the current software development processes' and that the framework establishes 'a scalable, transparent and trustworthy framework.' These are unsupported aspirational claims with no empirical evidence provided." 112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": false, 116 "justification": "The paper makes implicit causal claims such as 'These innovations establish a scalable, transparent and trustworthy framework' and that norm-based coordination 'eliminates the need for manually encoding coordination plans.' No evidence or formal analysis supports these causal assertions." 117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": false, 121 "justification": "The paper makes broad claims about autonomous SE agents being 'the primary driver of the core software development activities' and teams working '100 times faster' without bounding these claims to specific contexts, domains, or conditions. The title and abstract suggest general applicability." 122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": false, 126 "justification": "The paper does not discuss alternative approaches or why its specific combination of BDI + norms + LLMs is preferable to other plausible architectural choices. No consideration of why existing frameworks like MetaGPT or ChatDev might be sufficient with incremental improvements." 127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": false, 132 "answer": false, 133 "justification": "No experiments are conducted using any specific LLM. The paper describes a conceptual architecture that integrates a 'foundational LLM' without specifying which one." 
134 }, 135 "prompts_provided": { 136 "applies": false, 137 "answer": false, 138 "justification": "No prompting experiments are conducted. The paper shows pseudo-code with ask_LLM calls but these are part of the conceptual design, not actual experiments." 139 }, 140 "hyperparameters_reported": { 141 "applies": false, 142 "answer": false, 143 "justification": "No experiments are conducted, so hyperparameters are not applicable." 144 }, 145 "scaffolding_described": { 146 "applies": true, 147 "answer": true, 148 "justification": "The paper describes the BDIM-SE cognitive architecture in detail (Section 2), including beliefs, desires, intentions, memory, plans, normative reasoning, and coordination mechanisms (Section 3). The AgentSpeak-based implementation with plan structures (Figures 2 and 3) provides substantial architectural detail." 149 }, 150 "data_preprocessing_documented": { 151 "applies": false, 152 "answer": false, 153 "justification": "No data is collected or processed. This is a theoretical vision paper." 154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": false, 160 "justification": "There is no limitations or threats-to-validity section. The paper has a 'Future Plans' section (Section 4) that describes planned work but does not discuss limitations of the proposed approach." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": false, 165 "justification": "No threats to validity are discussed. The paper does not acknowledge any specific challenges or risks with the proposed architecture." 166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": false, 170 "justification": "The paper does not state what the proposed framework cannot do or what settings it is not designed for. Broad claims are made without explicit scope boundaries." 
171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": false, 176 "answer": false, 177 "justification": "No experiments or data collection. This is a theoretical vision paper." 178 }, 179 "data_collection_described": { 180 "applies": false, 181 "answer": false, 182 "justification": "No data is collected. This is a theoretical vision paper." 183 }, 184 "recruitment_methods_described": { 185 "applies": false, 186 "answer": false, 187 "justification": "No participants or data samples are recruited. This is a theoretical vision paper." 188 }, 189 "data_pipeline_documented": { 190 "applies": false, 191 "answer": false, 192 "justification": "No data pipeline exists. This is a theoretical vision paper." 193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": false, 199 "justification": "No funding acknowledgments or disclosure section is present in the paper." 200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "Author affiliations are clearly stated: University of Wollongong (Dam, Mahala), Monash University (Hoda), Macquarie University (Zheng), and University of British Columbia (Conati). All are academic affiliations." 205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": false, 209 "justification": "No funding is disclosed, so independence of the funder cannot be assessed. The absence of funding disclosure is itself the issue." 210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": false, 214 "justification": "No competing interests or financial interests statement is present in the paper." 215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": false, 220 "answer": false, 221 "justification": "No pre-trained model is evaluated on any benchmark. This is a theoretical vision paper." 
222 }, 223 "train_test_overlap_discussed": { 224 "applies": false, 225 "answer": false, 226 "justification": "No model evaluation on benchmarks is conducted." 227 }, 228 "benchmark_contamination_addressed": { 229 "applies": false, 230 "answer": false, 231 "justification": "No benchmark evaluation is conducted." 232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": false, 237 "answer": false, 238 "justification": "No human participants are involved. Human studies are described as future work only." 239 }, 240 "irb_or_ethics_approval": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants are involved in this paper." 244 }, 245 "demographics_reported": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants are involved in this paper." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants are involved in this paper." 254 }, 255 "randomization_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants or experimental conditions in this paper." 259 }, 260 "blinding_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants or experimental conditions in this paper." 264 }, 265 "attrition_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants are involved in this paper." 269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "This is a theoretical vision paper with no experiments. No inference costs are applicable." 276 }, 277 "compute_budget_stated": { 278 "applies": false, 279 "answer": false, 280 "justification": "This is a theoretical vision paper with no computational experiments." 
281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "BDIM-SE agents with beliefs, desires, intentions, and memory represent a 'significant departure' from existing LLM-based agents like MetaGPT and ChatDev.", 287 "evidence": "Section 2 describes the architecture conceptually and argues that existing frameworks 'rely on scripted workflows, prompt-engineered roles, and limited adaptability' and 'lack genuine agency.' No empirical comparison is provided.", 288 "supported": "unsupported" 289 }, 290 { 291 "claim": "The normative approach eliminates the need for manually encoding coordination plans as required in existing multi-agent frameworks like LangGraph or CrewAI.", 292 "evidence": "Section 3 describes how commitments can automatically generate coordination plans (Figure 3) and argues this 'eliminates the need for manually encoding coordination plans.' No empirical demonstration or formal proof is provided.", 293 "supported": "unsupported" 294 }, 295 { 296 "claim": "The NorMAS-SE framework can scale effectively to large and dynamic Human-AI software engineering teams.", 297 "evidence": "Section 3 asserts scalability as a benefit of norm-based coordination, but no scaling experiments, formal complexity analysis, or empirical evidence is provided.", 298 "supported": "unsupported" 299 }, 300 { 301 "claim": "The core components of the BDIM-SE agent architecture have been implemented, integrating a foundational LLM with belief, desire, intention, and memory modules.", 302 "evidence": "Section 4 states 'We have implemented the core components of our BDIM-SE agent architecture' but no code, demonstration, evaluation, or implementation details beyond pseudo-code are provided.", 303 "supported": "weak" 304 } 305 ], 306 "methodology_tags": ["theoretical"], 307 "key_findings": "This is a vision/position paper that proposes BDIM-SE, a cognitive architecture for autonomous software engineering agents based on beliefs, desires, intentions, and memory (extending classical BDI 
with LLM-powered memory). It introduces NorMAS-SE, a normative multi-agent system framework where human-AI software engineering teams coordinate through deontic norms (obligations, prohibitions, permissions) expressed as commitments. The paper provides a conceptual architecture and pseudo-code examples using AgentSpeak but no empirical evaluation. Future work plans include single-agent benchmarks, multi-agent experiments, and human user studies.", 308 "red_flags": [ 309 { 310 "flag": "No empirical evaluation", 311 "detail": "The paper proposes a framework and claims significant advantages (scalability, trustworthiness, speed) over existing approaches like MetaGPT, ChatDev, LangGraph, and CrewAI, but provides zero empirical evidence. All claims are aspirational." 312 }, 313 { 314 "flag": "Unsupported strong claims", 315 "detail": "The abstract claims the framework enables development 'with a level of speed, reliability, and adaptability far beyond the current software development processes.' The introduction envisions teams working '100 times faster.' No evidence supports these specific claims." 316 }, 317 { 318 "flag": "No limitations discussed", 319 "detail": "Despite proposing a complex multi-agent architecture that combines symbolic AI, LLMs, and normative reasoning, the paper does not discuss any limitations, challenges, or potential failure modes of the approach." 320 }, 321 { 322 "flag": "Implementation claims without evidence", 323 "detail": "Section 4 states 'We have implemented the core components of our BDIM-SE agent architecture' but no code is released, no demonstration is shown, and no evaluation of the implementation is provided." 
324 } 325 ], 326 "cited_papers": [ 327 { 328 "title": "MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework", 329 "authors": ["Sirui Hong", "Mingchen Zhuge", "Jonathan Chen"], 330 "year": 2024, 331 "relevance": "Key multi-agent framework for LLM-based software engineering teams that the paper positions itself against." 332 }, 333 { 334 "title": "ChatDev: Communicative Agents for Software Development", 335 "authors": ["Chen Qian", "Wei Liu", "Hongzhang Liu"], 336 "year": 2024, 337 "relevance": "Another LLM-based multi-agent software development framework serving as a comparison point." 338 }, 339 { 340 "title": "LLM-Based Multi-Agent Systems for Software Engineering: Literature Review, Vision, and the Road Ahead", 341 "authors": ["Junda He", "Christoph Treude", "David Lo"], 342 "year": 2025, 343 "doi": "10.1145/3712003", 344 "relevance": "Survey of LLM-based multi-agent systems for software engineering, directly relevant to the survey scope." 345 }, 346 { 347 "title": "Large Language Model Based Multi-agents: A Survey of Progress and Challenges", 348 "authors": ["Taicheng Guo", "Xiuying Chen", "Yaqi Wang"], 349 "year": 2024, 350 "doi": "10.24963/ijcai.2024/890", 351 "relevance": "Comprehensive survey of LLM-based multi-agent systems covering challenges and progress." 352 }, 353 { 354 "title": "LLM Multi-Agent Systems: Challenges and Open Problems", 355 "authors": ["Shanshan Han", "Qifan Zhang", "Yuhang Yao"], 356 "year": 2024, 357 "arxiv_id": "2402.03578", 358 "relevance": "Identifies challenges and open problems in LLM multi-agent systems relevant to the survey." 359 }, 360 { 361 "title": "Cognitive Architectures for Language Agents", 362 "authors": ["Theodore R. Sumers", "Shunyu Yao", "Karthik Narasimhan", "Thomas L. Griffiths"], 363 "year": 2024, 364 "relevance": "Proposes cognitive architectures for language agents, directly relevant to agentic AI design." 
365 }, 366 { 367 "title": "A Survey on Large Language Models for Code Generation", 368 "authors": ["Juyong Jiang", "Fan Wang", "Jiasi Shen"], 369 "year": 2024, 370 "arxiv_id": "2406.00515", 371 "relevance": "Survey on LLM-based code generation, relevant to AI-assisted software engineering." 372 }, 373 { 374 "title": "Software Testing With Large Language Models: Survey, Landscape, and Vision", 375 "authors": ["Junjie Wang", "Yuchao Huang", "Chunyang Chen"], 376 "year": 2024, 377 "doi": "10.1109/TSE.2024.3368208", 378 "relevance": "Survey on LLM-based software testing, relevant to AI in software engineering evaluation." 379 }, 380 { 381 "title": "A Survey on Hallucination in Large Language Models: Principles, Taxonomy, Challenges, and Open Questions", 382 "authors": ["Lei Huang", "Weijiang Yu", "Weitao Ma"], 383 "year": 2025, 384 "doi": "10.1145/3703155", 385 "relevance": "Addresses LLM hallucination which is a core safety concern for autonomous AI agents in software engineering." 386 }, 387 { 388 "title": "An LLM-based multi-agent framework for agile effort estimation", 389 "authors": ["Long Bui", "Hoa Khanh Dam", "Rashina Hoda"], 390 "year": 2025, 391 "relevance": "LLM-based multi-agent framework for a specific SE task (effort estimation) from the same research group." 392 }, 393 { 394 "title": "Procedural Memory Is Not All You Need: Bridging Cognitive Gaps in LLM-Based Agents", 395 "authors": ["Schaun Wheeler", "Olivier Jeunen"], 396 "year": 2025, 397 "doi": "10.1145/3708319.3734172", 398 "relevance": "Discusses cognitive gaps in LLM-based agents, relevant to agent architecture design." 399 }, 400 { 401 "title": "The use of large language models for program repair", 402 "authors": ["Fida Zubair", "Maryam Al-Hitmi", "Cagatay Catal"], 403 "year": 2025, 404 "doi": "10.1016/j.csi.2024.103951", 405 "relevance": "Survey on LLM-based program repair, relevant to AI-assisted software maintenance." 406 } 407 ] 408 }