scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22266B)
      1 {
      2   "paper": {
      3     "title": "Towards Engineering Multi-Agent LLMs: A Protocol-Driven Approach",
      4     "authors": ["Zhenyu Mao", "Jacky Keung", "Fengji Zhang", "Shuo Liu", "Yifei Wang", "Jialong Li"],
      5     "year": 2025,
      6     "venue": "Asia-Pacific Software Engineering Conference",
      7     "arxiv_id": "2510.12120",
      8     "doi": "10.1109/APSEC66846.2025.00100"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "SEMAP, a protocol-layer methodology applying SE principles (behavioral contracts, structured messaging, lifecycle-guided execution) to multi-agent LLMs, reduces failures by up to 69.6% in function-level development and 56.7% in deployment-level development compared to a MetaGPT baseline. In vulnerability detection, failure reductions of up to 47.4% (Python) and 28.2% (C/C++) are observed. SEMAP shows more stable failure reduction across collaboration rounds than the baseline.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL or code release mentioned. The conclusion mentions 'releasing artifacts for reproducibility' as future work."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses publicly available datasets: HumanEval, ProgramDev, Devign (devign100 subset), and CVEFixes (vudenc100 subset). However, the specific 100-sample subsets (devign100, vudenc100) are constructed by the authors and not released."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, dependency lists, or hardware details provided."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No reproduction instructions provided. Artifacts release is listed as future work."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Tables I and II report only point estimates (raw counts and percentage changes) with no confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Claims of failure reduction (e.g., '69.6% reduction') are made by comparing raw counts without any statistical significance tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Percentage reductions are reported with baseline context (e.g., 'from 256 to 92', a 64.1% reduction), providing enough information to assess magnitude."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for why 100-sample subsets were chosen for vulnerability detection, or why HumanEval's 164 problems are sufficient."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or spread measures reported. Results appear to be from single runs."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "MetaGPT framework is used as the baseline system for both development and vulnerability detection tasks."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "Only one baseline (MetaGPT) is used. The conclusion acknowledges the need to 'compare against more baselines, including single-agent LLMs and domain-specific detectors' as future work."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "SEMAP has three components (contracts, messaging, lifecycle verification) but no ablation study isolates their individual contributions. The conclusion lists 'ablation studies' as future work."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Failures are categorized into three types (under-specification, inter-agent misalignment, task verification) and reported separately, plus total counts and round-by-round trends."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "Failure categorization is done entirely by LLM-as-a-Judge (gpt-4o-2024-08-06), not human evaluation. No human review of outputs."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Standard benchmark test sets (HumanEval, ProgramDev, devign100, vudenc100) are used for evaluation, not for tuning."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Tables I and II provide per-failure-category breakdowns across all tasks and models."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No qualitative failure analysis or specific failure examples are discussed. Results are purely quantitative."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Some weak results are reported: e.g., ProgramDev with GPT-4.1-nano shows only 12.6% total reduction, and 0.0% reduction in inter-agent misalignment; devign100 with DeepSeek shows only 8.3% reduction."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims ('up to 69.6% reduction', '56.7%', '47.4%', '28.2%') are supported by Tables I and II."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper claims SEMAP 'reduces failures' and 'mitigates' issues (causal language), but the comparison is only against one baseline (MetaGPT) without controlling for confounds like the A2A infrastructure itself or the additional prompt engineering in SEMAP."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title claims a general 'protocol-driven approach' for 'multi-agent LLMs' but results are only from two models (DeepSeek-V3-0324 and gpt-4.1-nano) on four specific datasets. The paper does not bound its generalizations to these settings."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No discussion of alternative explanations. The improvements could be due to more detailed prompting, the A2A infrastructure, or simply more structured agent instructions rather than the three SE principles claimed."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper measures failure counts (as judged by an LLM) and frames this as 'system robustness' and 'effectiveness' without discussing the gap between LLM-judged failures and actual system quality."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Exact model versions are specified: 'DeepSeek-V3-0324' and 'gpt-4.1-nano-2025-04-14', plus 'gpt-4o-2024-08-06' for the judge."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "No actual prompt text is provided. The paper describes the roles and contracts conceptually but does not provide the actual prompts or system instructions given to agents."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for any of the models used."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The SEMAP protocol is described in detail in Section III, including behavioral contracts, structured messaging format, and lifecycle FSM. The architecture (centralized 5-agent for dev, decentralized 3-agent for vuln detection) is specified."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Dataset construction is documented: devign100 is '100-sample subset...constructed by randomly selecting 50 vulnerable and 50 safe C/C++ functions' from Devign; vudenc100 similarly from CVEFixes with labeling criteria described."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No dedicated limitations section. The conclusion mentions future work items that implicitly acknowledge limitations, but there is no substantive limitations discussion."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity discussed. The paper does not address concerns like LLM-as-a-Judge reliability, single-baseline comparison, or single-run results."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The conclusion mentions scaling to 'larger datasets, agent populations, and longer workflows' as future work, implicitly acknowledging current scope, but does not explicitly state what the results do NOT show."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw data (agent outputs, failure logs, LLM judge outputs) is made available."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Dataset construction procedure is described: random sampling from Devign and CVEFixes with balanced classes (50/50), labeling criteria for vulnerability defined."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. All evaluation is automated using benchmarks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The pipeline from raw benchmark problems to failure counts is not fully documented. How the LLM-as-a-Judge categorizes failures, what its prompt looks like, and how ambiguous cases are handled is not described beyond citing [19]."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding or acknowledgments section present."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations (City University of Hong Kong, Waseda University) are clearly stated."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding information disclosed, so independence cannot be assessed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement present."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates stated for DeepSeek-V3-0324 or gpt-4.1-nano. HumanEval (2021) is likely in both models' training data."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether HumanEval or other benchmark problems appeared in the models' training data."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "HumanEval was published in 2021 and is widely known to be contaminated in modern LLMs. This is not discussed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost, API costs, or latency reported. The conclusion mentions 'measuring resource overhead' as future work."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No computational budget stated (API spend, GPU hours, wall-clock time)."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of multiple seeds or seed sensitivity. Results appear to be single-run."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "Number of runs per experiment is not stated."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget reported. The number of collaboration rounds (5) is stated but no tuning process described."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No discussion of how the SEMAP configuration was selected or whether alternatives were tried."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Comparisons span 4 datasets × 2 models × 3 failure categories, yet no statistical tests are performed at all, let alone multiple-comparison corrections."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors implement SEMAP and compare against MetaGPT baseline without acknowledging author-evaluation bias."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "SEMAP adds protocol overhead (contracts, structured messaging, lifecycle FSM) compared to MetaGPT baseline but compute cost differences are not discussed."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether HumanEval function-level tasks and 100-sample subsets are valid measures of multi-agent system quality."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "SEMAP uses A2A infrastructure while baseline uses MetaGPT — different scaffolds. Performance differences could be due to the scaffold rather than the SE principles. This confound is not addressed."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "HumanEval (2021) predates both models' training, so its problems may have been seen during training; the paper does not discuss this, nor the temporal relationship of ProgramDev, devign100, or vudenc100 to the models' training data."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup provides information not available in realistic usage."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of independence between training and test data."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention methods applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "SEMAP reduces total failures by up to 69.6% for function-level development (HumanEval with DeepSeek)",
    365       "evidence": "Table I: DeepSeek baseline 112 total failures → SEMAP 34 (69.6% reduction)",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "SEMAP reduces total failures by up to 56.7% for deployment-level development (ProgramDev with DeepSeek)",
    370       "evidence": "Table I: DeepSeek baseline 67 → SEMAP 29 (56.7% reduction)",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "SEMAP reduces vulnerability detection failures by up to 47.4% (Python with GPT-4.1-nano)",
    375       "evidence": "Table II: GPT-4.1-nano baseline 38 → SEMAP 20 (47.4% reduction on vudenc100)",
    376       "supported": "weak"
    377     },
    378     {
    379       "claim": "SEMAP promotes more stable failure reduction across collaboration rounds",
    380       "evidence": "Figure 2 shows SEMAP trends declining more steadily than baseline across 5 rounds",
    381       "supported": "weak"
    382     }
    383   ],
    384   "red_flags": [
    385     {
    386       "flag": "LLM-as-a-Judge evaluation",
    387       "detail": "All failure categorization is done by gpt-4o-2024-08-06 using the pipeline from [19]. The reliability of this automated judge is not validated in this paper, and no human evaluation corroborates the LLM's assessments."
    388     },
    389     {
    390       "flag": "Single baseline comparison",
    391       "detail": "Only MetaGPT is used as a baseline. No comparison against single-agent LLMs, other multi-agent frameworks (AutoGen, ChatDev), or ablated SEMAP variants."
    392     },
    393     {
    394       "flag": "No statistical rigor",
    395       "detail": "All claims of improvement are based on raw count comparisons from what appear to be single runs. No significance tests, confidence intervals, or variance reported."
    396     },
    397     {
    398       "flag": "Scaffold confound",
    399       "detail": "SEMAP is built on Google A2A while baseline uses MetaGPT. Observed improvements may be due to infrastructure differences rather than the proposed SE principles."
    400     },
    401     {
    402       "flag": "Cherry-picked 'up to' framing",
    403       "detail": "Abstract highlights best-case reductions ('up to 69.6%') while some configurations show much smaller improvements (8.3% for devign100 with DeepSeek, 12.6% for ProgramDev with GPT-4.1-nano)."
    404     },
    405     {
    406       "flag": "Benchmark contamination risk",
    407       "detail": "HumanEval (2021) is likely present in the training data of both DeepSeek-V3-0324 and GPT-4.1-nano, potentially confounding the development task results."
    408     },
    409     {
    410       "flag": "No ablation study",
    411       "detail": "SEMAP combines three principles but no ablation isolates individual contributions. Impossible to know which components drive improvements."
    412     }
    413   ],
    414   "cited_papers": [
    415     {
    416       "title": "Why do multi-agent LLM systems fail?",
    417       "authors": ["M. Cemri"],
    418       "year": 2025,
    419       "arxiv_id": "2503.13657",
    420       "relevance": "Introduces MAST failure taxonomy used as the evaluation framework and provides the ProgramDev benchmark and LLM-as-a-Judge pipeline."
    421     },
    422     {
    423       "title": "A survey of AI agent protocols",
    424       "authors": ["Y. Yang"],
    425       "year": 2025,
    426       "arxiv_id": "2504.16736",
    427       "relevance": "Survey categorizing multi-agent communication protocols along context/inter-agent and general/domain-specific dimensions."
    428     },
    429     {
    430       "title": "Evaluating large language models trained on code",
    431       "authors": ["M. Chen"],
    432       "year": 2021,
    433       "arxiv_id": "2107.03374",
    434       "relevance": "Introduces HumanEval benchmark used for function-level code generation evaluation."
    435     },
    436     {
    437       "title": "LLM-based multi-agent systems for software engineering: Literature review, vision and the road ahead",
    438       "authors": ["J. He"],
    439       "year": 2024,
    440       "relevance": "Literature review of multi-agent LLMs for SE tasks, directly relevant to the survey scope."
    441     },
    442     {
    443       "title": "ChatDev: Communicative agents for software development",
    444       "authors": ["C. Qian"],
    445       "year": 2023,
    446       "relevance": "Multi-agent framework for collaborative software development using LLM agents."
    447     },
    448     {
    449       "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversation",
    450       "authors": ["Q. Wu"],
    451       "year": 2023,
    452       "relevance": "Major multi-agent LLM framework enabling conversational agent workflows."
    453     },
    454     {
    455       "title": "Devign: Effective vulnerability identification by learning comprehensive program semantics via graph neural networks",
    456       "authors": ["Y. Zhou"],
    457       "year": 2019,
    458       "relevance": "Source dataset for vulnerability detection evaluation in multi-agent systems."
    459     },
    460     {
    461       "title": "MAGIS: LLM-based multi-agent framework for GitHub issue resolution",
    462       "authors": ["W. Tao"],
    463       "year": 2024,
    464       "arxiv_id": "2403.17927",
    465       "relevance": "Multi-agent framework for software maintenance tasks (issue resolution)."
    466     },
    467     {
    468       "title": "Multi-role consensus through LLMs discussions for vulnerability detection",
    469       "authors": ["Z. Mao"],
    470       "year": 2024,
    471       "relevance": "Prior work by same authors on multi-agent LLM vulnerability detection."
    472     }
    473   ]
    474 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs