scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26341B)
      1 {
      2   "paper": {
      3     "title": "The Trust Paradox in LLM-Based Multi-Agent Systems: When Collaboration Becomes a Security Vulnerability",
      4     "authors": ["Zijie Xu", "Minfeng Qi", "Shiqing Wu", "Lefeng Zhang", "Qiwen Wei", "Han He", "Ningran Li"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2510.18563"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "checklist": {
     12     "artifacts": {
     13       "code_released": {
     14         "applies": true,
     15         "answer": false,
     16         "justification": "The conclusion mentions 'release the scenario dataset and evaluation pipeline as a reproducible baseline' but no repository URL, archive link, or download location is provided anywhere in the paper."
     17       },
     18       "data_released": {
     19         "applies": true,
     20         "answer": false,
     21         "justification": "The conclusion mentions plans to 'release the scenario dataset and evaluation pipeline as a reproducible baseline', and the dataset is described as reusable, but no actual download URL or repository link is provided in the paper."
     22       },
     23       "environment_specified": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "The paper names frameworks (AgentScope, AutoGen, LangGraph) and models but provides no requirements.txt, Dockerfile, library versions, or environment setup details."
     27       },
     28       "reproduction_instructions": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No step-by-step reproduction instructions are provided. The experimental setup is described at a high level but lacks runnable commands or scripts."
     32       }
     33     },
     34     "statistical_methodology": {
     35       "confidence_intervals_or_error_bars": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Table III reports only point estimates for OER and AD. Box plots in Fig. 6 show distributions but no confidence intervals are reported for any main results."
     39       },
     40       "significance_tests": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper claims OER 'rises monotonically with τ' and ranks models by trust sensitivity without any statistical significance tests. Comparisons are made by inspecting numbers directly."
     44       },
     45       "effect_sizes_reported": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper reports effect sizes with baseline context: '∆OER = 0.36', '79.5% reduction relative to the no-defense baseline', 'OER drops by 22% (0.50 → 0.39)'. These provide magnitude and context."
     49       },
     50       "sample_size_justified": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "1,488 interaction chains are generated across the grid but no justification is provided for why this number is sufficient. The limitations section acknowledges the sample is 'adequate for aggregate trends but insufficient for tail-risk.'"
     54       },
     55       "variance_reported": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "Fixed random seeds are used with apparently single runs per cell. No variance across seeds or repeated runs is reported. AD captures variance across trust levels, not across replications."
     59       }
     60     },
     61     "evaluation_design": {
     62       "baselines_included": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The paper compares across 4 model backends, 3 frameworks, 3 trust levels, and evaluates two defenses against a no-defense baseline condition."
     66       },
     67       "baselines_contemporary": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "DeepSeek, Qwen, GPT, and Llama-3-8B are contemporary models. AgentScope, AutoGen, and LangGraph are current orchestration frameworks."
     71       },
     72       "ablation_study": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Two defenses (Sensitive-Information Repartitioning and GA-Agent) are tested independently against no-defense baseline (Section V.E, Fig. 9), functioning as ablation of defense components."
     76       },
     77       "multiple_metrics": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Three metrics are used: Over-Exposure Rate (OER), Authorization Drift (AD), and task success rate. OER and AD are formally defined in Section IV.D."
     81       },
     82       "human_evaluation": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "OER is computed via 'a rules library and a classifier' (Section IV.D). No human evaluation of whether outputs actually constitute privacy violations or are contextually appropriate."
     86       },
     87       "held_out_test_set": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "All 19 scenarios are used for evaluation. There is no separation of development scenarios (for tuning prompts/defenses) from test scenarios."
     91       },
     92       "per_category_breakdown": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Table III provides per-sub-scene OER and AD breakdowns across all 19 scenarios, 4 models, and 3 frameworks — extensive per-category reporting."
     96       },
     97       "failure_cases_discussed": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Section V.F discusses scenarios where defenses fail: out-of-scope/emergency requests yield OER of 0.41. Section V.E notes defenses raise low-τ baseline (DeepSeek: 0.12→0.21)."
    101       },
    102       "negative_results_reported": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Defenses raising low-τ baseline OER is reported as a trade-off. AutoGen's high baseline OER despite low AD is discussed. GA-Agent's weaker slope reduction vs Repartitioning is acknowledged."
    106       }
    107     },
    108     "claims_and_evidence": {
    109       "abstract_claims_supported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Abstract claims that higher trust improves task success but heightens exposure risks are supported by Table III, Figs. 3-6. Defense claims supported by Fig. 9. All stated trends are present in the data."
    113       },
    114       "causal_claims_justified": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The study uses controlled experiments: 'Within each (S, m, f) tuple, only τ systematically varied' (Section VI.A). Fixed seeds, unified prompts, disabled tools/networking. This is adequate for the causal claim that τ modulates OER."
    118       },
    119       "generalization_bounded": {
    120         "applies": true,
    121         "answer": false,
    122         "justification": "The title claims broadly about 'LLM-Based Multi-Agent Systems' but experiments use only synthetic two-agent CK/SK scenarios with disabled networking, tools, and memory. The limitations section notes 'limiting ecological validity' but the title and abstract don't bound these claims."
    123       },
    124       "alternative_explanations_discussed": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Section VI.A discusses and rules out a 'length-risk artifact' via length-controlled resampling (r < 0.12, p > 0.1). The mechanistic account distinguishes model-level alignment priors from system-level coordination effects."
    128       },
    129       "proxy_outcome_distinction": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper clearly defines OER as measuring information beyond MNI baseline and AD as sensitivity to τ. These directly measure what is claimed (over-exposure and trust sensitivity). No proxy gap."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "Models are listed as 'DeepSeek', 'Qwen', 'GPT', 'Llama-3-8B' without specific version identifiers (no API version, snapshot date, or model ID like 'gpt-4-0613'). Section IV.B describes them generically."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "Appendix A provides full scenario prompts for all 19 sub-scenes with CK-Agent and SK-Agent role descriptions. Appendix B provides the full GA-Agent prompt. These are the actual texts used."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "The paper repeatedly mentions 'fixed decoding hyperparameters (temperature, top-p, max tokens)' but never states the actual values used for any of these parameters."
    150       },
    151       "scaffolding_described": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Table II describes each framework's key characteristics and experiment setup. Fig. 1 shows the workflow. Agent JSON configs are shown in figures. The round-robin scheduling and MNI-Gate hooks are described."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section IV.A describes scenario construction: 3 macro scenes, 19 sub-scenes, synthetic de-identified data, trust operationalized via κ(τ) and r(τ). The (S, τ, m, f) grid producing 1,488 chains is fully specified."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section VI.B 'Limitations and Future Work' provides substantive discussion of four specific limitations."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section VI.B identifies specific threats: 1,488 chains insufficient for tail-risk analysis, generation stochasticity despite fixed parameters, discrete τ only (0.1/0.5/0.9), and disabled networking/tools limiting ecological validity."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section VI.B explicitly states: 'networking, tools, and memory were disabled to control confounders, limiting ecological validity.' Also notes discrete τ limitation and plans for continuous/dynamic τ."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No interaction logs, raw outputs, or data archives are made available, even though the conclusion promises to release the scenario dataset. No URL is provided."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section IV.A-E describe data generation: closed-loop A2A interactions, isolated chains, fixed seeds, cleared caches, standardized prompts. The full (S, τ, m, f) grid is specified."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": false,
    192         "answer": false,
    193         "justification": "No human participants. All data is synthetically generated from agent-agent interactions on purpose-built scenarios."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section IV.E describes the two-step protocol: (i) fix framework, sweep models; (ii) fix model, sweep frameworks. Each cell produces an independent log; OER and AD are computed from outputs."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "No funding source, grants, or sponsorship is mentioned anywhere in the paper."
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Author affiliations are listed: Minzu University of China, City University of Macau, University of Adelaide. None of the authors are affiliated with the model providers being evaluated."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding statement leaves this unanswerable."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests or financial interests statement is included in the paper."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": false,
    226         "answer": false,
    227         "justification": "The paper tests model behavior (information disclosure under trust manipulation), not model knowledge on benchmarks. Scenarios are novel synthetic constructions. Contamination of test scenarios in training data is not a meaningful concern here."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": false,
    231         "answer": false,
    232         "justification": "The evaluation tests behavioral tendencies under controlled trust conditions on novel synthetic scenarios, not memorized knowledge. Standard contamination concerns do not apply."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "The scenarios are purpose-built synthetic interactions, not pre-existing benchmarks that could appear in training data."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No human participants. All experiments involve agent-agent interactions."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants."
    250       },
    251       "demographics_reported": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants."
    270       },
    271       "attrition_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": false,
    281         "justification": "1,488 interaction chains across 4 model backends (including GPT API calls) with no mention of API costs, tokens consumed, or wall-clock time."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "No compute budget, GPU hours, or total API spend is mentioned despite running Llama-3-8B locally and calling commercial APIs."
    287       }
    288     },
    289     "experimental_rigor": {
    290       "seed_sensitivity_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "Seeds are fixed but only single-seed results are reported. No analysis of how results vary across seeds despite the paper acknowledging 'generation stochasticity persists' in limitations."
    294       },
    295       "number_of_runs_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "The total of 1,488 chains across the full grid is stated, but per-cell run counts are not explicitly given. It appears to be a single run per (S, τ, m, f) cell."
    299       },
    300       "hyperparameter_search_budget": {
    301         "applies": false,
    302         "answer": false,
    303         "justification": "No hyperparameter tuning is performed. Decoding parameters are fixed across all experiments."
    304       },
    305       "best_config_selection_justified": {
    306         "applies": false,
    307         "answer": false,
    308         "justification": "No configuration selection is performed. All conditions in the grid are reported."
    309       },
    310       "multiple_comparison_correction": {
    311         "applies": false,
    312         "answer": false,
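    313         "justification": "No statistical significance tests are performed on the main results (the only reported p-value belongs to the length-control check in Section VI.A), so multiple comparison correction is not applicable."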
    314       },
    315       "self_comparison_bias_addressed": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "The authors propose and evaluate their own defenses (Repartitioning and GA-Agent) without acknowledging the bias of evaluating their own mitigations. No independent evaluation."
    319       },
    320       "compute_budget_vs_performance": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "Defenses add 'extra turns and latency' but no quantification of compute cost vs. safety improvement is provided. Llama-3-8B (run locally) and the API models have very different compute profiles, which is not discussed."
    324       },
    325       "benchmark_construct_validity": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The 19 synthetic scenarios are never validated as representative of real multi-agent trust dynamics. No discussion of whether synthetic CK/SK-Agent scenarios capture real-world information leakage risks."
    329       },
    330       "scaffold_confound_addressed": {
    331         "applies": true,
    332         "answer": true,
    333         "justification": "The paper explicitly tests three different frameworks (AgentScope, AutoGen, LangGraph) with the same models and analyzes how the framework shapes the trust-risk curve (Section V.D, RQ4). Framework is treated as a variable."
    334       }
    335     },
    336     "data_leakage": {
    337       "temporal_leakage_addressed": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "No discussion of whether models may have been exposed to similar trust-manipulation scenarios in training data, or whether the scenario structures resemble training examples."
    341       },
    342       "feature_leakage_addressed": {
    343         "applies": true,
    344         "answer": false,
    345         "justification": "Trust is injected directly into prompts (e.g., 'You trust Jay very much'). No discussion of whether this prompt-level manipulation constitutes a form of feature leakage or whether models respond to surface cues vs. genuine trust modeling."
    346       },
    347       "non_independence_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "The 19 scenarios share structural similarities (CK-Agent/SK-Agent pattern, similar secret types). No discussion of non-independence across scenarios or whether results are driven by shared structure."
    351       },
    352       "leakage_detection_method": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No leakage detection or prevention method is applied. The length-controlled resampling check (Section VI.A) addresses a confound, not data leakage."
    356       }
    357     }
    358   },
    359   "claims": [
    360     {
    361       "claim": "Higher inter-agent trust (τ) monotonically increases Over-Exposure Rate (OER) across all model backends and frameworks.",
    362       "evidence": "Table III shows OER rising with τ across all (m, f) combinations. Example: DeepSeek under AgentScope grows from 0.120 to 0.500 (Section V.A). Fig. 3 confirms monotonic trend.",
    363       "supported": "strong"
    364     },
    365     {
    366       "claim": "Authorization Drift (AD) is positive across all models, with Llama-3-8B showing the greatest trust sensitivity (AD=0.0783) and GPT the least (AD=0.0243).",
    367       "evidence": "Fig. 4a reports AD values: Llama-3-8B 0.0783, Qwen 0.0310, DeepSeek 0.0268, GPT 0.0243. Consistent ordering across scenarios (Section V.C).",
    368       "supported": "strong"
    369     },
    370     {
    371       "claim": "Task success and OER both rise with τ, establishing an efficiency-security trade-off.",
    372       "evidence": "Fig. 5 shows task success rates rising with τ (e.g., Llama-3-8B: 0.05 → 0.71). Concurrent OER rise documented in Table III. Section V.A(3) discusses the trade-off.",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "Sensitive-Information Repartitioning reduces AD by 79.5% (DeepSeek) and 88.4% (Llama-3-8B).",
    377       "evidence": "Section V.E: DeepSeek AD drops from 0.0268 to 0.0055; Llama-3-8B from 0.0783 to 0.0091. OER at τ=0.9 drops 22% and 46.5% respectively.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "Orchestration frameworks alter both the slope and intercept of the τ→risk mapping.",
    382       "evidence": "Section V.D: AutoGen has high-level/low-slope (AD=0.0068), LangGraph mid-level/high-slope (AD=0.0296), AgentScope low-level/mid-slope (AD=0.0268). Fig. 8 provides 3D visualization.",
    383       "supported": "moderate"
    384     }
    385   ],
    386   "methodology_tags": ["benchmark-eval"],
    387   "key_findings": "The paper empirically validates the Trust-Vulnerability Paradox: across 1,488 agent-agent interaction chains spanning 4 LLM backends and 3 orchestration frameworks, increasing inter-agent trust (τ) monotonically raises information over-exposure while simultaneously improving task success. Trust sensitivity varies substantially by model (Llama-3-8B most sensitive, GPT least) and framework (AutoGen flattens the trust-risk slope, LangGraph amplifies it). Two defenses — information repartitioning and a Guardian-Agent — reduce over-exposure by 20-50% and trust sensitivity by 38-88% without materially degrading task success.",
    388   "red_flags": [
    389     {
    390       "flag": "No model versions specified",
    391       "detail": "Models are identified only as 'DeepSeek', 'Qwen', 'GPT', and 'Llama-3-8B' without version identifiers. Given that model behavior changes across versions, results may not be reproducible."
    392     },
    393     {
    394       "flag": "Hyperparameter values omitted",
    395       "detail": "The paper repeatedly states hyperparameters are 'fixed' but never reveals the actual values for temperature, top-p, or max tokens. These materially affect LLM output behavior."
    396     },
    397     {
    398       "flag": "No statistical significance tests",
    399       "detail": "All claims of trends and differences are based on comparing point estimates. With single-seed runs and no uncertainty quantification, observed trends could reflect stochastic variation."
    400     },
    401     {
    402       "flag": "Construct validity of synthetic scenarios unexamined",
    403       "detail": "The 19 scenarios are creative but there is no validation that synthetic CK/SK-Agent trust scenarios with explicit τ prompt labels reflect real multi-agent trust dynamics. Telling a model 'You trust Jay very much' tests prompt compliance, not emergent trust behavior."
    404     },
    405     {
    406       "flag": "Artifacts promised but not released",
    407       "detail": "The conclusion promises to 'release the scenario dataset and evaluation pipeline as a reproducible baseline' but no URL, repository, or archive is provided."
    408     },
    409     {
    410       "flag": "Single-seed single-run design",
    411       "detail": "The limitations section acknowledges 'generation stochasticity persists' but all results appear to be single-seed runs. The paper's own proposed future work includes 'multi-seed Monte Carlo,' implicitly admitting this weakness."
    412     }
    413   ],
    414   "cited_papers": [
    415     {
    416       "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents",
    417       "authors": ["E. Debenedetti", "J. Zhang", "M. Balunovic"],
    418       "year": 2024,
    419       "relevance": "Benchmark for evaluating agent safety under prompt injection, testing utility-safety trade-offs for tool-using LLM agents."
    420     },
    421     {
    422       "title": "AgentPoison: Red-Teaming LLM Agents via Poisoning Memory or Knowledge Bases",
    423       "authors": ["Z. Chen", "Z. Xiang", "C. Xiao"],
    424       "year": 2024,
    425       "relevance": "Demonstrates backdoor attacks on LLM agents through memory/RAG poisoning, relevant to agent security evaluation."
    426     },
    427     {
    428       "title": "Multi-Agent Security Tax: Trading Off Security and Collaboration Capabilities in Multi-Agent Systems",
    429       "authors": ["P. Peigné", "M. Kniejski", "F. Sondej"],
    430       "year": 2025,
    431       "relevance": "Directly studies the safety-efficiency trade-off in multi-agent systems, showing defensive strategies reduce coordination capability."
    432     },
    433     {
    434       "title": "A Survey on Trustworthy LLM Agents: Threats and Countermeasures",
    435       "authors": ["M. Yu", "F. Meng", "X. Zhou"],
    436       "year": 2025,
    437       "relevance": "Comprehensive survey of agent safety threats including prompt injection, jailbreaks, RAG contamination, and privacy leakage."
    438     },
    439     {
    440       "title": "Agents in Software Engineering: Survey, Landscape, and Vision",
    441       "authors": ["Y. Wang", "W. Zhong", "Y. Huang"],
    442       "year": 2025,
    443       "relevance": "Survey of agent-based approaches in software engineering, relevant to understanding agentic system architectures."
    444     },
    445     {
    446       "title": "LLM-Based Multi-Agent Systems for Software Engineering: Literature Review, Vision, and the Road Ahead",
    447       "authors": ["Y. Jiang", "T. Zhang", "J. Shi"],
    448       "year": 2025,
    449       "relevance": "Literature review on multi-agent systems in software engineering, covering architectures and orchestration patterns."
    450     },
    451     {
    452       "title": "Can Large Language Model Agents Simulate Human Trust Behavior?",
    453       "authors": ["C. Xie", "C. Chen", "F. Jia"],
    454       "year": 2024,
    455       "relevance": "Studies whether LLM agents exhibit human-like trust behavior, directly relevant to trust modeling in agent systems."
    456     },
    457     {
    458       "title": "Why Do Multiagent Systems Fail?",
    459       "authors": ["M. Z. Pan", "M. Cemri", "L. A. Agrawal"],
    460       "year": 2025,
    461       "relevance": "Analyzes failure modes in multi-agent systems, relevant to understanding group-level misalignment and error propagation."
    462     },
    463     {
    464       "title": "Security and Privacy Challenges of Large Language Models: A Survey",
    465       "authors": ["B. C. Das", "M. H. Amini", "Y. Wu"],
    466       "year": 2025,
    467       "relevance": "Survey of LLM security and privacy challenges including jailbreaks and prompt injection."
    468     },
    469     {
    470       "title": "Prioritizing Safeguarding Over Autonomy: Risks of LLM Agents for Science",
    471       "authors": ["X. Tang", "Q. Jin", "K. Zhu"],
    472       "year": 2024,
    473       "relevance": "Advocates safety-first design for LLM agents, relevant to the paper's argument about trust as a security variable."
    474     },
    475     {
    476       "title": "AgentScope: A Flexible yet Robust Multi-Agent Platform",
    477       "authors": ["D. Gao", "Z. Li", "X. Pan"],
    478       "year": 2024,
    479       "arxiv_id": "2402.14034",
    480       "relevance": "Primary orchestration framework used in experiments; relevant to understanding how frameworks shape agent behavior."
    481     }
    482   ]
    483 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs