scan.json (19994B)
1 { 2 "paper": { 3 "title": "Co-Sight: Enhancing LLM-Based Agents via Conflict-Aware Meta-Verification and Trustworthy Reasoning with Structured Facts", 4 "authors": ["Hongwei Zhang", "Ji Lu", "Shiqing Jiang", "Chenxiang Zhu", "Li Xie", "Chen Zhong", "Haoran Chen", "Yurui Zhu", "Yongsheng Du", "Yanqin Gao", "Lingjun Huang", "Baoli Wang", "Fang Tan", "Peng Zou"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2510.21557" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "The paper states 'Code available here' (footnote 1) but no actual URL is provided in the extracted text. The link appears to be a placeholder or broken reference." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper uses publicly available benchmarks: GAIA, HLE, and Chinese-SimpleQA, all with citations to their public sources." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No environment specifications, dependency lists, or hardware details are provided in the paper." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No reproduction instructions, README, or step-by-step guide is provided." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "All results are reported as single point estimates (e.g., 84.4%, 35.5%, 93.8%) with no confidence intervals or error bars." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims Co-Sight outperforms baselines (e.g., 'exceeding the next best system by 1.0%') but provides no statistical significance tests." 
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports percentage improvements with baseline context, e.g., '+8.6% over the baseline' in Table 2, and '13.4% over the backbone model' for HLE, giving readers enough context to assess magnitude." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No justification is given for the benchmark sizes used. GAIA has 300 questions and Chinese-SimpleQA has 3,000, but there is no discussion of whether these are sufficient for the claims made." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No variance, standard deviation, or multi-run statistics are reported. It is unclear whether experiments were run multiple times." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Multiple baselines are compared: Skywork Deep Research v2, AWorld, Su Zero Ultra on GAIA; Tongyi DeepResearch, Kimi-Researcher, Gemini Deep Research, OpenAI Deep Research on HLE; and majority voting, simple verification, oracle pass@N on Chinese-SimpleQA." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines include 2024-2025 systems such as Skywork Deep Research v2, OpenAI Deep Research, and Gemini Deep Research, which are contemporary and competitive." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Table 2 provides a detailed component ablation on Chinese-SimpleQA with N=2 experts, isolating CAMV, TRSF, and SV contributions. Table 1 varies ensemble size." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": false, 78 "justification": "Only accuracy is reported across all benchmarks. No secondary metrics (e.g., cost, latency, F1) are used for evaluation." 
79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "No human evaluation is included. All evaluations are automated benchmark accuracy scores. Given claims about 'trustworthy' and 'auditable' reasoning, human evaluation of output quality would be relevant." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "GAIA results are reported on the official test set via the HuggingFace leaderboard. HLE is also a separate test benchmark." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "GAIA results are broken down by difficulty level (Level 1, 2, 3) in Figure 3." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": false, 98 "justification": "No failure cases or error analysis is presented. The paper does not show examples where Co-Sight fails or discuss failure modes beyond the brief limitations section." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Table 1 shows that CAMV underperforms oracle pass@N at larger ensemble sizes (N>=3), which is honestly reported and explained as a budget limitation." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims state-of-the-art on GAIA (84.4%) and HLE (35.5%), and strong results on Chinese-SimpleQA (93.8%). These are supported by Figures 3, 4, and Tables 1-2." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "Causal claims like 'CAMV and TRSF jointly yield an overall gain of +8.6%' are supported by controlled ablation studies in Table 2 that manipulate single variables." 
116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title claims to enhance 'LLM-Based Agents' generally, but results are on three specific benchmarks with one backbone model (Gemini 2.5 Pro). The paper does not explicitly bound its claims to this setting." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "No alternative explanations for the results are discussed. For example, the improvement could stem from the specific backbone model choice or the additional compute from running multiple expert agents, but these are not considered." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper mentions 'Gemini 2.5 Pro' as the backbone but provides no snapshot date or API version. This is a marketing name without a precise version identifier." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": false, 137 "justification": "No actual prompt text is provided. The paper describes the system architecture and algorithms but does not include the prompts used for expert agents, the meta-verifier, or the planner/actor components." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "Key hyperparameters are missing: temperature settings for conservative vs radical agents, the consensus threshold θ, the budget Bmax, and sampling parameters are not specified." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "The agentic scaffolding is described in detail: Section 3 covers the planner-actor-toolkit-facts architecture, DAG-based task decomposition, ReAct execution, the four-stage CAMV pipeline, and the three-tier TRSF context compression." 
148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": false, 152 "justification": "No data preprocessing steps are documented. It is unclear how benchmark inputs were prepared or whether any filtering was applied." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 5.1 'Limitations' provides a dedicated discussion of limitations." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "Section 5.1 discusses specific limitations: incomplete expert plans can miss salient errors, multimodal pipeline bounded by vision/parsing module accuracy, and robustness in safety-critical settings not yet established." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper does not explicitly state what the results do NOT show. Limitations mention safety-critical settings but do not bound claims to the tested benchmarks, backbone model, or task types." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "No raw data, intermediate outputs, or detailed per-question results are made available for independent verification." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "The benchmarks used are well-described with citations: GAIA (300 questions, 3 levels), HLE (100+ fields), Chinese-SimpleQA (3,000 questions, 6 domains)." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants; all data comes from standard benchmarks." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": false, 191 "justification": "The pipeline from benchmark input to final score is not documented. 
How expert agent outputs are collected, how the meta-verifier processes them in practice, and how final scores are computed against ground truth are not described in operational detail." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding information or acknowledgments section is present in the paper." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "All authors are affiliated with Zhongxing Telecom Equipment (ZTE), China, which is clearly stated." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding is disclosed. Since all authors are from ZTE (a corporate entity), the funder's independence cannot be assessed." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interest declarations are present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The training data cutoff for Gemini 2.5 Pro is not stated, making it impossible to assess whether benchmark data was in the training set." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of potential train/test overlap. GAIA was published in 2024 and Chinese-SimpleQA in 2024, both potentially within the training data of Gemini 2.5 Pro." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "Benchmark contamination is not discussed despite using public benchmarks with a model that may have trained on them." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 
238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No inference cost, latency, or token consumption is reported despite the system using multiple expert agents and a meta-verification agent, which implies significant computational overhead." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No total computational budget, API costs, or hardware specifications are stated." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Co-Sight achieves state-of-the-art accuracy of 84.4% on the GAIA test benchmark.", 286 "evidence": "Figure 3 shows Co-Sight at 84.4% overall, exceeding Skywork Deep Research v2 (83.4%), AWorld (81.7%), and Su Zero Ultra (80.4%).", 287 "supported": "moderate" 288 }, 289 { 290 "claim": "Co-Sight achieves 35.5% on the HLE benchmark, outperforming all competitors.", 291 "evidence": "Figure 4 shows Co-Sight at 35.5% vs Tongyi DeepResearch (32.9%) and others. 
The improvement is 2.6% over the strongest competitor.", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "The synergy between CAMV and TRSF produces an 8.6% gain over the baseline on Chinese-SimpleQA.", 296 "evidence": "Table 2 shows the full Co-Sight (CAMV+TRSF) at 91.2% vs baseline 82.6%, with controlled ablations isolating each component's contribution.", 297 "supported": "strong" 298 }, 299 { 300 "claim": "CAMV outperforms oracle pass@N for small ensemble sizes (N<=2).", 301 "evidence": "Table 1 shows CAMV at 88.3% vs pass@N at 85.0% for N=1, and 91.2% vs 90.0% for N=2.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "Co-Sight provides a scalable paradigm for reliable long-horizon reasoning.", 306 "evidence": "Scalability claim is supported only by benchmark results on three datasets. No analysis of computational scaling, latency, or cost is provided.", 307 "supported": "weak" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "Co-Sight introduces a conflict-aware meta-verification (CAMV) and structured facts (TRSF) framework for LLM-based multi-agent reasoning. It achieves state-of-the-art on GAIA (84.4%) and HLE (35.5%) benchmarks using Gemini 2.5 Pro as the backbone. Ablation studies on Chinese-SimpleQA demonstrate that both CAMV and TRSF contribute complementary gains, with the full system achieving +8.6% over the baseline. The approach concentrates verification on disagreement hotspots among expert agents rather than full reasoning chains.", 312 "red_flags": [ 313 { 314 "flag": "No uncertainty quantification", 315 "detail": "All results are reported as point estimates (apparently from single runs; the paper does not say whether experiments were repeated), with no error bars, confidence intervals, or variance across runs. The margins of improvement (e.g., 1.0 percentage point on GAIA) could be within noise." 316 }, 317 { 318 "flag": "Missing cost analysis", 319 "detail": "The paper claims scalability and cost-efficiency but reports no actual costs, latency, or token consumption. 
Running multiple expert agents plus a meta-verification agent with Gemini 2.5 Pro implies significant compute cost that is never quantified." 320 }, 321 { 322 "flag": "Single backbone model", 323 "detail": "All experiments use Gemini 2.5 Pro. It is unclear whether Co-Sight's improvements generalize to other LLMs." 324 }, 325 { 326 "flag": "Corporate evaluation of own product", 327 "detail": "All authors are from ZTE. Although Co-Sight is not a commercial ZTE product per se, it appears to be a ZTE-developed system that is being promoted without independent evaluation or disclosed funding." 328 }, 329 { 330 "flag": "Missing hyperparameters", 331 "detail": "Critical hyperparameters (temperature settings for conservative/radical agents, consensus threshold θ, audit budget Bmax) are defined formally but their actual values are never reported." 332 } 333 ], 334 "cited_papers": [ 335 { 336 "title": "ReAct: Synergizing reasoning and acting in language models", 337 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik Narasimhan", "Yuan Cao"], 338 "year": 2023, 339 "relevance": "Foundational agent scaffolding paradigm that Co-Sight builds upon." 340 }, 341 { 342 "title": "GAIA: a benchmark for general AI assistants", 343 "authors": ["Grégoire Mialon", "Clémentine Fourrier", "Thomas Wolf", "Yann LeCun", "Thomas Scialom"], 344 "year": 2024, 345 "relevance": "Primary benchmark used for evaluating agentic reasoning systems." 346 }, 347 { 348 "title": "Humanity's last exam", 349 "authors": ["Long Phan", "Alice Gatti", "Ziwen Han"], 350 "year": 2025, 351 "arxiv_id": "2501.14249", 352 "relevance": "Frontier-difficulty benchmark used to evaluate advanced reasoning capabilities." 353 }, 354 { 355 "title": "Graph of thoughts: Solving elaborate problems with large language models", 356 "authors": ["Maciej Besta"], 357 "year": 2024, 358 "relevance": "Graph-structured prompting method for LLM reasoning that Co-Sight extends." 
359 }, 360 { 361 "title": "Reflexion: Language agents with verbal reinforcement learning", 362 "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath", "Karthik Narasimhan", "Shunyu Yao"], 363 "year": 2023, 364 "relevance": "Self-verification approach for LLM agents relevant to Co-Sight's meta-verification." 365 }, 366 { 367 "title": "Improving factuality and reasoning in language models through multi-agent debate", 368 "authors": ["Yilun Du", "Shuang Li", "Antonio Torralba", "Joshua B Tenenbaum", "Igor Mordatch"], 369 "year": 2024, 370 "relevance": "Multi-agent debate framework that Co-Sight's conflict-aware approach improves upon." 371 }, 372 { 373 "title": "Detecting hallucinations in large language models using semantic entropy", 374 "authors": ["Sebastian Farquhar", "Jannik Kossen", "Lorenz Kuhn", "Yarin Gal"], 375 "year": 2024, 376 "relevance": "Hallucination detection method that motivates Co-Sight's verification approach." 377 }, 378 { 379 "title": "Self-consistency improves chain of thought reasoning in language models", 380 "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"], 381 "year": 2022, 382 "arxiv_id": "2203.11171", 383 "relevance": "Self-consistency voting method used as a baseline comparison." 384 }, 385 { 386 "title": "PlanGEN: A multi-agent framework for generating planning and reasoning trajectories for complex problem solving", 387 "authors": ["Mihir Parmar"], 388 "year": 2025, 389 "arxiv_id": "2502.16111", 390 "relevance": "Multi-agent planning framework that represents the current state of agentic reasoning." 391 }, 392 { 393 "title": "Why do multi-agent LLM systems fail?", 394 "authors": ["Mert Cemri"], 395 "year": 2025, 396 "arxiv_id": "2503.13657", 397 "relevance": "Analysis of failure modes in multi-agent LLM systems, directly relevant to understanding reliability of agentic approaches." 398 } 399 ] 400 }