scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24106B)
      1 {
      2   "paper": {
      3     "title": "CoLM: Collaborative Large Models via A Client-Server Paradigm",
      4     "authors": ["Siqi Huang", "Sida Huang", "Hongyuan Zhang"],
      5     "year": 2025,
      6     "venue": "AAAI 2026",
      7     "arxiv_id": "2511.06991"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper. No supplementary materials with code are referenced."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses publicly available benchmarks: MME, SEED Bench, MMBench, AI2D, OCRBench, MMMU, AlpacaEval 2.0, Arena-Hard, and MT-Bench. All are standard public benchmarks that were not modified."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section is provided. The paper mentions models are accessed via APIs or run locally using 'open-source inference frameworks' but does not specify which frameworks, library versions, or hardware."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The method description gives a high-level overview of the pipeline but not enough detail to reproduce experiments without guessing."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results in Tables 1-4 are reported as single point estimates with no confidence intervals, error bars, or uncertainty measures."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper makes comparative claims (e.g., CoLM improves over baselines) based solely on comparing raw numbers. No statistical significance tests (t-tests, bootstrap tests, etc.) are used."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports absolute improvements (delta values) in Table 3 and percentage/point improvements throughout (e.g., Janus-Pro-7B going from 47.55 to 58.06 average score). The baseline context is always provided alongside the improved results."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is given for the choice of benchmarks, number of benchmarks, or why particular model combinations were selected. No power analysis or sample size discussion."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "All results appear to be from single runs. No standard deviation, variance across seeds, or multi-run statistics are reported anywhere in the paper."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares against standalone model performance (each model without collaboration) and against MoA (Mixture-of-Agents, Wang et al. 2024a) in Table 2. The server output is also included as an upper bound reference."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "MoA (Wang et al. 2024a) is a recent and representative collaborative framework. The standalone models used (GPT-4o, Qwen2.5-VL-7B, Janus-Pro-7B) are contemporary models."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper includes three ablation studies: (1) individual client contributions (Table 3), (2) effect of collaborative user scale (Figure 4), and (3) effect of collaboration rounds (Figure 5)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "For VLMs: MME-P, MME-R, SEEDBench, MMBench, OCRBench, AI2D, MMMU-Val, MMMU-Dev. For LLMs: MT-Bench (1st/2nd turn), AlpacaEval 2.0 (LC Win, Win), Arena-Hard."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The LLM benchmarks (AlpacaEval 2.0, Arena-Hard, MT-Bench) use LLM-as-judge evaluation, not human evaluation. No human evaluation of the system's outputs is performed. Given the paper claims about response quality improvement, human evaluation would be relevant."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "All benchmarks used (MME, SEED Bench, MMBench, MMMU, AlpacaEval, Arena-Hard, MT-Bench) are standard test sets with established evaluation protocols. There is no tuning on these sets."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 1 provides per-benchmark breakdowns for VLMs across 8 reported scores (MME-P, MME-R, SEEDBench, MMBench, OCRBench, AI2D, MMMU-Val, MMMU-Dev). Table 2 provides per-benchmark breakdowns for LLMs. Table 3 provides per-client ablation breakdowns."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses diminishing returns with increasing collaboration rounds (Figure 5), performance drops in some metrics for some models (e.g., Qwen2.5-VL-7B* shows drops in MME-P and OCRBench in Table 1), and notes that using only LLaVA as a collaborator can hurt performance on some benchmarks (Table 3, negative deltas)."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Table 1 shows some metrics decrease for certain models (e.g., Qwen2.5-VL-7B* drops on MME-P from 1693.53 to 1656.04, and OCRBench from 881 to 865). Table 3 shows collaboration with only LLaVA hurts Janus-Pro-7B on most benchmarks. The appendix Table 4 shows Qwen2.5-Coder-1.5B* drops significantly on MT-Bench."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims CoLM 'consistently improves model performance on previously failed queries.' The results in Tables 1-2 generally support consistent improvements across most benchmarks and models, though some individual metrics show decreases."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper makes causal claims that collaboration improves performance. The ablation studies (removing individual client models, varying number of clients, varying rounds) provide controlled manipulations that support these causal claims. Each ablation changes a single variable."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title 'Collaborative Large Models via A Client-Server Paradigm' and abstract claim broad applicability, but experiments are limited to specific model combinations (GPT-4o as server, specific 7B client models). The paper does not bound its claims to these specific configurations. The conclusion section acknowledges limitations about specialized client model availability but frames it as a future opportunity rather than a scope boundary."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for the observed improvements. For example, improvements could be partly due to the strong GPT-4o server model simply overriding weaker client answers rather than genuine 'collaboration.' The paper does not consider whether the gains are primarily from the server model's quality rather than the collaborative framework itself."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper uses 'GPT-4o' without specifying an API version or snapshot date. Other models are named (Qwen2.5-VL-7B-Instruct, Janus-Pro-7B, LLaVA-v1.5-7B) with some specificity, but GPT-4o is used as both a server model and client model without a version identifier."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper describes prompts in natural language (e.g., 'You are an expert in math' as a specialization prompt, and prompts that 'encourage consistency, factual accuracy') but never provides the actual full prompt text used in experiments. The prompts for the server aggregation step and the client refinement step are described only at a high level."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No temperature, top-p, max tokens, or other inference hyperparameters are reported for any of the models used. The paper uses multiple LLM APIs without stating sampling settings."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The three-stage client-server pipeline for LLMs and the two-step process for VLMs are described in the Method section: (1) client models generate responses independently, (2) server synthesizes, (3) clients refine. The VLM variant concatenates responses as contextual input. The workflow is depicted in Figures 2 and 6."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No description of how benchmark data was preprocessed, how images were handled for VLM inputs, or how queries were formatted for different models. The paper goes directly from naming the benchmarks to reporting results."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no dedicated limitations section. The conclusion contains two sentences acknowledging that specialized client-side models 'are not yet widely deployed,' but this is a brief mention, not a substantive discussion of limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No specific threats to validity are discussed. The brief limitation mention in the conclusion is about future deployment scenarios, not about specific threats to the validity of the current experimental results."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show. There is no discussion of which model types, tasks, or deployment scenarios are excluded from the claims. The framing is uniformly positive about broad applicability."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw experimental data (per-example predictions, model outputs, intermediate responses) is made available. Only aggregated benchmark scores are reported."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The data collection is straightforward: all benchmarks used are named and cited (MME, SEED Bench, MMBench, AI2D, OCRBench, MMMU, AlpacaEval 2.0, Arena-Hard, MT-Bench). The models used are listed with their sources (OpenAI API, Hugging Face)."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants were involved. All data comes from standard public benchmarks."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The pipeline from query input through client generation, server aggregation, and client refinement is described at a high level, but the actual data processing pipeline (how benchmark queries are fed in, how responses are collected and scored) is not documented."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is present in the paper. The authors are affiliated with China Telecom's AI institute (TeleAI), Northwestern Polytechnical University, and the University of Hong Kong, but no funding sources are disclosed."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: Institute of Artificial Intelligence (TeleAI) at China Telecom, Northwestern Polytechnical University, and The University of Hong Kong."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "The first author's affiliation is with TeleAI (China Telecom's AI institute). Since no funding is disclosed, independence cannot be assessed. The paper does not evaluate China Telecom's own products, but the corporate affiliation and lack of funding disclosure mean independence cannot be verified."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement or financial interest declaration is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper evaluates pre-trained models (GPT-4o, Qwen, DeepSeek, LLaVA) on benchmarks but does not state any training data cutoff dates for these models."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether the benchmark data could have been in the training data of any of the models used. Several benchmarks (e.g., MME, MMMU) are well-known and could be in training data."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "Many benchmarks used (e.g., SEED Bench 2023, MMBench 2024, MT-Bench 2023) were published before the models' likely training cutoffs. No contamination analysis is performed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants involved. This is a benchmark evaluation study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants involved."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants involved."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants involved."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants involved."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants involved."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants involved."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "The paper proposes a method that makes multiple API calls (client models generate, server aggregates, clients refine) but reports no API costs, token counts, or latency measurements. This is especially relevant since the paper argues for 'efficient collaboration' and a 'deployment-friendly framework.'"
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No GPU hours, total API spend, hardware specifications, or total computation time are reported. The paper claims practical deployment benefits but provides no quantification of the computational overhead."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CoLM consistently improves model performance on previously failed queries through collaborative client-server reasoning.",
    286       "evidence": "Tables 1 and 2 show improvements across most benchmarks for both VLMs and LLMs. Table 1 shows average score improvements for all tested VLMs (e.g., Janus-Pro-7B from 47.55 to 58.06, LLaVA-1.5-7B from 38.62 to 56.13). Table 2 shows LLM improvements across MT-Bench, AlpacaEval 2.0, and Arena-Hard.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "Weaker models benefit more from collaboration than stronger models.",
    291       "evidence": "Table 1 shows larger absolute gains for weaker models: LLaVA-1.5-7B gains +17.51 average score points vs. GPT-4o gaining +1.02. The pattern holds for LLMs in Table 2 where initially weak models (e.g., Deepseek-Math-7B) show larger improvements.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Full collaboration with all client models outperforms collaboration with any single client model.",
    296       "evidence": "Table 3 shows that Janus-Pro-7B* (all models collaborating) achieves the best performance across all benchmarks compared to single-model collaboration variants.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Increasing the number of collaborative users consistently improves performance.",
    301       "evidence": "Figure 4 shows performance increasing with more clients across three LLM benchmarks, though with diminishing returns.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "CoLM is practical for real-world deployment under client-server architectures.",
    306       "evidence": "The paper provides theoretical motivation for the client-server paradigm alignment but no empirical evidence of deployment practicality (no latency, cost, or scalability measurements).",
    307       "supported": "weak"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "CoLM proposes a client-server paradigm for collaborative LLM/VLM reasoning where lightweight client models generate initial responses, a central server model (GPT-4o) synthesizes them, and clients refine based on server guidance. Experiments across 8 VLM scores (drawn from 6 benchmarks: MME, SEED Bench, MMBench, AI2D, OCRBench, MMMU) and 3 LLM benchmarks show improvements on most metrics, especially for weaker models (e.g., LLaVA-1.5-7B average score from 38.62 to 56.13), though a few individual metrics decrease for some models. Ablation studies demonstrate that full multi-model collaboration outperforms single-model collaboration, with diminishing returns as more clients or rounds are added.",
    312   "red_flags": [
    313     {
    314       "flag": "No uncertainty quantification",
    315       "detail": "All results are single-run point estimates with no confidence intervals, error bars, standard deviations, or significance tests. Given the stochastic nature of LLM inference (especially with API-based models like GPT-4o), the reported improvements could be within noise ranges."
    316     },
    317     {
    318       "flag": "Missing cost analysis for a cost-efficiency-motivated method",
    319       "detail": "The paper motivates CoLM partly on deployment efficiency but reports zero cost metrics. The method requires multiple model inferences per query (all client models + server model + client refinement), which could be substantially more expensive than a single model call. Without cost data, the practical deployment claim is unsupported."
    320     },
    321     {
    322       "flag": "Strong server model confound",
    323       "detail": "GPT-4o serves as the server model. Improvements to weaker client models (7B parameter models) could largely be attributed to GPT-4o's superior capabilities simply overriding poor answers, rather than genuine collaborative synergy. The paper does not control for this by testing with a weaker server model."
    324     },
    325     {
    326       "flag": "No reproducibility materials",
    327       "detail": "No code, prompts, hyperparameters, or environment specifications are provided. The method description is at a high level, making independent reproduction difficult."
    328     },
    329     {
    330       "flag": "No limitations section",
    331       "detail": "The paper lacks a dedicated limitations or threats-to-validity section. Only two sentences in the conclusion briefly acknowledge the availability of specialized client models as a limitation."
    332     }
    333   ],
    334   "cited_papers": [
    335     {
    336       "title": "Mixture-of-Agents Enhances Large Language Model Capabilities",
    337       "authors": ["J. Wang", "J. Wang", "B. Athiwaratkun", "C. Zhang", "J. Zou"],
    338       "year": 2024,
    339       "arxiv_id": "2406.04692",
    340       "relevance": "Directly compared baseline (MoA) for multi-model collaborative LLM reasoning."
    341     },
    342     {
    343       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    344       "authors": ["L. Chen", "M. Zaharia", "J. Zou"],
    345       "year": 2024,
    346       "relevance": "Cascading LLM inference method for cost-efficient model selection, related to deployment efficiency claims."
    347     },
    348     {
    349       "title": "RouteLLM: Learning to Route LLMs with Preference Data",
    350       "authors": ["I. Ong", "A. Almahairi", "V. Wu", "W.-L. Chiang", "T. Wu", "J. E. Gonzalez", "M. W. Kadous", "I. Stoica"],
    351       "year": 2024,
    352       "arxiv_id": "2406.18665",
    353       "relevance": "Dynamic model routing for cost-efficient LLM inference, related approach to multi-model deployment."
    354     },
    355     {
    356       "title": "Improving factuality and reasoning in language models through multiagent debate",
    357       "authors": ["Y. Du", "S. Li", "A. Torralba", "J. B. Tenenbaum", "I. Mordatch"],
    358       "year": 2023,
    359       "relevance": "Multi-agent debate framework for improving LLM reasoning through model collaboration."
    360     },
    361     {
    362       "title": "Rethinking Mixture-of-Agents: Is Mixing Different Large Language Models Beneficial?",
    363       "authors": ["W. Li", "Y. Lin", "M. Xia", "C. Jin"],
    364       "year": 2025,
    365       "arxiv_id": "2502.00674",
    366       "relevance": "Critical analysis of multi-model mixture approaches for LLMs, directly relevant to evaluating collaborative model paradigms."
    367     },
    368     {
    369       "title": "Think Deep, Think Fast: Investigating Efficiency of Verifier-free Inference-time-scaling Methods",
    370       "authors": ["J. Wang", "S. Zhu", "J. Saad-Falcon", "B. Athiwaratkun", "Q. Wu", "J. Wang", "S. L. Song", "C. Zhang", "B. Dhingra", "J. Zou"],
    371       "year": 2025,
    372       "arxiv_id": "2504.14047",
    373       "relevance": "Studies inference-time scaling methods for LLMs, relevant to understanding computational efficiency of collaborative approaches."
    374     },
    375     {
    376       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    377       "authors": ["L. Zheng", "W.-L. Chiang", "Y. Sheng", "S. Zhuang", "Z. Wu", "Y. Zhuang", "Z. Lin", "Z. Li", "D. Li", "E. P. Xing", "H. Zhang", "J. E. Gonzalez", "I. Stoica"],
    378       "year": 2023,
    379       "arxiv_id": "2306.05685",
    380       "relevance": "Defines MT-Bench and LLM-as-judge evaluation methodology used as a primary benchmark in this paper."
    381     },
    382     {
    383       "title": "Length-Controlled AlpacaEval: A Simple Way to Debias Automatic Evaluators",
    384       "authors": ["Y. Dubois", "B. Galambosi", "P. Liang", "T. B. Hashimoto"],
    385       "year": 2024,
    386       "arxiv_id": "2404.04475",
    387       "relevance": "AlpacaEval 2.0 benchmark with length-controlled debiasing, used as a primary LLM evaluation benchmark."
    388     },
    389     {
    390       "title": "Eagle: Efficient training-free router for multi-LLM inference",
    391       "authors": ["Z. Zhao", "S. Jin", "Z. M. Mao"],
    392       "year": 2024,
    393       "arxiv_id": "2409.15518",
    394       "relevance": "Training-free multi-LLM routing approach, related to model selection in multi-model inference systems."
    395     }
    396   ]
    397 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs