ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (23304B)


      1 {
      2   "paper": {
      3     "title": "CooperBench: Why Coding Agents Cannot be Your Teammates Yet",
      4     "authors": ["Arpandeep Khatua", "Hao Zhu", "Peter Tran", "Arya Prabhudesai", "Frederic Sadrieh", "Johann K. Lieberwirth", "Xinkai Yu", "Yicheng Fu", "Michael J. Ryan", "Jiaxin Pei", "Diyi Yang"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2601.13295"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The paper states 'We open-source this framework' and provides the URL https://cooperbench.com. The benchmark platform is released."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "CooperBench comprises 652 tasks across 12 repositories and is released as an open benchmark. The paper states 'We release CooperBench as an open benchmark.'"
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Section 2.3 Stage III describes containerized environments with automated setup scripts that clone repos at exact base commits, install dependencies, and verify test suites. Docker-based containers are used for agent execution (§3)."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper describes the full evaluation pipeline (§2.2), agent framework setup based on OpenHands v0.54 (§3), and Stage III describes automated setup scripts. The open-sourced framework is intended for reproducibility."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The paper states 'All error bars in Fig. 4 are 95% Wilson confidence intervals computed over task sets (App. C).'"
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "The paper states 'The difference between \"with comm\" and \"no comm\" settings is not statistically significant' (§5), indicating significance testing was performed. The paper also describes communication as 'significantly' reducing merge conflicts."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Effect sizes are reported throughout: 'around 50% lower than a Solo baseline', 'performance drops from 68.6% with 2 agents to 46.5% with 3 agents and further to 30.0% with 4 agents', specific success rates per model (e.g., GPT-5: 0.48 Solo vs 0.28 Coop)."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The benchmark has 652 tasks across 12 repositories, but no justification is given for why this size is sufficient. The 2-to-4 agent scaling experiment uses only 46 tasks from 3 task sets with no power analysis."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "95% Wilson confidence intervals are reported for the main results in Fig. 4, which convey the spread/uncertainty of the estimates."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The Solo baseline (one agent doing both features) is the primary comparison. A 'no comm' baseline is also used to isolate the effect of communication (§5)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper evaluates GPT-5, Claude Sonnet 4.5, MiniMax-M2, and Qwen3 models — all contemporary frontier models at time of writing."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The communication ablation (with comm vs. no comm, §5) and the agent scaling experiment (2 to 4 agents, §4) serve as ablation-style studies isolating specific factors."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper reports success rate (task pass rate), conflict rate (merge compatibility), communication overhead (% of actions), and per-model breakdowns. Solo vs Coop gap is a distinct metric."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Manual qualitative coding of 50 failed Coop traces was performed (§6.2). The LLM-as-judge annotation for failure symptoms was validated with human evaluation (§6.1, App. G)."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "All 652 tasks are used for evaluation. There is no mention of a held-out test set or dev/test split — the entire benchmark is used for reporting results."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by model, by difficulty level (Fig. 4 Right), by communication type (Fig. 5c), by failure symptom category (Tab. 1), and by failure cause (Tab. 2)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 6 provides extensive failure analysis with a taxonomy of failure symptoms (Tab. 1), root causes (Tab. 2), and representative examples of each capability gap (§6.3)."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The main finding IS a negative result: agents perform worse when cooperating. Communication does not improve success rates (§5). Prompt optimization yields only marginal improvements (App. D)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims agents achieve '30% lower success rates when working together' — the results show ~50% lower for top models (0.48→0.28 for GPT-5). The 30% is described as 'on average' across all models including weaker ones, which is consistent with the data shown."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims like 'communication reduces merge conflicts' are supported by controlled ablation (with/without communication). The 'curse of coordination' is demonstrated by comparing Solo vs Coop under controlled conditions with the same tasks and models."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper claims 'our findings generalize to any domain involving role and resource conflicts under partial observability' (§8) — a very broad generalization from software engineering tasks only. The title 'Why Coding Agents Cannot be Your Teammates Yet' generalizes beyond the 5 models and specific agent framework tested."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper discusses alternative explanations: the spatial vs semantic coordination gap (§5), the trust paradox as an alternative explanation for expectation failures (§6.3), and that prompt optimization might help but yields only marginal improvements (App. D). The agent framework itself is acknowledged as a potential confound (§3)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Models are listed as 'GPT-5', 'Claude 4.5 Sonnet', 'MiniMax-M2', 'Qwen3-Coder-30B-A3B-Instruct', and 'Qwen3-30B-A3B-Instruct-2507'. No API snapshot dates or version identifiers are given for GPT-5, Claude, or MiniMax."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The system prompts and task prompts given to agents are described in natural language but the actual prompt text is not provided in the paper or appendix. The paper says agents receive feature descriptions in markdown but does not show the system prompt or scaffolding prompts."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No mention of temperature, top-p, max tokens, or other API hyperparameters for the model calls. The action limit of 100 is stated but sampling parameters are absent."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The agent framework is described: OpenHands v0.54 with a custom communication tool using SQL database for message passing (§3). Docker containers for isolation, real-time async communication, and the action space (communication + computer-use tools) are described in §2.1."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The three-stage dataset construction pipeline is thoroughly documented (§2.3, Fig. 3): repository/PR selection criteria, feature extraction and augmentation process, and environment setup. Inclusion criteria for PRs are stated."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion (§8) mentions future directions but does not substantively discuss limitations of the current study."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No specific threats to validity are discussed. The paper does not address potential confounds like the specific agent framework choice, the 100-action limit, or whether results would differ with different communication mechanisms."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show. Instead, §8 claims broad generalization: 'our findings generalize to any domain involving role and resource conflicts under partial observability.' No explicit scope boundaries are drawn."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The benchmark is open-sourced at cooperbench.com, which should include the task specifications, tests, and ground-truth solutions. Agent trajectories are analyzed and the framework is released."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 2.3 describes the three-stage pipeline in detail: repository selection criteria (1K+ stars, not in SWE-bench), PR selection constraints, feature extraction, and environment validation."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are recruited for evaluation. The benchmark evaluates AI agents, not humans. The dataset creators are the paper's co-authors."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The full pipeline from PR selection through feature pool creation to evaluation is documented in §2.3 with Fig. 3. The evaluation pipeline (merge + test) is in §2.2. Statistics on task composition (77.3% conflicting ground-truth solutions) are provided."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Acknowledgments section lists funding: ONR grant N000142412532, NSF grant IIS-2247357, DSO National Laboratories, SAP support, Google Cloud Platform and Modal Platform credits."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: Stanford University and SAP Labs US. The paper evaluates third-party models (GPT-5, Claude, MiniMax, Qwen), not SAP products."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Funders (ONR, NSF, DSO, SAP) do not have a direct financial stake in whether coding agents succeed or fail at cooperation. The paper evaluates third-party models, not products from its funders."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No training cutoff dates are stated for any of the five models evaluated."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "The paper addresses contamination: 'Each repository exceeds one thousand GitHub stars and does not appear in SWE-Bench or Multi-SWE-Bench, reducing data contamination risk' (§2.3). Features are newly created by co-authors, reducing overlap with training data."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "The benchmark is newly constructed with expert-written features, tests, and ground-truth code. The paper explicitly avoids repositories in SWE-Bench/Multi-SWE-Bench and creates new features rather than using existing issues/PRs directly, addressing contamination risk."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants are studied. The paper evaluates AI agents on benchmark tasks."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants. The study evaluates AI agents."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in the evaluation."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No API costs, token counts, or per-task inference costs are reported despite using multiple commercial API models (GPT-5, Claude, MiniMax) at scale across 652 tasks with multiple settings."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total compute budget is stated. Google Cloud Platform and Modal credits are acknowledged but the actual compute expenditure is not quantified."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Agents achieve on average 30% lower success rates when cooperating compared to solo performance (the 'curse of coordination').",
    286       "evidence": "Fig. 4 shows Solo vs Coop success rates: GPT-5 (0.48 vs 0.28), Claude (0.47 vs 0.26), MiniMax (0.36 vs 0.14), Qwen Coder (0.22 vs 0.13), Qwen (0.06 vs 0.05). 95% Wilson CIs reported.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Communication does not lead to better cooperation success despite heavy usage.",
    291       "evidence": "Fig. 5(a) shows with-comm vs no-comm success rates are not statistically significantly different across all models, while Fig. 5(c) shows agents spend up to 20% of actions on communication.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Communication reduces merge conflicts but not task success.",
    296       "evidence": "Fig. 5(b) shows significant reduction in conflict rates with communication for Claude, GPT-5, MiniMax, and Qwen Instruct. This is attributed to spatial vs semantic coordination gap (§5).",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Coordination failures stem from three capability gaps: expectation (42%), commitment (32%), and communication (26%).",
    301       "evidence": "Table 2 reports these percentages based on manual review of 50 failed Coop traces (§6.2). Representative examples provided in §6.3.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Increasing the number of agents monotonically decreases success: 68.6% (2 agents) → 46.5% (3) → 30.0% (4).",
    306       "evidence": "Small-scale experiment using 46 tasks from 3 task sets (§4). No confidence intervals or significance tests reported for this specific experiment.",
    307       "supported": "weak"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval", "qualitative"],
    311   "key_findings": "CooperBench demonstrates a 'curse of coordination' where frontier coding agents (GPT-5, Claude Sonnet 4.5) achieve only ~25% success when two agents cooperate, roughly half their solo performance on the same workload. Communication between agents reduces merge conflicts but does not improve task success, because agents solve spatial coordination (avoiding overlapping edits) but fail at semantic coordination (ensuring compatible implementations). Qualitative analysis of failures reveals three root causes: expectation failures (42%), commitment failures (32%), and communication breakdowns (26%). Rare emergent coordination behaviors (role division, resource division, negotiation) appear in successful traces, suggesting the capability exists but is unreliable.",
    312   "red_flags": [
    313     {
    314       "flag": "No limitations section",
    315       "detail": "The paper lacks a dedicated limitations or threats-to-validity section. Potential confounds like the specific agent framework (OpenHands v0.54), communication mechanism (SQL-based message passing), and 100-action limit are not discussed as threats to validity."
    316     },
    317     {
    318       "flag": "Overbroad generalization",
    319       "detail": "Section 8 claims 'our findings generalize to any domain involving role and resource conflicts under partial observability' based solely on software engineering tasks with a specific agent framework and communication tool."
    320     },
    321     {
    322       "flag": "Small sample for scaling claim",
    323       "detail": "The 2-to-4 agent scaling experiment uses only 46 tasks from 3 task sets with no confidence intervals or significance tests reported, yet the result is presented as reinforcing the curse of coordination."
    324     },
    325     {
    326       "flag": "Missing hyperparameters",
    327       "detail": "Temperature, top-p, and other sampling parameters are not reported for any of the five models, despite these settings significantly affecting agent behavior."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "Why do multi-agent llm systems fail?",
    333       "authors": ["Mert Cemri", "Melissa Z. Pan", "Shuyi Yang"],
    334       "year": 2025,
    335       "arxiv_id": "2503.13657",
    336       "relevance": "Directly relevant: analyzes failure modes in multi-agent LLM systems, finding inter-agent misalignment as a major failure category."
    337     },
    338     {
    339       "title": "OpenHands: An open platform for AI software developers as generalist agents",
    340       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    341       "year": 2025,
    342       "relevance": "The agent framework used in CooperBench experiments; a major open-source coding agent platform."
    343     },
    344     {
    345       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    346       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"],
    347       "year": 2024,
    348       "relevance": "Foundational single-agent coding benchmark that CooperBench extends to multi-agent cooperative settings."
    349     },
    350     {
    351       "title": "MetaGPT: Meta programming for a multi-agent collaborative framework",
    352       "authors": ["Sirui Hong", "Mingchen Zhuge", "Jonathan Chen"],
    353       "year": 2024,
    354       "relevance": "Multi-agent framework that emulates software organizations with structured interaction, contrasted with CooperBench's free-form coordination."
    355     },
    356     {
    357       "title": "ChatDev: Communicative agents for software development",
    358       "authors": ["Chen Qian", "Wei Liu", "Hongzhang Liu"],
    359       "year": 2024,
    360       "relevance": "Multi-agent coding framework using communicative agents, relevant to evaluating structured vs free-form agent collaboration."
    361     },
    362     {
    363       "title": "Magentic-One: A generalist multi-agent system for solving complex tasks",
    364       "authors": ["Adam Fourney", "Gagan Bansal", "Hussein Mozannar"],
    365       "year": 2024,
    366       "arxiv_id": "2411.04468",
    367       "relevance": "Multi-agent system using explicit orchestrators, representing the scaffolded approach that CooperBench argues masks coordination problems."
    368     },
    369     {
    370       "title": "CAMEL: Communicative agents for 'mind' exploration of large language model society",
    371       "authors": ["Guohao Li", "Hasan Abed Al Kader Hammoud"],
    372       "year": 2023,
    373       "relevance": "Foundational multi-agent conversation framework relevant to understanding agent communication and coordination."
    374     },
    375     {
    376       "title": "Agentless",
    377       "authors": ["Chunqiu Steven Xia"],
    378       "year": 2024,
    379       "relevance": "Single-agent coding approach achieving strong SWE-bench results, providing context for the solo vs cooperative performance gap."
    380     },
    381     {
    382       "title": "The Collaboration Gap",
    383       "authors": ["Tim R. Davidson", "Adam Fourney", "Saleema Amershi"],
    384       "year": 2025,
    385       "arxiv_id": "2511.02687",
    386       "relevance": "Directly related work finding that solo-capable models degrade when required to collaborate, corroborating CooperBench findings."
    387     },
    388     {
    389       "title": "Collaborative Gym: A framework for enabling and evaluating human-agent collaboration",
    390       "authors": ["Yijia Shao", "Vinay Samuel"],
    391       "year": 2025,
    392       "arxiv_id": "2412.15701",
    393       "relevance": "Framework for human-agent collaboration evaluation, complementary to CooperBench's agent-agent focus."
    394     },
    395     {
    396       "title": "Sotopia-pi: Interactive learning of socially intelligent language agents",
    397       "authors": ["Ruiyi Wang", "Haofei Yu"],
    398       "year": 2024,
    399       "relevance": "Training method for social intelligence in agents, proposed by CooperBench authors as a path to improving coordination."
    400     },
    401     {
    402       "title": "Scaling agents via continual pre-training",
    403       "authors": ["Liangcai Su", "Zhen Zhang"],
    404       "year": 2025,
    405       "arxiv_id": "2509.13310",
    406       "relevance": "Finding that multi-agent configurations degrade performance by 39-70% relative to single-agent baselines, corroborating curse of coordination."
    407     }
    408   ]
    409 }

Impressum · Datenschutz