ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27931B)


      1 {
      2   "paper": {
      3     "title": "AnimAgents: Coordinating Multi-Stage Animation Pre-Production with Human–Multi-Agent Collaboration",
      4     "authors": [
      5       "Wen-Fan Wang",
      6       "Chien-Ting Lu",
      7       "Jin Ping Ng",
      8       "Yi-Ting Chiu",
      9       "Ting-Ying Lee",
     10       "Miaosen Wang",
     11       "Bing-Yu Chen",
     12       "Xiang 'Anthony' Chen"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv",
     16     "arxiv_id": "2511.17906"
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper. The system is described in detail but no source code is released."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No dataset, interview transcripts, survey responses, or interaction logs are released. The paper describes collecting qualitative and quantitative data from participants but does not provide any download links."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions the system is built on AutoGen Core 0.6.1, uses FastAPI and Next.js, GPT-4.1-mini, ChromaDB, and Flux.1-Kontext-pro (Section 5.4), but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions sufficient to recreate the environment."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The system architecture is described but there are no instructions for replicating the experiments."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Results are reported as Likert-scale distributions with p-values from Wilcoxon signed-rank tests, but no confidence intervals or error bars are provided for the main survey results in Figures 9 and 10."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper uses Wilcoxon signed-rank tests throughout (Section 6.1.4), reporting p-values (p < .01 and p < .05) for comparisons between AnimAgents and the baseline. A paired t-test is also used for task completion time (Section 7.1.3). One-sample Wilcoxon signed-rank test is used for the open-ended task (Section 6.1.4)."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper reports p-values and percentage preferences but does not report standardized effect sizes (e.g., Cohen's d, r, or rank-biserial correlation). The 13% reduction in task time (Section 7.1.3) provides some baseline context but is not a standardized effect size measure."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The summative study uses N=16 and the field study uses N=4. No power analysis or justification for these sample sizes is provided. The paper does not acknowledge whether N=16 is sufficient for the statistical claims being made."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Table 3 reports mean ± standard deviation for interaction metrics (e.g., 'Total messages 22.00±7.47' for Baseline, '27.12±7.96' for AnimAgents). Task completion time is also reported with means (Section 7.1.3)."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "A single-agent baseline is included that uses the same AutoGen framework, LLM, and T2I model. It is described as 'intentionally stronger than typical industry workflows' (Section 6). The open-ended task also compares against participants' prior GenAI workflows."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The baseline uses the same contemporary models (GPT-4.1-mini, Flux.1-Kontext-pro) and framework (AutoGen Core 0.6.1). The paper acknowledges no other MAS supports tailored human-AI collaboration for animation pre-production, justifying why no MAS baseline is included (Section 6)."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "No ablation study is conducted. The system has multiple components (Core Agent, Specialized Agents, stage-specific boards, block-based interactions) but none are individually removed to measure their contribution. The comparison is only against a single-agent baseline, not ablated versions of AnimAgents."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Multiple metrics are used: 17 Likert-scale survey items across three research dimensions (Figures 9 and 10), task completion time, interaction log analysis with four message categories (Table 3), and qualitative interview data."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The entire evaluation is human-centered: 16 participants rated both systems on Likert scales, provided qualitative feedback in interviews, and 4 participants conducted a week-long field study with diary documentation and follow-up interviews."
     93       },
     94       "held_out_test_set": {
     95         "applies": false,
     96         "answer": false,
     97         "justification": "This is a user study evaluating a creative tool, not a benchmark evaluation with train/test splits. The concept of held-out test sets does not apply."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down across three research dimensions (RQ1: Coordination & Continuity, RQ2: Organization & Traceability, RQ3: Agency & Creative Exploration) with per-question breakdowns in Figures 9 and 10 (17 individual questions), and per-category interaction analysis in Table 3."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Multiple failure cases are discussed: Core Agent misrouting tasks (Section 7.3.1, 9.1.2), stage rigidity issues (Section 7.1.1, P20, P23), divergence from scripts for long text inputs (Section 7.1.2, P15), memory retrieval failures (Section 9.1.3), and limitations in original animation contexts (Section 8.2.1, P24)."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Several non-significant results are reported: Q12 (full creative control) p = .06, Q11 (convergence on useful directions) p = .10 (Section 7.3). The field study reveals AnimAgents is less suitable for original animation with high narrative depth requirements (Section 8.2.1). Revisions decreased (Table 3), which the paper interprets positively but which also represents a trade-off."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims AnimAgents 'significantly outperformed a strong single-agent baseline' in 'coordination, consistency, information management, and overall satisfaction (p < .01),' which is supported by the Wilcoxon signed-rank test results in Figure 9 (Q1-Q5, Q6-Q7, Q15-Q16). The field deployment claim is supported by Section 8."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper uses a within-subjects experimental design (Section 6.1) with counterbalanced system order and topics, which is an adequate design for causal inference about the system's effects. The causal claims (e.g., 'AnimAgents significantly improved participants' ability to coordinate') are supported by this controlled comparison."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title and abstract present AnimAgents broadly as a system for 'animation pre-production' without bounding generalization. The study involves only 16 participants from Taiwan's animation community. The paper's conclusions in Section 10 claim improvements in 'coordination, information management, and creative agency' without explicitly bounding these to the tested population, culture, or project types."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Section 9.1 discusses alternative explanations including: stage rigidity as a limitation (Section 9.1.1), LLM instability creating errors that could confound results (Section 9.1.2), memory retrieval failures affecting performance (Section 9.1.3). The discussion also considers that single-agent systems may be more flexible for unexpected changes (Section 9.1.1). Section 9.5 notes reliance on self-report as a limitation."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The paper specifies 'GPT-4.1-mini' and 'GPT-4.1-nano' (Section 5.4.1) with a footnote linking to the model page, but these are marketing names without a snapshot date or API version. 'Flux.1-Kontext-pro' is similarly specified only by product name."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The paper describes the Core Agent's system prompt contents at a high level (Section 5.2.1: 'base system prompt defining its high-level responsibilities, stage definitions, management rules, delegation logic, and user interaction guidelines') but does not provide the actual prompt text. Stage-specific prompts and Specialized Agent prompts are described functionally but not reproduced."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "No hyperparameters are reported: no temperature, top-p, max tokens, or other LLM API settings are mentioned anywhere in the paper."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The agentic scaffolding is described in considerable detail: Section 5.2 describes the Core Agent and Specialized Agent architecture, task delegation flow, result validation, parallel execution, and direct communication modes. Section 5.4 covers memory management (indexing-retrieval-reading), error handling with CancellationTokens, and the image generation pipeline. Figure 4 provides a system architecture diagram."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "For the formative study, thematic analysis is described (Section 4.3). For the summative study, message classification into four categories is documented (Section 7.4). Interview transcription and coding procedures are described (Section 6.1.4). The survey instrument design process and question derivation from research questions are documented."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 9.5 is titled 'Limitation and Future Work' and discusses study design limitations, centralized orchestration issues, cross-board relation challenges, and potential for role-play. Additional limitations are discussed throughout Section 9 (Sections 9.1-9.4)."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "The paper discusses specific threats: self-report methodology without external expert reviews (Section 9.5), centralized orchestration causing misdirected tasks, lack of inter-agent communication, cross-board lineage becoming challenging with many blocks, and the specific issue that only small short-term deployments were tested while animation pre-production spans weeks or months (Section 9.3)."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "While the paper discusses limitations, it does not explicitly state what the results do NOT show or what populations/settings are excluded. It acknowledges 'large-scale adoption remains untested' (Section 9.3) but does not systematically bound the scope of its claims regarding cultural context, project types, team sizes, or participant experience levels."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw data is available: survey responses, interview transcripts, interaction logs, and task outputs are not released for independent verification."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Data collection procedures are described for all three studies: formative study interviews (1.5-2 hours, Section 4.2), summative study procedure (3 hours including tutorials, tasks, questionnaires, and interviews, Section 6.1.2), and field study (week-long deployment with diaries and 60-minute interviews, Section 8.1)."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Recruitment methods are described: formative study recruited 'through personal referrals in the local animation community and direct email to studios' (Section 4.1). For the summative study, 5 participants from the formative study returned, and others were recruited (Section 6.1.1). Field study participants included 2 summative study volunteers and 2 recruited through 'direct outreach, including a system demo and interview' (Section 8.1)."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The analysis pipeline is documented: formative study used thematic analysis with initial coding by one author refined with co-authors (Section 4.3). Summative study used Wilcoxon signed-rank tests for Likert data (Section 6.1.4). Interaction logs were classified into four categories (Section 7.4). Interview coding was led by first author with animation studio experience and refined with two researchers (Section 6.1.4)."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The 'Acknowledgments' section states: 'This work was supported by the National Science and Technology Council' (Section 10/Acknowledgments)."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly disclosed: six authors from National Taiwan University, one from Google DeepMind, and one from UCLA. These are listed on the first page."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "The National Science and Technology Council is a government funding body that has no financial interest in the outcome of the research on animation pre-production tools."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is provided. One author is affiliated with Google DeepMind, and the system uses GPT-4.1 (OpenAI), but there is no statement about whether any authors hold patents, equity, or other financial interests related to the findings."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "This paper evaluates a human-AI creative tool through user studies, not a pre-trained model's capability on a benchmark. The LLMs are used as components of the system, not evaluated for their knowledge."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "No benchmark evaluation of model knowledge is conducted. The study evaluates a collaborative system through user studies, so train/test overlap is not relevant."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "No benchmark evaluation is conducted. The study uses user studies with creative tasks, not standardized benchmarks."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "No pre-registration link (OSF, AsPredicted, etc.) is mentioned for any of the three studies."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "No IRB or ethics board approval is mentioned for any of the three studies involving human participants (formative study with 12, summative study with 16, field study with 4)."
    252       },
    253       "demographics_reported": {
    254         "applies": true,
    255         "answer": true,
    256         "justification": "Table 1 provides detailed demographics: age, years of experience, industry, role (creative director vs. independent animator), and GenAI tools used for all 24 participants across all three studies."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": true,
    260         "answer": false,
    261         "justification": "The paper states participants were 'professional animation creators' but does not specify formal inclusion or exclusion criteria. No screening process is described beyond being in the animation field."
    262       },
    263       "randomization_described": {
    264         "applies": true,
    265         "answer": true,
    266         "justification": "The summative study uses a within-subjects design where 'system order and topics were counterbalanced' (Section 6.1.2), which describes the randomization/counterbalancing procedure for the controlled experiment."
    267       },
    268       "blinding_described": {
    269         "applies": true,
    270         "answer": false,
    271         "justification": "No blinding is described. Participants could clearly see which system they were using (AnimAgents has a multi-board interface vs. the single-board baseline), and no mention is made of whether this was considered or how it might affect results."
    272       },
    273       "attrition_reported": {
    274         "applies": true,
    275         "answer": false,
    276         "justification": "No attrition information is provided. The paper does not state whether all recruited participants completed all tasks or whether any dropped out."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No API costs, tokens consumed, or per-session costs are reported, despite the system making multiple calls to GPT-4.1-mini, GPT-4.1-nano, and Flux.1-Kontext-pro per user session."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No total computational budget, API spend, or hardware specifications for running the system are reported."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "AnimAgents significantly outperformed the single-agent baseline in coordination, consistency, information management, and overall satisfaction (p < .01).",
    295       "evidence": "Within-subjects study with 16 professional creators; Wilcoxon signed-rank tests on 7-point Likert scale responses shown in Figure 9 (Q1-Q5 for coordination, Q6-Q9 for organization, Q15-Q16 for satisfaction). Most comparisons reach p < .01, some at p < .05.",
    296       "supported": "moderate"
    297     },
    298     {
    299       "claim": "AnimAgents reduced task completion time by approximately 13% compared to the baseline (Mean 34.4 vs 39.6).",
    300       "evidence": "Paired t-test on task completion times for 16 participants in the within-subjects study (Section 7.1.3), p < .01.",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "AnimAgents shifted interaction patterns toward higher-level control, with more directive messages (22% vs 10%) and fewer revisions (30% vs 40%).",
    305       "evidence": "Log analysis of user interactions classified into four categories (Table 3, Section 7.4). Directive messages increased for 13/16 participants.",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "In the open-ended task, 100% of participants preferred AnimAgents for cross-stage coordination, 94% for enhancing overall creative process, and 94% for overall satisfaction.",
    310       "evidence": "One-sample Wilcoxon signed-rank test against neutral midpoint on 7-point preference Likert scale (Figure 10, Section 6.1.4). All 16 participants rated the open-ended task.",
    311       "supported": "moderate"
    312     },
    313     {
    314       "claim": "AnimAgents' value diverges by context: increasing efficiency in commercial studios but facing limitations in original works.",
    315       "evidence": "Field study with 4 participants (Section 8.2). P5 (commercial) valued speed; P24 (original) found narrative depth and visual quality insufficient. Qualitative evidence only from 4 participants.",
    316       "supported": "weak"
    317     }
    318   ],
    319   "methodology_tags": [
    320     "case-study",
    321     "qualitative"
    322   ],
    323   "key_findings": "AnimAgents, a human-multi-agent collaborative system for animation pre-production, significantly outperformed a single-agent baseline in a within-subjects study with 16 professional creators on measures of coordination, consistency, information management, and overall satisfaction (p < .01), while reducing task completion time by approximately 13%. Interaction log analysis showed users shifted toward more directive, high-level control and fewer corrective revisions with AnimAgents. A week-long field study with 4 creators revealed context-dependent value: the system increased efficiency in commercial projects but was insufficient for original animation requiring high narrative depth and visual rigor.",
    324   "red_flags": [
    325     {
    326       "flag": "Small sample size for statistical claims",
    327       "detail": "The summative study has N=16 and the field study has N=4. No power analysis justifies these sizes. With 16 participants, the Wilcoxon signed-rank test has limited power, and the many comparisons (17 questions) increase the risk of Type I errors with no correction for multiple comparisons mentioned."
    328     },
    329     {
    330       "flag": "Self-report only evaluation",
    331       "detail": "The evaluation relies entirely on participant self-reports via Likert scales and interviews. No external expert evaluation of output quality, no blind evaluation of creative artifacts, and no objective measures of coordination quality are included. The paper acknowledges this limitation in Section 9.5."
    332     },
    333     {
    334       "flag": "Recruitment from local network",
    335       "detail": "Participants were recruited through 'personal referrals in the local animation community' and direct studio outreach, all from Taiwan. This convenience sampling may introduce selection bias and limits generalizability to other cultural contexts and animation industries."
    336     },
    337     {
    338       "flag": "No multiple comparison correction",
    339       "detail": "The paper reports 17 separate statistical tests on Likert-scale items (Figure 9) without applying any correction for multiple comparisons (e.g., Bonferroni, Holm, FDR). This inflates the familywise error rate substantially."
    340     },
    341     {
    342       "flag": "Overlap between formative and summative participants",
    343       "detail": "5 of the 16 summative study participants (P8-P12) had previously participated in the formative study that informed the system's design, potentially biasing their evaluation toward the system since their feedback shaped it."
    344     }
    345   ],
    346   "cited_papers": [
    347     {
    348       "title": "MetaGPT: Meta programming for a multi-agent collaborative framework",
    349       "authors": ["Sirui Hong", "Mingchen Zhuge", "Jonathan Chen"],
    350       "year": 2024,
    351       "relevance": "Multi-agent LLM framework for collaborative software development, directly relevant to survey scope on agentic AI systems."
    352     },
    353     {
    354       "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    355       "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig"],
    356       "year": 2024,
    357       "relevance": "Agentic system for automated software engineering tasks, core paper in agentic AI for coding."
    358     },
    359     {
    360       "title": "MapCoder: Multi-Agent Code Generation for Competitive Problem Solving",
    361       "authors": ["Md. Ashraful Islam", "Mohammed Eunus Ali", "Md Rizwan Parvez"],
    362       "year": 2024,
    363       "relevance": "Multi-agent approach to code generation, relevant to survey scope on LLM-based multi-agent systems for programming."
    364     },
    365     {
    366       "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversations",
    367       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"],
    368       "year": 2024,
    369       "relevance": "Multi-agent conversation framework used as the backbone for AnimAgents, foundational work in agentic AI."
    370     },
    371     {
    372       "title": "CodeAgent: Enhancing Code Generation with Tool-Integrated Agent Systems for Real-World Repo-level Coding Challenges",
    373       "authors": ["Kechi Zhang", "Jia Li", "Ge Li"],
    374       "year": 2024,
    375       "doi": "10.18653/v1/2024.acl-long.737",
    376       "relevance": "Tool-integrated agent system for code generation, relevant to survey scope on agentic AI for software engineering."
    377     },
    378     {
    379       "title": "Agent Laboratory: Using LLM agents as research assistants",
    380       "authors": ["Samuel Schmidgall", "Yusheng Su", "Ze Wang"],
    381       "year": 2025,
    382       "relevance": "LLM multi-agent system for scientific research tasks, relevant to survey scope on agentic AI capabilities."
    383     },
    384     {
    385       "title": "A survey on large language model based autonomous agents",
    386       "authors": ["Lei Wang", "Chen Ma", "Xueyang Feng"],
    387       "year": 2024,
    388       "relevance": "Comprehensive survey of LLM-based autonomous agents, relevant to the survey's coverage of agentic AI systems."
    389     },
    390     {
    391       "title": "The rise and potential of large language model based agents: A survey",
    392       "authors": ["Zhiheng Xi", "Wenxiang Chen", "Xin Guo"],
    393       "year": 2025,
    394       "relevance": "Survey on LLM-based agents covering capabilities and limitations, relevant to the broader survey scope."
    395     },
    396     {
    397       "title": "Multi-agent collaboration mechanisms: A survey of LLMs",
    398       "authors": ["Khanh-Tung Tran", "Dung Dao", "Minh-Duong Nguyen"],
    399       "year": 2025,
    400       "relevance": "Survey of multi-agent collaboration mechanisms for LLMs, directly relevant to understanding agentic AI coordination."
    401     },
    402     {
    403       "title": "MAGIS: LLM-Based Multi-Agent Framework for GitHub Issue Resolution",
    404       "authors": ["Wei Tao", "Yucheng Zhou", "Yanlin Wang"],
    405       "year": 2024,
    406       "relevance": "Multi-agent framework for software engineering (GitHub issue resolution), relevant to agentic AI for coding."
    407     },
    408     {
    409       "title": "Generative AI at work",
    410       "authors": ["Erik Brynjolfsson", "Danielle Li", "Lindsey Raymond"],
    411       "year": 2025,
    412       "relevance": "Empirical study of generative AI productivity impacts in the workplace, relevant to the survey's coverage of AI productivity research."
    413     },
    414     {
    415       "title": "Why do multi-agent LLM systems fail?",
    416       "authors": ["Mert Cemri", "Melissa Z Pan", "Shuyi Yang"],
    417       "year": 2025,
    418       "relevance": "Analysis of failure modes in multi-agent LLM systems, relevant to understanding reliability and limitations of agentic AI."
    419     }
    420   ]
    421 }

Impressum · Datenschutz