ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27294B)


      1 {
      2   "paper": {
      3     "title": "An Empirical Study of Agent Developer Practices in AI Agent Frameworks",
      4     "authors": [
      5       "Yanlin Wang",
      6       "Xinyi Xu",
      7       "Jiachi Chen",
      8       "Tingting Bi",
      9       "Wenchao Gu",
     10       "Zibin Zheng"
     11     ],
     12     "year": 2025,
     13     "venue": "ACM (preprint, December 2025)",
     14     "arxiv_id": "2512.01939",
     15     "doi": "10.1145/nnnnnnn.nnnnnnn"
     16   },
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "No repository URL, Zenodo archive, or other code release link is provided anywhere in the paper. The paper performs automated analysis using GPT-4o and GPT-4 but does not share scripts or code."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The study collects 1,575 GitHub repositories and 20,620 developer discussions but does not provide a download link or public dataset. The raw discussion corpus and cleaned datasets are not released."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No requirements.txt, Dockerfile, conda environment, or dependency list is provided. The paper only mentions using GPT-4o and GPT-4 without specifying any library versions or environment details."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No README, reproduction script, or step-by-step guide is provided. The methodology is described in natural language in Section 3 but not in a form that would allow a researcher to replicate the data collection and analysis pipeline."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "All results are reported as point estimates (percentages like '25.6%', '14%', '25%', '23%'). No confidence intervals, error bars, or uncertainty estimates are provided for any finding."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper makes comparative claims across frameworks (e.g., AutoGen vs. LangChain vs. others on learning cost, development efficiency) but performs no statistical significance tests. All comparisons are based on frequency counts alone."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper reports proportions (e.g., '28.40% of discussions relate to development efficiency') but these describe category frequencies, not effect sizes for comparisons between frameworks. No Cohen's d, relative risk, or comparable measures are reported."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The sample sizes (1,575 repos, 8,710 discussions, 11,910 discussions) are described by the filtering criteria but are never justified relative to statistical power or the claims being made. There is no power analysis or discussion of whether the sample size is sufficient."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No variance, standard deviation, or spread measure is reported for any result. All findings are presented as single aggregate proportions with no measure of uncertainty across frameworks or discussion categories."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "The paper does not compare against prior empirical studies of agent frameworks or developer surveys. No baseline comparison study is included despite claiming to be 'the first' empirical study."
     72       },
     73       "baselines_contemporary": {
     74         "applies": false,
     75         "answer": false,
     76         "justification": "No baselines are included in this observational/mining study, so contemporariness of baselines cannot be assessed."
     77       },
     78       "ablation_study": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "This is an observational mining/qualitative study, not a system with components to ablate. No experimental interventions are performed."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The paper evaluates frameworks across five dimensions (learning cost, development efficiency, functional abstraction, performance optimization, maintainability) and uses multiple indicators for popularity (stars, forks, used-by counts, contributor counts, repo counts)."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Two domain experts independently reviewed all functional role annotations (Section 4.1.2) and a two-researcher open coding procedure was used in Section 6.1, both with Cohen's kappa reported (κ=0.82 and κ=0.81 respectively)."
     92       },
     93       "held_out_test_set": {
     94         "applies": false,
     95         "answer": false,
     96         "justification": "This is an observational study mining GitHub discussions; there is no predictive model and no train/test split. The concept of a held-out test set does not apply."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Results are broken down per framework for all five evaluation dimensions (Sections 6.2.1–6.2.5) and by challenge category (Logic, Tool, Performance, Version) across SDLC stages (Section 5.2)."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The entire RQ2 section (Section 5) is dedicated to analyzing failure cases and challenges developers face, with detailed examples such as infinite loops (Figure 7), memory management failures (Figure 8), and version incompatibilities (Figure 9)."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper explicitly reports negative findings including that performance optimization is 'a common shortcoming across all frameworks' (Finding 12) and that frameworks widely acclaimed (AutoGen, LangChain) face 'the highest maintenance complexity' (Finding 13)."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The abstract claims (10 frameworks identified, taxonomy of four challenge categories, five-dimensional evaluation framework) are all substantiated in the results sections. Specific percentages in the abstract (25.6% task termination, 14% tool issues) match findings reported in Section 5."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The paper makes causal-sounding claims such as 'LangChain and CrewAI lower the technical threshold for beginners' and 'AutoGen and LangChain excel at rapid prototyping' derived from observational discussion data. No controlled comparison is made; these are associations from developer complaints, not causal evidence."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The study covers only GitHub open-source projects and this limitation is acknowledged in Section 7.3, but the abstract and findings present conclusions about 'agent developers' generally ('more than 80% of developers report difficulties') without consistently scoping claims to the GitHub open-source context."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Section 7.3 (Threats to Validity) discusses alternative explanations including LLM-generated bias in classification, topic tags not reflecting actual usage, and GitHub sample bias excluding closed-source environments. These are specific and relevant to the study's findings."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper refers to 'GPT-4o' and 'GPT-4' without specifying API version strings or snapshot dates. Section 5.1.2 mentions 'GPT-4' and Section 4.1.2 mentions 'GPT-4o' but neither provides a model version identifier, which matters significantly as model behavior varies across versions."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Figure 5 provides the actual system and user prompts used for discussion summarization verbatim, including the full prompt text. This allows replication of the LLM-based summarization step."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Section 7.3 states 'we standardize the generation process by setting the temperature to 0.1 and using top-1 sampling.' Temperature is explicitly reported for the LLM calls used in the study."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "The paper does not use agentic scaffolding; it uses LLMs as classification tools in a pipeline but not as autonomous agents. No agentic scaffolding applies here."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 3 describes the full data collection pipeline with counts at each stage: 10,265 raw discussions → 8,710 after removing duplicates and invalid content; and the filtering criteria are stated (keyword inclusion, non-empty README, stars>10, forks>5, recency). Criteria are explicit."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 7.3 'Threats to Validity' provides a dedicated limitations discussion covering internal validity (LLM bias, topic tag accuracy) and external validity (GitHub-only sample, enterprise environments)."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "The threats are specific: LLM-generated bias from using GPT-4o for summarization, topic tags possibly not reflecting actual framework usage, and the exclusion of closed-source enterprise projects. These are study-specific, not generic disclaimers."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 7.3 explicitly states 'Our sample covers only GitHub open-source projects, excluding closed-source projects, which may limit the generalizability of our conclusions. In enterprise or closed-source environments, framework preferences... may differ.' This is a specific scope boundary."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The raw GitHub discussion corpus (8,710 and 11,910 discussions) is not made available for independent verification. No supplementary data files, dataset DOIs, or download links are provided."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 3 describes the data collection in detail: GitHub API search with compound keyword strategy, filtering criteria, time period (2022–July 2025), and metadata collected (stars, forks, used-by, contributors, topics, discussions)."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "This is a repository mining study with no human participants. GitHub projects and discussions are the data source, not recruited participants. The sampling criteria are fully described in Section 3."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Figure 1 shows the full pipeline from raw collection to analysis for both phases (RQ1/RQ2 and RQ3), with explicit counts at each step (10,265 raw → duplicates removed → invalid filtered → 8,710 cleaned). The pipeline steps and their outputs are all documented."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The Acknowledgements section discloses funding from the State Key Laboratory of Blockchain and Data Security at Zhejiang University, the National Natural Science Foundation of China (grants 62332004, 62276279), and the Guangdong Basic and Applied Basic Research Foundation (2024B1515020032)."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "All author affiliations are disclosed in the paper header: Sun Yat-sen University (3 authors), Zhejiang University (1), University of Melbourne (1), Technical University of Munich (1). No authors are affiliated with the frameworks being evaluated."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "The funders (NSFC, Guangdong research foundation, State Key Lab at Zhejiang University) are government and academic bodies with no financial stake in any of the ten frameworks evaluated. The funders are independent of the study outcome."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "There is no competing interests statement, patent disclosure, or declaration of financial interests anywhere in the paper. Absence of such a statement means this criterion is not satisfied."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "This is a mining/observational study of GitHub repositories and discussions. The paper uses GPT-4o/GPT-4 as classifiers/summarizers of developer text, not to evaluate pre-trained model capability on a benchmark. Contamination criteria do not apply."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "No benchmark evaluation of pre-trained model capabilities is performed. The LLMs used are tools for text classification and summarization, not the subject of evaluation. Contamination criteria do not apply."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No capability benchmark is used. The study evaluates developer discussions about agent frameworks, not pre-trained model performance on code or reasoning tasks."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "This is a repository mining study with no human participants. Mining GitHub discussions does not constitute a human subjects study; no pre-registration applies."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants are involved. The study analyzes publicly available GitHub data; IRB/ethics approval is not applicable."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants. The study examines GitHub repositories and discussion threads, not recruited participants."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants. Repository selection criteria are described in Section 3 but these are data filtering criteria, not participant eligibility criteria."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants and no experimental assignment. This is an observational study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants and not an experimental study. Blinding is not applicable."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants and no longitudinal tracking of individuals. Attrition is not applicable."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "The paper uses GPT-4o and GPT-4 to summarize and classify 20,620 discussion threads but reports no API costs, token counts, or cost per example. Given the scale (tens of thousands of LLM calls), this omission is notable."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No GPU hours, API spend, wall-clock time, or hardware specifications are provided for the LLM-based analysis pipeline. The compute budget for running GPT-4/GPT-4o on the full dataset is not quantified."
    288       }
    289     }
    290   },
    291   "claims": [
    292     {
    293       "claim": "96% of top-starred GitHub agent projects adopt two or more different agent frameworks, indicating that single-framework solutions can no longer meet the complex demands of real-world agent applications.",
    294       "evidence": "Analysis of top 25% starred projects (approximately 394 of 1,575 repos) for co-occurrence of framework tags and dependency file checks (Section 4.2.2, Finding 2, Figure 3).",
    295       "supported": "moderate"
    296     },
    297     {
    298       "claim": "More than one-third of developer-reported agent development failures stem from internal logic control deficiencies; task termination and message cooling issues account for 21.63% and 9.86% of failures respectively.",
    299       "evidence": "Analysis of 8,710 LLM-generated issue summaries from GitHub discussions, mapped to SDLC stages (Section 5.2, Finding 5). Exact figures cited from the taxonomy analysis.",
    300       "supported": "moderate"
    301     },
    302     {
    303       "claim": "Version compatibility conflicts directly cause over 25% of technical obstacles, occurring most frequently during deployment and maintenance stages.",
    304       "evidence": "Frequency analysis of 8,710 developer discussion summaries classified into challenge categories (Section 5.2, Finding 8). Examples include LangChain/Pydantic conflicts and AutoGen v0.2-to-v0.4 breaking changes.",
    305       "supported": "moderate"
    306     },
    307     {
    308       "claim": "AutoGen and LangChain excel at rapid prototyping, with over 78% of developers in discussions citing these frameworks for rapid prototype verification.",
    309       "evidence": "Analysis of developer discussions in Section 6.2.2 (Finding 10). The 78% figure is derived from frequency analysis of the 11,910 framework-specific discussions, but no statistical test or confidence interval is provided.",
    310       "supported": "weak"
    311     },
    312     {
    313       "claim": "Performance optimization is a common shortcoming across all ten evaluated frameworks.",
    314       "evidence": "Qualitative analysis of performance-related discussions per framework in Section 6.2.4 (Finding 12), with per-framework breakdown of specific performance issues listed in Figure 13.",
    315       "supported": "moderate"
    316     },
    317     {
    318       "claim": "Community popularity (GitHub stars) is not a reliable predictor of real-world framework adoption (e.g., MetaGPT has 48.7K stars but appears in only 2 repos; LangGraph has fewer stars but appears in 26 repos).",
    319       "evidence": "Table 2 and Figure 4 comparing star counts, fork counts, and repository adoption counts for all ten frameworks (Section 4.2.3, Finding 3).",
    320       "supported": "strong"
    321     }
    322   ],
    323   "methodology_tags": [
    324     "observational",
    325     "qualitative"
    326   ],
    327   "key_findings": "This is a large-scale mining and qualitative analysis of 1,575 GitHub agent projects and 20,620 developer discussions covering ten major LLM-based agent frameworks. The study finds that 96% of highly-starred projects combine multiple frameworks, that over one-third of failures stem from logic control deficiencies (especially task termination), and that version conflicts cause over 25% of technical obstacles. A comparative evaluation across five dimensions reveals that AutoGen and LangChain lead in prototyping and functional abstraction but face the highest maintenance complexity, while performance optimization is a universal weakness across all frameworks.",
    328   "red_flags": [
    329     {
    330       "flag": "No statistical testing for comparative claims",
    331       "detail": "The paper makes comparative claims across ten frameworks (e.g., 'AutoGen and LangChain excel', 'CrewAI lowers the technical threshold') based solely on frequency counts of developer discussions, with no statistical tests. All differences between frameworks could be attributable to sampling variation or the unequal number of discussions per framework (LangChain has 6,006 vs. Swarm's 391)."
    332     },
    333     {
    334       "flag": "Unequal sample sizes confound framework comparisons",
    335       "detail": "Framework discussion counts range from 224 (CrewAI) to 6,006 (LangChain). Comparing proportions of discussion topics across frameworks with such different sample sizes without adjustment (e.g., normalization or confidence intervals) risks misleading conclusions about which frameworks have worse or better properties."
    336     },
    337     {
    338       "flag": "LLM used as classifier without validation of accuracy",
    339       "detail": "GPT-4o is used to semantically classify 8,710+ developer discussions and generate summaries. While the paper reports inter-rater agreement for the human steps (κ=0.82 for the expert review of functional role annotations, κ=0.81 for the open coding), there is no systematic validation of the LLM classification accuracy or comparison against human-only coding for the automated summarization phase."
    340     },
    341     {
    342       "flag": "Causal language without causal design",
    343       "detail": "Findings use causal language ('LangChain and CrewAI lower the technical threshold', 'AutoGen and LangChain excel at rapid prototyping') from observational discussion data. These are associations between discussion content and framework choice, not evidence that the frameworks cause these outcomes."
    344     },
    345     {
    346       "flag": "Code and data not released",
    347       "detail": "The study claims to be a systematic empirical investigation but releases neither the collection scripts, the cleaned discussion corpus, nor the LLM-generated summaries. This makes independent verification of the key findings impossible."
    348     }
    349   ],
    350   "cited_papers": [
    351     {
    352       "title": "Why do multi-agent llm systems fail?",
    353       "authors": ["Mert Cemri", "Melissa Z Pan", "Shuyi Yang", "Lakshya A Agrawal"],
    354       "year": 2025,
    355       "arxiv_id": "2503.13657",
    356       "relevance": "Provides a failure taxonomy for multi-agent LLM systems with 14 failure modes, directly relevant to agentic AI methodology evaluation."
    357     },
    358     {
    359       "title": "Autogen: Enabling next-gen LLM applications via multi-agent conversations",
    360       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang", "Yiran Wu", "Beibin Li"],
    361       "year": 2024,
    362       "relevance": "Introduces the AutoGen multi-agent framework, one of the ten frameworks studied; directly relevant to agentic workflow evaluation."
    363     },
    364     {
    365       "title": "MetaGPT: Meta programming for a multi-agent collaborative framework",
    366       "authors": ["Sirui Hong", "Mingchen Zhuge", "Jonathan Chen"],
    367       "year": 2024,
    368       "relevance": "Presents MetaGPT framework for automated software project generation using multi-agent SOPs, relevant to agentic programming evaluation."
    369     },
    370     {
    371       "title": "Survey on evaluation of llm-based agents",
    372       "authors": ["Asaf Yehudai", "Lilach Eden", "Alan Li", "Guy Uziel", "Yilun Zhao"],
    373       "year": 2025,
    374       "arxiv_id": "2503.16416",
    375       "relevance": "Surveys evaluation strategies and gaps for LLM-based agents, highly relevant to methodology quality in agentic AI research."
    376     },
    377     {
    378       "title": "Agentless: Demystifying llm-based software engineering agents",
    379       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"],
    380       "year": 2024,
    381       "arxiv_id": "2407.01489",
    382       "relevance": "Proposes a simpler approach to LLM-based software engineering without complex agent scaffolding; relevant to evaluating agentic frameworks."
    383     },
    384     {
    385       "title": "AI agents that matter",
    386       "authors": ["Sayash Kapoor", "Benedikt Stroebl", "Zachary S Siegel", "Nitya Nadgir", "Arvind Narayanan"],
    387       "year": 2024,
    388       "arxiv_id": "2407.01502",
    389       "relevance": "Critically examines evaluation practices for AI agents and identifies methodological flaws; directly relevant to methodology quality survey."
    390     },
    391     {
    392       "title": "Agent-SafetyBench: Evaluating the Safety of LLM Agents",
    393       "authors": ["Zhexin Zhang", "Shiyao Cui", "Yida Lu", "Jingzhuo Zhou", "Junxiao Yang"],
    394       "year": 2025,
    395       "arxiv_id": "2412.14470",
    396       "relevance": "Benchmarks safety of LLM agents across multiple environments; relevant to evaluating agentic AI methodology and safety evaluation practices."
    397     },
    398     {
    399       "title": "An Empirical Study of Testing Practices in Open Source AI Agent Frameworks and Agentic Applications",
    400       "authors": ["Mohammed Mehedi Hasan", "Hao Li", "Emad Fallahzadeh", "Gopi Krishnan Rajbahadur", "Bram Adams", "Ahmed E. Hassan"],
    401       "year": 2025,
    402       "arxiv_id": "2509.19185",
    403       "relevance": "Complementary empirical study of testing practices in the same agent framework ecosystem; directly relevant to the survey scope."
    404     },
    405     {
    406       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    407       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran"],
    408       "year": 2023,
    409       "relevance": "Introduces the ReAct framework for LLM agent reasoning and acting; foundational to the agentic AI landscape surveyed."
    410     },
    411     {
    412       "title": "Agents in software engineering: Survey, landscape, and vision",
    413       "authors": ["Yanlin Wang", "Wanjun Zhong", "Yanxian Huang", "Ensheng Shi"],
    414       "year": 2025,
    415       "relevance": "Survey of LLM agent applications in software engineering from the same research group; relevant for citation chasing in the agentic programming domain."
    416     },
    417     {
    418       "title": "LLM-Based Multi-Agent Systems for Software Engineering: Literature Review, Vision, and the Road Ahead",
    419       "authors": ["Junda He", "Christoph Treude", "David Lo"],
    420       "year": 2025,
    421       "relevance": "Literature review on multi-agent LLM systems for software engineering; directly relevant to agentic AI programming methodology survey."
    422     },
    423     {
    424       "title": "The prompt report: a systematic survey of prompt engineering techniques",
    425       "authors": ["Sander Schulhoff", "Michael Ilie", "Nishant Balepur"],
    426       "year": 2024,
    427       "arxiv_id": "2406.06608",
    428       "relevance": "Systematic survey of prompt engineering, relevant to evaluating how agent frameworks handle prompting and LLM interaction."
    429     }
    430   ]
    431 }

Impressum · Datenschutz