scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24643B)
      1 {
      2   "paper": {
      3     "title": "Automated test generation to evaluate tool-augmented LLMs as conversational AI agents",
      4     "authors": [
      5       "Samuel Arcadinho",
      6       "David Aparício",
      7       "Mariana S. C. Almeida"
      8     ],
      9     "year": 2024,
     10     "venue": "arXiv",
     11     "arxiv_id": "2409.15934"
     12   },
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The paper links to a GitHub repository hosting ALMITA and all generated datasets: https://github.com/zendesk/almita-dataset (footnote 1, Section 1). This repository contains the dataset only; the paper does not explicitly state that the pipeline code itself is released, but the dataset artifacts are available."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "ALMITA and auto-ALMITA datasets are released at https://github.com/zendesk/almita-dataset, as stated in footnote 1: 'ALMITA, along with all other datasets generated using our pipeline and referenced in the paper, are available'."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment specifications, requirements.txt, Dockerfile, or dependency listings are provided in the paper. The paper does not describe the software environment used to run the pipeline."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "While all prompts are provided in the appendix, there are no step-by-step reproduction instructions, no README with commands to run, and no scripts to replicate the pipeline end-to-end."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Table 2 and Supplementary Table 1 report only point estimates (e.g., '92.7', '75.2') with no confidence intervals, error bars, or uncertainty measures."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper makes comparative claims across models (e.g., 'GPT models outperform the others') but provides no statistical significance tests — comparisons are based solely on raw percentage differences."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No effect sizes (Cohen's d, odds ratios, etc.) are reported. The paper reports raw percentages but does not contextualize the magnitude of differences between models."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The dataset sizes (1,420 tests for ALMITA, 2,696 for auto-ALMITA) are reported but not justified. No power analysis or justification for why these sample sizes are adequate for the claims made."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No variance, standard deviation, or spread measures are reported for any results. All numbers appear to be single-run evaluations with no indication of variability across runs."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper compares multiple LLMs (GPT-4o, GPT-4, Claude3-sonnet, Mistral-NeMo-Instruct, Llama3.1-8b-Instruct) against each other in Table 2, providing baseline comparisons across models."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The baselines include GPT-4o (2024), Claude 3 Sonnet (2024), Mistral-NeMo-Instruct, and Llama3.1 — all contemporary models at the time of writing."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Section 4.2 presents an ablation study comparing conversations generated directly from procedures vs. using intermediate graph representations, showing graph representations improve validity from ~68% to ~88%."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper uses 7 evaluation dimensions: reply recall, correct reply, API recall, correct API, correct API parameters, test correctness, and conversation correctness (Section 4.3, Table 2)."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Two annotators independently reviewed each datapoint in ALMITA for correctness (Section 4.1), and the ablation study in Section 4.2 uses manual curation by the same annotators for quality assessment."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "This is not a learning/training study — the paper evaluates LLMs on a fixed dataset without any tuning phase that would require train/test separation. The entire ALMITA dataset serves as the evaluation set."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Table 2 provides per-dimension breakdowns (reply recall, correct reply, API recall, correct API, correct parameters, test correct, conversation correct) rather than just a single aggregate metric."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 4.3 discusses failure patterns: GPT-4 tends to call APIs even when unnecessary (low reply recall), all models struggle with full conversation correctness, and the Limitations section (Section 6) discusses annotation limitations."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports that all models have very low conversation correctness (1.6%-15.6%), which is a significant negative finding. GPT-4's low reply recall (53.2%) is also reported as a negative result."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims that LLMs 'perform well in single interactions' but 'struggle to handle complete conversations' — this is supported by Table 2 showing high test correctness (73-89%) but very low conversation correctness (1.6-15.6%)."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The main causal claim is that intermediate graph representations improve conversation quality (Section 4.2). The ablation study provides controlled single-variable manipulation: same procedures, same annotators, only difference is pipeline with/without graphs. This is adequate for the claim."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper claims the method 'is general and capable of AI agents for different domains' (abstract) but all experiments are in the customer support domain only. The title and abstract claim generality beyond what was tested. Section 1 states 'the same method could be applied, with some changes, to other domains' which slightly hedges, but the abstract does not."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper does not discuss alternative explanations for the results. For example, low conversation correctness could be due to error propagation, strict metric definitions, or prompt design — none of these alternatives are explored. The Limitations section mentions that metrics 'may be too strict' but does not substantively discuss this as an alternative explanation for the observed performance gaps."
    130       }
    131     },
    132     "setup_transparency": {
    133       "model_versions_specified": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Table 2 footnote specifies exact versions: 'gpt-4-0613, gpt-4o-2024-05-13, anthropic.claude-3-sonnet-20240229-v1:0'. Open-source models are specified by name and size (Llama3.1-8b-Instruct, Mistral-NeMo-Instruct)."
    137       },
    138       "prompts_provided": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "All prompts used in the pipeline are provided in full in Appendix A (A.1 through A.8), including system prompts and user prompts for each pipeline stage and the AI agent evaluation prompt."
    142       },
    143       "hyperparameters_reported": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "No temperature, top-p, max tokens, or other sampling hyperparameters are reported for any of the LLM calls in the pipeline or the evaluation. The noise generator mentions a 20% probability but API calling hyperparameters are absent."
    147       },
    148       "scaffolding_described": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "The multi-stage pipeline is described in detail in Section 3, with each of its nine components (intent generator, procedure generator, API extractor, flowgraph generator, conversation graph generator, noise generator, path sampler, conversation generator, test extractor) thoroughly explained with diagrams (Figures 1-4) and algorithms (Algorithm 1)."
    152       },
    153       "data_preprocessing_documented": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section 4.1 documents the full data pipeline with counts at each stage: 84 intents → 168 procedures → 132 after filtering → 70 with valid APIs → 49 valid flowgraphs → 33 valid conversation graphs → 217 conversations → 192 after manual filtering → 1420 tests. Filtering criteria at each stage are described."
    157       }
    158     },
    159     "limitations_and_scope": {
    160       "limitations_section_present": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Section 6 is a dedicated 'Limitations' section with substantive discussion of multiple limitations."
    164       },
    165       "threats_to_validity_specific": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 6 discusses specific threats: using only GPT-4 as the generator may bias evaluation, small number of annotators, single prompt used across models, lack of quantitative diversity evaluation, and metrics that may be too strict. These are specific to this study."
    169       },
    170       "scope_boundaries_stated": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "While the paper mentions focus on customer support, it does not explicitly state what the results do NOT show. It does not bound its claims to specific model versions, specific types of customer support, or acknowledge that results may not transfer to other conversational domains despite claiming generality."
    174       }
    175     },
    176     "data_integrity": {
    177       "raw_data_available": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "The ALMITA and auto-ALMITA datasets are available at https://github.com/zendesk/almita-dataset, allowing independent verification of the generated tests and evaluation."
    181       },
    182       "data_collection_described": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section 4.1 describes the data collection procedure in detail: LLM-generated intents as seeds, multi-stage pipeline with filtering at each step, two-annotator review process. Each stage's criteria and filtering rules are described in Section 3."
    186       },
    187       "recruitment_methods_described": {
    188         "applies": false,
    189         "answer": false,
    190         "justification": "No human participants were recruited as subjects. The two annotators are co-authors/team members performing curation, not research subjects. The data source is synthetically generated, not a standard benchmark."
    191       },
    192       "data_pipeline_documented": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Table 1 provides complete pipeline statistics showing counts at each stage (generated, after automatic filters, after manual filters). Section 4.1 documents each transformation with specific numbers of items removed and reasons."
    196       }
    197     },
    198     "conflicts_of_interest": {
    199       "funding_disclosed": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "No funding disclosure, acknowledgments section, or grant information is provided in the paper."
    203       },
    204       "affiliations_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "All three authors are listed with @zendesk.com email addresses, clearly identifying their affiliation with Zendesk, a customer support platform company."
    208       },
    209       "funder_independent_of_outcome": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "All authors are Zendesk employees. Zendesk is a customer support platform that would benefit from demonstrating both the need for AI agent evaluation (justifying their tools) and the effectiveness of automated test generation. The funder (Zendesk, implicitly) has a stake in the outcome."
    213       },
    214       "financial_interests_declared": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No competing interests statement or financial interest declaration is provided. The authors work at Zendesk, which is directly in the customer support AI space, but this potential conflict is not explicitly acknowledged."
    218       }
    219     },
    220     "contamination": {
    221       "training_cutoff_stated": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "The paper evaluates multiple LLMs on the ALMITA benchmark but does not state training data cutoff dates for any of the models. While the benchmark is newly created, the paper does not discuss whether any of the test content could overlap with training data."
    225       },
    226       "train_test_overlap_discussed": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No discussion of potential train/test overlap. The benchmark is synthetically generated, which reduces contamination risk, but this advantage is not explicitly discussed. GPT-4 was used to generate the test data and is also evaluated — this circular dependency is not addressed as a contamination concern."
    230       },
    231       "benchmark_contamination_addressed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "ALMITA is a newly created benchmark not available before the models' training cutoffs, so pre-existing benchmark contamination does not apply. However, the fact that GPT-4 generated the data could be considered a form of contamination — this is partially addressed in Section 4.3 ('this may be biased by the use of GPT-4 for test generation') and Limitations."
    235       }
    236     },
    237     "human_studies": {
    238       "pre_registered": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants were studied. The two annotators performed dataset curation, not a human subjects study."
    242       },
    243       "irb_or_ethics_approval": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants were studied. Annotation was performed by team members as part of dataset construction."
    247       },
    248       "demographics_reported": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants were studied."
    252       },
    253       "inclusion_exclusion_criteria": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants were studied."
    257       },
    258       "randomization_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants were studied."
    262       },
    263       "blinding_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants were studied."
    267       },
    268       "attrition_reported": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants were studied."
    272       }
    273     },
    274     "cost_and_practicality": {
    275       "inference_cost_reported": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "No inference costs, API costs, or tokens consumed are reported. The pipeline calls GPT-4 multiple times across 8 stages for each test but provides no cost or latency figures."
    279       },
    280       "compute_budget_stated": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No total computational budget, API spend, or hardware specifications are reported. The pipeline involves extensive LLM calls for generation and evaluation but the cost is not quantified."
    284       }
    285     }
    286   },
    287   "claims": [
    288     {
    289       "claim": "Intermediate graph representations improve the validity of generated conversations from ~68% to ~88%.",
    290       "evidence": "Section 4.2 ablation study: simplified pipeline without graphs yields 34/50 (68%) valid conversations; original pipeline with graphs yields 192/217 (88%) valid conversations, both manually curated by the same annotators.",
    291       "supported": "moderate"
    292     },
    293     {
    294       "claim": "Tool-augmented LLMs perform well in single interactions (API calling) but struggle with complete conversations.",
    295       "evidence": "Table 2: API correct exceeds 90% for most models, but conversation correctness ranges from 1.6% (Llama3.1-8b-I) to 15.6% (GPT-4o w/F).",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "GPT-4o achieves the highest test correctness (88.9%) among evaluated models on ALMITA.",
    300       "evidence": "Table 2 shows GPT-4o at 88.9% test correct, highest among the 5 models tested (next highest: Mistral-NeMo-I at 84.7%).",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "The fully automated pipeline (auto-ALMITA) produces rankings highly correlated with the manually curated dataset (ALMITA), with correlation of 0.98.",
    305       "evidence": "Section 4.4 and Figure 5: both datasets rank LLMs in the same order with correlation value 0.98 on the test correct metric.",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "The proposed method is general and capable of evaluating AI agents for different domains beyond customer support.",
    310       "evidence": "No experiments outside customer support are conducted. This claim (from abstract and Section 1) is purely assertive with no supporting evidence.",
    311       "supported": "unsupported"
    312     }
    313   ],
    314   "methodology_tags": [
    315     "benchmark-eval"
    316   ],
    317   "key_findings": "The paper presents an automated pipeline for generating evaluation datasets for tool-augmented conversational AI agents, using intermediate graph structures to improve test quality (88% validity vs 68% without graphs). Evaluation of 5 LLMs on the ALMITA benchmark reveals that while models achieve high single-interaction accuracy (>85% for API calls), full conversation correctness is very low (1.6-15.6%), suggesting current LLMs are inadequate as fully autonomous customer support agents. The automated pipeline produces model rankings highly correlated (0.98) with manually curated datasets, suggesting it can replace expensive human curation.",
    318   "red_flags": [
    319     {
    320       "flag": "Generator-evaluator circularity",
    321       "detail": "GPT-4 is used to generate the ALMITA test dataset and is also one of the models being evaluated. The paper acknowledges this may bias correct reply scores toward GPT models (Section 4.3) but does not quantify or mitigate this bias."
    322     },
    323     {
    324       "flag": "No uncertainty quantification",
    325       "detail": "All results are single-run point estimates with no confidence intervals, error bars, or variance across runs. LLM outputs are stochastic, so single-run results may not be stable."
    326     },
    327     {
    328       "flag": "Company evaluating its own domain",
    329       "detail": "All authors are Zendesk employees (a customer support platform). The paper proposes automated evaluation methods for customer support AI agents — a direct product interest. No conflict of interest disclosure is provided."
    330     },
    331     {
    332       "flag": "Small ablation sample",
    333       "detail": "The ablation study comparing graph vs. no-graph pipelines uses only 50 conversations for the no-graph condition, which may be insufficient to draw robust conclusions about the difference (68% vs 88%)."
    334     },
    335     {
    336       "flag": "Missing hyperparameters",
    337       "detail": "Temperature, top-p, and other sampling parameters are not reported for any LLM calls, despite the pipeline involving 8+ LLM-dependent stages where these settings could substantially affect output quality."
    338     }
    339   ],
    340   "cited_papers": [
    341     {
    342       "title": "Autogen: Enabling next-gen LLM applications via multi-agent conversation framework",
    343       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"],
    344       "year": 2023,
    345       "arxiv_id": "2308.08155",
    346       "relevance": "Foundational multi-agent LLM framework relevant to survey's coverage of agentic AI architectures."
    347     },
    348     {
    349       "title": "AgentBench: Evaluating LLMs as agents",
    350       "authors": ["Xiao Liu", "Hao Yu", "Hanchen Zhang"],
    351       "year": 2023,
    352       "arxiv_id": "2308.03688",
    353       "relevance": "Key benchmark for evaluating LLMs as agents with multi-step interactions, directly relevant to the survey's evaluation methodology scope."
    354     },
    355     {
    356       "title": "AgentTuning: Enabling generalized agent abilities for LLMs",
    357       "authors": ["Aohan Zeng", "Mingdao Liu", "Rui Lu"],
    358       "year": 2023,
    359       "arxiv_id": "2310.12823",
    360       "relevance": "Proposes tuning LLMs for agent capabilities using compiled agent datasets, relevant to survey's coverage of LLM agent capability enhancement."
    361     },
    362     {
    363       "title": "ToolLLM: Facilitating large language models to master 16000+ real-world APIs",
    364       "authors": ["Yujia Qin", "Shihao Liang", "Yining Ye"],
    365       "year": 2023,
    366       "arxiv_id": "2307.16789",
    367       "relevance": "Major work on tool-augmented LLMs and API calling capability, directly relevant to survey's scope on LLM tool use."
    368     },
    369     {
    370       "title": "Gorilla: Large language model connected with massive APIs",
    371       "authors": ["Shishir G Patil", "Tianjun Zhang", "Xin Wang"],
    372       "year": 2023,
    373       "arxiv_id": "2305.15334",
    374       "relevance": "Early influential work on LLMs connected to APIs, relevant to survey's coverage of tool-augmented models."
    375     },
    376     {
    377       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    378       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang"],
    379       "year": 2024,
    380       "relevance": "Addresses rigorous LLM evaluation methodology for code generation, relevant to survey's methodological quality assessment focus."
    381     },
    382     {
    383       "title": "Exploring and evaluating hallucinations in LLM-powered code generation",
    384       "authors": ["Fang Liu", "Yang Liu", "Lin Shi"],
    385       "year": 2024,
    386       "arxiv_id": "2404.00971",
    387       "relevance": "Studies LLM hallucination in code generation, relevant to survey's coverage of LLM reliability and evaluation."
    388     },
    389     {
    390       "title": "APIGen: Automated pipeline for generating verifiable and diverse function-calling datasets",
    391       "authors": ["Zuxin Liu", "Thai Hoang", "Jianguo Zhang"],
    392       "year": 2024,
    393       "arxiv_id": "2406.18518",
    394       "relevance": "Automated dataset generation for function-calling evaluation, directly comparable methodology to the paper under review."
    395     },
    396     {
    397       "title": "AgentInstruct: Toward generative teaching with agentic flows",
    398       "authors": ["Arindam Mitra", "Luciano Del Corro", "Guoqing Zheng"],
    399       "year": 2024,
    400       "relevance": "Framework for generating synthetic agent training data, relevant to survey's coverage of agentic AI data generation and evaluation."
    401     },
    402     {
    403       "title": "GAIA: A benchmark for general AI assistants",
    404       "authors": ["Grégoire Mialon", "Clémentine Fourrier", "Craig Swift"],
    405       "year": 2023,
    406       "arxiv_id": "2311.12983",
    407       "relevance": "Benchmark for evaluating general AI assistant capabilities with human-annotated questions, relevant to survey's evaluation methodology coverage."
    408     },
    409     {
    410       "title": "API-Blend: A comprehensive corpora for training and benchmarking API LLMs",
    411       "authors": ["Kinjal Basu", "Ibrahim Abdelaziz"],
    412       "year": 2024,
    413       "arxiv_id": "2402.15491",
    414       "relevance": "Combines multiple datasets for API-calling LLM evaluation, relevant to survey's coverage of tool-use benchmarks."
    415     }
    416   ]
    417 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs