scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (18497B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "EComStage: Stage-wise and Orientation-specific Benchmarking for Large Language Models in E-commerce",
      6     "authors": [
      7       "Kaiyan Zhao",
      8       "Zijie Meng",
      9       "Zheyong Xie",
     10       "Jinhao Duan",
     11       "Yao Hu"
     12     ],
     13     "year": 2026,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2601.02752",
     16     "doi": "10.48550/arXiv.2601.02752"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All major abstract claims (seven tasks, three reasoning stages, 30+ LLMs evaluated, human annotation, stage-specific insights) are substantiated in the body via Table 2, Table 3, and Section 4.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper repeatedly attributes performance to specific causes ('likely due to its optimization for complex reasoning', 'benefits from more recent instruction tuning') without ablation studies or controlled experiments; the design is purely observational.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Claims are bounded to the 33 evaluated models on these seven e-commerce tasks; the Limitations section explicitly acknowledges finite scenario coverage and notes the benchmark does not encompass all e-commerce domains.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No alternative explanations are considered for observed performance patterns; possibilities such as Qwen3-based filtering creating artifacts favorable to Qwen models, translation quality variance, or cosine similarity metric biases are not discussed.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Open-ended tasks use cosine similarity with Qwen3-Embedding-8B as the metric but the paper does not discuss the gap between embedding similarity and actual response quality, task completion, or customer satisfaction.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "A dedicated 'Limitations' section appears before the references, identifying two specific limitations: lack of cross-stage error propagation and finite scenario coverage.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Both stated threats are specific: stage tasks don't capture error propagation across stages, and the seven tasks don't encompass all e-commerce domains; these go beyond generic boilerplate.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The Limitations section explicitly states the benchmark 'does not capture error propagation across stages' and 'does not fully encompass all e-commerce domains, leaving room for future expansion.'",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment appears anywhere in the paper despite three authors being Xiaohongshu Inc. employees who used the company's proprietary operational data to construct the benchmark.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations (University of Tokyo, Zhejiang University, Xiaohongshu Inc.) are disclosed on the title page with institutional emails including caoshaosheng@xiaohongshu.com.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Three authors are employees of Xiaohongshu Inc. (a major e-commerce platform) and the benchmark is built entirely from Xiaohongshu's proprietary operational data; the institution providing resources is not independent of the domain benchmarked.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement, patent disclosure, or financial interest declaration appears in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The three stages (Perception, Planning, Action) are defined operationally via Figure 1 and task descriptions; customer-oriented vs. merchant-oriented orientations are explained with concrete scenario examples.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three explicit contributions are listed: the stage-wise evaluation framework, seven human-annotated tasks covering both orientations, and actionable model-level insights; the contribution type (benchmark + evaluation) is clearly stated.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 discusses τ-bench, ECom-Bench, Mix-Ecom, and EcomScriptBench in relation to EComStage; Table 1 explicitly compares scale, model coverage, orientation support, and stage-wise evaluation against prior benchmarks.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "benchmark-creation": {
    120       "construct_design": {
    121         "construct_validity_argued": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The paper argues that intermediate reasoning stages are critical to real-world agent performance and that existing benchmarks miss them; each task is mapped to a stage with a rationale grounded in the agent decision-making workflow.",
    125           "source": "haiku"
    126         },
    127         "difficulty_distribution_characterized": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "No difficulty tiers (easy/medium/hard) are defined or measured; the paper does not characterize or analyze the distribution of difficulty across benchmark items.",
    131           "source": "haiku"
    132         },
    133         "ceiling_floor_effects_checked": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Query Match scores cluster at 89-99% across all 33 models including small 3B models, indicating a strong ceiling effect, yet the paper neither identifies nor discusses this.",
    137           "source": "haiku"
    138         },
    139         "human_baseline_included": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No human performance baseline is reported; annotators wrote ground-truth answers but their own accuracy on the benchmark tasks is never measured or compared to model performance.",
    143           "source": "haiku"
    144         },
    145         "scoring_rubric_justified": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "Cosine similarity via Qwen3-Embedding-8B for open-ended tasks is stated without justification for why this metric (vs. LLM-as-judge, ROUGE, or human evaluation) is adequate for measuring response quality.",
    149           "source": "haiku"
    150         }
    151       },
    152       "robustness": {
    153         "contamination_resistance_designed": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No contamination resistance measures (temporal splits, canary strings, dynamic generation) are mentioned; the benchmark is a static translated dataset with no anti-gaming provisions.",
    157           "source": "haiku"
    158         },
    159         "temporal_robustness_discussed": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "No discussion of how the benchmark will remain discriminating as models improve beyond current levels, nor any plan for expansion or versioning.",
    163           "source": "haiku"
    164         },
    165         "failure_modes_discussed": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The Limitations section mentions one failure mode (no cross-stage error propagation) but does not discuss benchmark-specific failure modes such as cosine similarity gaming, translation artifacts, or Qwen3-filtered data potentially favoring Qwen models.",
    169           "source": "haiku"
    170         },
    171         "baseline_implementations_provided": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "The paper links to https://github.com/KYuuto1006/EComStage for code and data, enabling reproduction of reported baseline results.",
    175           "source": "haiku"
    176         }
    177       },
    178       "documentation": {
    179         "dataset_documentation_complete": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "Table 2 gives task statistics and Figures 4-15 show construction prompts and annotation guidelines, but there is no formal data card, no inter-annotator agreement statistics, and no annotator count or qualification details beyond 'employees with e-commerce experience.'",
    183           "source": "haiku"
    184         },
    185         "licensing_and_access_clear": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "A GitHub repository is linked but no licensing terms are stated in the paper; it is unclear under what terms others may use or redistribute the benchmark, especially given data originates from a proprietary commercial platform.",
    189           "source": "haiku"
    190         },
    191         "intended_use_specified": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "The paper explains intended use (evaluating LLM-based e-commerce agents) but does not specify what should NOT be concluded from results, nor does it describe the limits of valid use cases.",
    195           "source": "haiku"
    196         }
    197       }
    198     }
    199   },
    200   "claims": [
    201     {
    202       "claim": "No single model consistently excels across all stages (Perception, Planning, Action) or orientations (customer, merchant)",
    203       "evidence": "Table 3 shows model rankings vary substantially by task; Figure 3 shows top-5 models swap positions across stage and orientation dimensions",
    204       "supported": "strong"
    205     },
    206     {
    207       "claim": "EComStage provides more comprehensive intermediate-stage evaluation than existing benchmarks",
    208       "evidence": "Table 1 shows ECom-Bench and Mix-Ecom lack stage-wise evaluation; EComStage adds this dimension with seven structured tasks",
    209       "supported": "moderate"
    210     },
    211     {
    212       "claim": "Claude Sonnet 4 achieves best overall average (84.21) among all evaluated models",
    213       "evidence": "Directly supported by Table 3 results across 33 models",
    214       "supported": "strong"
    215     },
    216     {
    217       "claim": "Qwen3-4B-Instruct (82.26) outperforms much larger models due to recent instruction tuning and alignment optimization",
    218       "evidence": "Performance gap shown in Table 3; causal attribution to instruction tuning is post-hoc speculation with no ablation",
    219       "supported": "weak"
    220     },
    221     {
    222       "claim": "All benchmark samples are human-annotated and quality-checked by professional annotators with e-commerce experience",
    223       "evidence": "Section 3.1.2 and Appendix A.3 describe annotation process; no inter-annotator agreement metric is reported",
    224       "supported": "moderate"
    225     },
    226     {
    227       "claim": "EComStage uniquely covers merchant-oriented scenarios absent from prior e-commerce benchmarks",
    228       "evidence": "Table 1 confirms ECom-Bench and Mix-Ecom are customer-oriented only; Table 2 shows Attitude Classification and Scenario Route as merchant tasks",
    229       "supported": "strong"
    230     }
    231   ],
    232   "methodology_tags": [
    233     "benchmark-eval"
    234   ],
    235   "key_findings": "EComStage introduces a 4,804-sample benchmark spanning seven tasks across three reasoning stages (Perception, Planning, Action) for both customer and merchant e-commerce scenarios, with all samples human-annotated and translated from Chinese operational data. Evaluation of 33 models from 1B to 235B parameters shows no single model dominates all stages and orientations: top closed-source APIs score 81-85% overall but exhibit distinct weaknesses on merchant tasks. Query Match shows near-ceiling performance (89-99%) across nearly all models while Solution Decision (15-94%) and Scenario Route show higher variance, revealing that the benchmark is unevenly discriminating across its seven tasks.",
    236   "red_flags": [
    237     {
    238       "flag": "No human baseline",
    239       "detail": "Human annotators wrote ground-truth answers but no human performance is measured on the tasks, making it impossible to assess task difficulty or the gap between model and human capability."
    240     },
    241     {
    242       "flag": "Qwen3 filter circularity",
    243       "detail": "Qwen3-235B-A22B is used for answer-consistency filtering during dataset construction, and multiple Qwen3 model variants are also the subjects being evaluated; data filtered by Qwen3 judgment may systematically favor Qwen3 model outputs."
    244     },
    245     {
    246       "flag": "Ceiling effects in Query Match",
    247       "detail": "Query Match scores cluster at 89-99% across all 33 models including tiny 3B models (Qwen2.5-3B: 95%), indicating near-ceiling performance with very low discriminability for this sub-task."
    248     },
    249     {
    250       "flag": "Planning stage critically undersampled",
    251       "detail": "Scenario Route is the sole Planning task with only 164 samples—the smallest sub-task by far—making 'Planning stage' conclusions statistically fragile."
    252     },
    253     {
    254       "flag": "Cosine similarity metric unjustified",
    255       "detail": "Open-ended tasks (Query Rewrite, RAG-QA) are scored via cosine similarity with Qwen3-Embedding-8B without validating that this metric correlates with human judgments of response quality or task success."
    256     },
    257     {
    258       "flag": "Undisclosed Xiaohongshu conflict",
    259       "detail": "Three authors are Xiaohongshu Inc. employees and the dataset is built from Xiaohongshu's proprietary operational data; no conflict-of-interest disclosure or acknowledgment of potential institutional bias appears."
    260     },
    261     {
    262       "flag": "No inter-annotator agreement reported",
    263       "detail": "Professional annotators provided ground-truth labels but no inter-annotator agreement statistics are reported, leaving annotation reliability unverifiable."
    264     },
    265     {
    266       "flag": "Single run, no variance",
    267       "detail": "Section 4.1.3 explicitly states all experiments use a single run; no variance, confidence intervals, or statistical significance tests are reported for any comparison."
    268     },
    269     {
    270       "flag": "Translation quality unvalidated",
    271       "detail": "All data was machine-translated from Chinese to English using an LLM; translation quality is unvalidated and potential translation artifacts in benchmark results are not discussed."
    272     }
    273   ],
    274   "cited_papers": [
    275     {
    276       "title": "τ-bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains",
    277       "relevance": "Key prior e-commerce agent benchmark EComStage is positioned against; covers retail and airline customer service scenarios"
    278     },
    279     {
    280       "title": "ECom-Bench: Can LLM Agent Resolve Real-World E-commerce Customer Support Issues?",
    281       "relevance": "Direct predecessor benchmark for e-commerce LLM evaluation, compared explicitly in Table 1"
    282     },
    283     {
    284       "title": "Mix-Ecom: Towards Mixed-Type E-commerce Dialogues with Complex Domain Rules",
    285       "relevance": "Competitor benchmark for mixed-type e-commerce dialogues covering multi-intent conversations, compared in Table 1"
    286     },
    287     {
    288       "title": "EcomScriptBench: A Multi-task Benchmark for E-commerce Script Planning via Step-wise Intention-Driven Product Association",
    289       "relevance": "Related step-wise e-commerce evaluation benchmark that EComStage extends in scope"
    290     },
    291     {
    292       "title": "AgentBench: Evaluating LLMs as Agents",
    293       "relevance": "General LLM agent benchmark providing methodological context for multi-task agent evaluation"
    294     },
    295     {
    296       "title": "WebShop: Towards Scalable Real-World Web Interaction with Grounded Language Agents",
    297       "relevance": "Foundational work on web-based e-commerce agent interaction and evaluation"
    298     },
    299     {
    300       "title": "Qwen3 Technical Report",
    301       "relevance": "Open-source model family prominently evaluated in EComStage and also used for dataset construction filtering—relevant to potential circularity concern"
    302     }
    303   ],
    304   "engagement_factors": {
    305     "practical_relevance": {
    306       "score": 2,
    307       "justification": "E-commerce practitioners can directly use EComStage to select and tune LLM-based agents; the stage-wise and orientation breakdown offers actionable deployment guidance."
    308     },
    309     "surprise_contrarian": {
    310       "score": 1,
    311       "justification": "The finding that 4B instruction-tuned models outperform much larger models on several tasks is mildly surprising, but the overall 'no single best model' conclusion is expected."
    312     },
    313     "fear_safety": {
    314       "score": 0,
    315       "justification": "No safety or AI risk concerns are raised; the paper is focused on commercial service performance benchmarking."
    316     },
    317     "drama_conflict": {
    318       "score": 1,
    319       "justification": "Implicit institutional competition (Xiaohongshu-derived benchmark evaluating competitor models from OpenAI, Google, Anthropic) but not framed as conflict in the paper."
    320     },
    321     "demo_ability": {
    322       "score": 2,
    323       "justification": "Code and data are available on GitHub; practitioners can run evaluations on the seven tasks with any LLM."
    324     },
    325     "brand_recognition": {
    326       "score": 1,
    327       "justification": "Xiaohongshu (RedNote) has growing international recognition and Zhejiang University is prominent in NLP, but the paper does not come from a top-tier AI research lab."
    328     }
    329   },
    330   "hn_data": {
    331     "threads": [],
    332     "top_points": 0,
    333     "total_points": 0,
    334     "total_comments": 0
    335   }
    336 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs