scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (18869B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "EComStage: Stage-wise and Orientation-specific Benchmarking for Large Language Models in E-commerce",
      6     "authors": [
      7       "Kaiyan Zhao",
      8       "Zijie Meng",
      9       "Zheyong Xie",
     10       "Jin Duan",
     11       "Yao Hu"
     12     ],
     13     "year": 2026,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2601.02752",
     16     "doi": "10.48550/arXiv.2601.02752"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims stage/orientation-specific strengths and weaknesses, which is supported by Table 3 and Figure 3. The claim that 'no single model consistently excels across all stages' is directly supported by the per-task results.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper makes multiple unsupported causal claims: 'likely due to its optimization for complex reasoning and tool use' (Section 4.2.1), 'likely benefiting from its large model capacity' (Section 4.2.4), 'likely due to its training on fine-grained dialogue understanding' (Section 4.2.4). These are speculative explanations without causal study design.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title claims to benchmark LLMs 'in E-commerce' generally, but the data comes from a single platform (Xiaohongshu, as indicated by author affiliations). The paper does not acknowledge that results from one e-commerce platform's data may not generalize to other platforms, markets, or cultural contexts.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper attributes performance differences to model-specific factors (training data, instruction tuning) without considering alternative explanations such as prompt sensitivity, language bias from translation, or task-specific confounds.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper measures accuracy on simplified tasks (e.g., classification, matching) and frames this as evaluating 'Perception, Planning, and Action' capabilities of e-commerce agents. The gap between controlled benchmark performance and real-world agent capability in live e-commerce settings is not discussed.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "A dedicated 'Limitations' section appears after the Conclusion, discussing the lack of error propagation evaluation and limited coverage of e-commerce domains.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The limitations mention specific threats: the benchmark 'does not capture error propagation across stages, which may occur in real-world deployments' and covers 'representative but finite e-commerce scenarios' that 'do not fully encompass all e-commerce domains.'",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The Limitations section explicitly states what the benchmark does not cover: cross-stage error propagation and domains beyond the seven covered tasks, noting room for future expansion.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding source, acknowledgments section, or grant information is disclosed anywhere in the paper.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly listed: The University of Tokyo, Zhejiang University, and Xiaohongshu Inc. The corresponding author's email is at xiaohongshu.com.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Three authors are from Xiaohongshu Inc., which is the apparent source of the benchmark data. The company has an interest in its platform's data being seen as a valuable source for AI evaluation, though the authors are not evaluating a model of their own.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or financial interest declarations are present in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Perception, Planning, and Action are operationally defined with examples in Section 1 and Figure 1; 'agent-capable LLMs' is used descriptively without a formal definition but in a way that is sufficiently clear from context.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The contribution is explicitly itemized in three bullet points: the stage-wise framework, seven annotated tasks covering both orientations, and model evaluation insights.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 surveys related e-commerce datasets and benchmarks, and Table 1 provides a direct feature comparison with τ-bench, ECom-Bench, and Mix-Ecom showing where EComStage extends prior work.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "benchmark-creation": {
    120       "construct_design": {
    121         "construct_validity_argued": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "The mapping of seven tasks to three cognitive stages is asserted by design (tasks are named after stages) but never empirically validated; there is no argument or evidence that accuracy on Query Rewrite, for instance, is a valid proxy for 'Perception ability.'",
    125           "source": "haiku"
    126         },
    127         "difficulty_distribution_characterized": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "No difficulty tiers or difficulty distribution analysis is provided; item counts vary widely (Query Match: 1927, Scenario Route: 164) but this is not analyzed in terms of difficulty.",
    131           "source": "haiku"
    132         },
    133         "ceiling_floor_effects_checked": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Query Match shows near-ceiling accuracy for nearly all evaluated models (89–99%), which is a clear ceiling effect that the paper does not acknowledge or discuss.",
    137           "source": "haiku"
    138         },
    139         "human_baseline_included": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Although samples are human-annotated, no human performance baseline is reported; it is impossible to judge whether any model result represents 'good' performance relative to human capability.",
    143           "source": "haiku"
    144         },
    145         "scoring_rubric_justified": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "Accuracy for close-ended tasks is appropriate but unargued; cosine similarity via Qwen3-Embedding-8B for open-ended tasks is adopted without validating its correlation with human judgment or comparing to alternative metrics like ROUGE or human evaluation.",
    149           "source": "haiku"
    150         }
    151       },
    152       "robustness": {
    153         "contamination_resistance_designed": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The paper does not discuss contamination resistance; while proprietary Xiaohongshu data provides incidental protection, no explicit anti-gaming measures (temporal splits, canary strings, or dynamic generation) are mentioned.",
    157           "source": "haiku"
    158         },
    159         "temporal_robustness_discussed": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "There is no discussion of how the benchmark will remain relevant as e-commerce scenarios, platform policies, or LLM capabilities evolve, nor any update plan.",
    163           "source": "haiku"
    164         },
    165         "failure_modes_discussed": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "The Limitations section explicitly identifies that stage-isolated evaluation fails to capture cross-stage error propagation and that scenario coverage is finite — these are genuine failure modes of the benchmark itself.",
    169           "source": "haiku"
    170         },
    171         "baseline_implementations_provided": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "A GitHub repository is linked in footnote 1, and full implementation details (batch size, temperature, top-p, repetition penalty, max tokens, GPU configuration) are specified in Section 4.1.3.",
    175           "source": "haiku"
    176         }
    177       },
    178       "documentation": {
    179         "dataset_documentation_complete": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "Table 2 provides item counts by task and orientation, and the construction pipeline is described, but there is no formal data card, no inter-annotator agreement statistics, no description of annotator demographics or qualification criteria beyond 'employees with e-commerce experience.'",
    183           "source": "haiku"
    184         },
    185         "licensing_and_access_clear": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "A GitHub link is provided but no explicit license for the dataset or code is stated in the paper; terms of use for the underlying Xiaohongshu operational data are not discussed.",
    189           "source": "haiku"
    190         },
    191         "intended_use_specified": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "The intended use (evaluating agent-capable LLMs in e-commerce) is stated, but the paper does not specify what should NOT be concluded — e.g., that benchmark scores do not predict real-world deployment performance or that results may not transfer outside Chinese e-commerce contexts.",
    195           "source": "haiku"
    196         }
    197       }
    198     }
    199   },
    200   "claims": [
    201     {
    202       "claim": "No single model consistently excels across all stages (Perception, Planning, Action) or orientations (customer, merchant).",
    203       "evidence": "Table 3 and Figure 3 show clear variation: e.g., Qwen2.5-72B leads on Planning but not Action; Claude Sonnet 4 leads on merchant tasks but not Solution Decision.",
    204       "supported": "strong"
    205     },
    206     {
    207       "claim": "Claude Sonnet 4 achieves the highest average score among closed-source APIs (84.21).",
    208       "evidence": "Table 3 directly shows this result across all seven tasks.",
    209       "supported": "strong"
    210     },
    211     {
    212       "claim": "Existing e-commerce benchmarks overlook intermediate reasoning stages and focus only on final task success.",
    213       "evidence": "Table 1 compares EComStage against τ-bench, ECom-Bench, and Mix-Ecom, showing that none of them includes stage-wise evaluation; the claim is supported but the comparison set is limited to three benchmarks.",
    214       "supported": "moderate"
    215     },
    216     {
    217       "claim": "Stage-wise and orientation-wise evaluation provides more actionable insights than end-to-end evaluation.",
    218       "evidence": "The paper demonstrates that aggregate scores mask stage-specific weaknesses (e.g., GLM4-9B scores 73.16 overall but only 38.44 on Attitude Classification), but no direct comparison with an end-to-end evaluation condition is conducted.",
    219       "supported": "moderate"
    220     },
    221     {
    222       "claim": "LLM performance on merchant-oriented tasks varies more than on customer-oriented tasks.",
    223       "evidence": "Figure 3 shows wider spread among models on merchant vs. customer orientation; GPT-4o notably underperforms on merchant tasks.",
    224       "supported": "moderate"
    225     }
    226   ],
    227   "methodology_tags": [
    228     "benchmark-eval"
    229   ],
    230   "key_findings": "EComStage provides 4,804 human-annotated samples across 7 tasks spanning Perception, Planning, and Action stages for both customer- and merchant-oriented e-commerce scenarios. Evaluation of 33 LLMs reveals that no single model dominates across all stages or orientations: most models perform well on classification-style Perception tasks but diverge significantly on Planning and Action. Closed-source models (Claude Sonnet 4, Gemini 2.5-Pro) lead overall, while Qwen3-235B-A22B-Instruct achieves the best result among open-source models (85.61). Stage-wise evaluation surfaces weaknesses hidden by aggregate scores, such as GLM4-9B's severe underperformance on Attitude Classification (38.44), which its overall score (73.16) does not reveal.",
    231   "red_flags": [
    232     {
    233       "flag": "Single-platform data generalization",
    234       "detail": "All benchmark data derives from Xiaohongshu (Little Red Book), a single Chinese e-commerce platform, yet conclusions are framed as broadly applicable to 'real-world e-commerce' without bounding this to the source platform or culture."
    235     },
    236     {
    237       "flag": "No human baseline",
    238       "detail": "Despite human annotation, no human performance is reported on any task. Without this, it is impossible to assess whether model scores represent human-level performance, superhuman performance, or a trivially easy benchmark."
    239     },
    240     {
    241       "flag": "Ceiling effect unacknowledged",
    242       "detail": "Query Match shows 89–99% accuracy across nearly all 33 evaluated models, indicating a ceiling effect that makes this task non-discriminating; the paper does not flag or discuss this."
    243     },
    244     {
    245       "flag": "Construct validity unvalidated",
    246       "detail": "The assignment of seven tasks to three cognitive stages (Perception/Planning/Action) is definitional, not empirically validated. There is no evidence that accuracy on, e.g., Query Rewrite, is a valid proxy for 'Perception ability.'"
    247     },
    248     {
    249       "flag": "Unvalidated open-ended metric",
    250       "detail": "Cosine similarity via Qwen3-Embedding-8B is used for RAG-QA and Query Rewrite without validating its correlation with human judgment or comparing to alternative metrics."
    251     },
    252     {
    253       "flag": "Single-run evaluation, no error bars",
    254       "detail": "Section 4.1.3 explicitly states all experiments are conducted in 'a single run,' with no confidence intervals or variance estimates reported for any metric."
    255     },
    256     {
    257       "flag": "Undisclosed conflict of interest",
    258       "detail": "The corresponding author and data source are both from Xiaohongshu Inc., which has a financial interest in showcasing LLM performance on its operational tasks. No conflict of interest is declared."
    259     },
    260     {
    261       "flag": "Translation validity not assessed",
    262       "detail": "The dataset was originally in Chinese and translated to English by an LLM; no translation quality evaluation or comparison between Chinese and English performance is provided."
    263     }
    264   ],
    265   "cited_papers": [
    266     {
    267       "title": "τ-bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains",
    268       "relevance": "Primary baseline benchmark for e-commerce agent evaluation; EComStage explicitly extends its scope to include stage-wise reasoning."
    269     },
    270     {
    271       "title": "ECom-Bench: Can LLM Agent Resolve Real-World E-commerce Customer Support Issues?",
    272       "relevance": "Direct prior work benchmarking LLM agents in e-commerce customer service; compared in Table 1."
    273     },
    274     {
    275       "title": "Mix-Ecom: Towards Mixed-Type E-Commerce Dialogues with Complex Domain Rules",
    276       "relevance": "Another e-commerce benchmark covering multi-intent dialogues; compared in Table 1 as lacking stage-wise evaluation."
    277     },
    278     {
    279       "title": "AgentBench: Evaluating LLMs as Agents",
    280       "relevance": "Foundational benchmark for evaluating LLMs as agents; provides methodological context for agent evaluation design."
    281     },
    282     {
    283       "title": "EcomScriptBench: A Multi-Task Benchmark for E-Commerce Script Planning via Step-wise Intention-Driven Product Association",
    284       "relevance": "Closest prior work to EComStage in introducing step-wise evaluation; cited as limited in scope compared to EComStage."
    285     },
    286     {
    287       "title": "The Llama 3 Herd of Models",
    288       "relevance": "Source of LLaMA3.2 and LLaMA3.3 models evaluated in the benchmark experiments."
    289     },
    290     {
    291       "title": "Qwen3 Technical Report",
    292       "relevance": "Source of Qwen3 model family evaluated extensively in the benchmark; covers models from 1.7B to 235B parameters."
    293     },
    294     {
    295       "title": "DeepSeek-V3 Technical Report",
    296       "relevance": "DeepSeek-V3 is one of the top-performing models evaluated; its performance relative to DeepSeek-R1 is specifically analyzed."
    297     }
    298   ],
    299   "engagement_factors": {
    300     "practical_relevance": {
    301       "score": 2,
    302       "justification": "Practitioners deploying LLMs for e-commerce customer or merchant service can directly use this benchmark to select models and identify stage-specific weaknesses."
    303     },
    304     "surprise_contrarian": {
    305       "score": 1,
    306       "justification": "The finding that small instruction-tuned models (Qwen3-4B-Instruct) outperform larger non-instruction-tuned variants mildly challenges the 'bigger is always better' assumption, but the overall narrative follows expected trends."
    307     },
    308     "fear_safety": {
    309       "score": 0,
    310       "justification": "No AI risk or safety concerns are raised; the paper focuses on performance evaluation, not failure modes or harms."
    311     },
    312     "drama_conflict": {
    313       "score": 0,
    314       "justification": "Standard benchmark paper with no controversy; the Xiaohongshu affiliation is neither framed as contentious nor likely to be perceived that way."
    315     },
    316     "demo_ability": {
    317       "score": 2,
    318       "justification": "Code and data are publicly released on GitHub, enabling readers to reproduce evaluations or test new models against EComStage."
    319     },
    320     "brand_recognition": {
    321       "score": 1,
    322       "justification": "Xiaohongshu (Little Red Book) is a recognized Chinese tech company but not a flagship AI lab; the benchmark evaluates well-known models (GPT-4o, Claude, Gemini) which adds recognizability."
    323     }
    324   },
    325   "hn_data": {
    326     "threads": [],
    327     "top_points": 0,
    328     "total_points": 0,
    329     "total_comments": 0
    330   }
    331 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs