scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28194B)
      1 {
      2   "paper": {
      3     "title": "EComStage: Stage-wise and Orientation-specific Benchmarking for Large Language Models in E-commerce",
      4     "authors": [
      5       "Kaiyan Zhao",
      6       "Zijie Meng",
      7       "Zheyong Xie",
      8       "Jin Duan",
      9       "Yao Hu",
     10       "Zuozhu Liu",
     11       "Shaosheng Cao"
     12     ],
     13     "year": 2026,
     14     "venue": "arXiv",
     15     "arxiv_id": "2601.02752",
     16     "doi": "10.48550/arXiv.2601.02752"
     17   },
     18   "scan_version": 2,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval"],
     21   "key_findings": "EComStage benchmarks 33 LLMs across 7 e-commerce tasks decomposed into Perception, Planning, and Action stages, with both customer- and merchant-oriented scenarios. No single model excels across all stages and orientations: Claude Sonnet 4 leads on merchant-oriented planning tasks while Qwen3-235B-A22B-Instruct achieves the best overall score (85.61) among large open-source models. Most models perform well on classification-style Perception tasks but show significant variation on Planning and Action, with merchant-oriented tasks revealing particular weaknesses in models like GPT-4o.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper provides a GitHub repository link in footnote 1: 'Codes and data: https://github.com/KYuuto1006/EComStage'."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The same footnote 1 states that 'Codes and data' are released at the GitHub link, and Table 1 marks Availability with a checkmark for EComStage."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper states '8 NVIDIA H800 GPUs' and inference hyperparameters, but provides no requirements.txt, Dockerfile, or library version specifications needed to recreate the software environment."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions, README descriptions, or scripts for replicating experiments are mentioned in the paper."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "Table 3 reports only point estimates (e.g., '84.21', '85.61') with no confidence intervals, error bars, or ± notation for any results."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper makes comparative claims (e.g., 'Claude Sonnet 4 achieves the best average score') based solely on comparing raw numbers. No statistical significance tests are used anywhere."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "Only raw accuracy scores and cosine similarity values are reported. No effect sizes (Cohen's d, odds ratios, or contextualized improvement magnitudes) are provided."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The paper notes 'our Planning set contains only 164 samples' but provides no justification or power analysis for sample sizes across any of the 7 tasks."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "Section 4.1.3 explicitly states 'All experiments are conducted on 8 NVIDIA H800 GPUs with a single run.' No variance, standard deviation, or multiple-run results are reported."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "The paper evaluates 33 models against each other, providing extensive cross-model comparisons across all 7 tasks (Table 3)."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Evaluated models include very recent systems: Claude Sonnet 4, Gemini 2.5-Pro, Qwen3 series, DeepSeek-R1, and dots.llm1.inst, all from 2024-2025."
     82       },
     83       "ablation_study": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "This is a benchmark paper evaluating existing models, not a system with components to ablate. The stage-wise breakdown serves a similar diagnostic purpose."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "The paper uses accuracy for close-ended tasks and cosine similarity (using Qwen3-Embedding-8B) for open-ended generation tasks (Section 4.1.2)."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "Evaluation of model outputs is entirely automated (accuracy and cosine similarity). No human evaluation of the models' generated responses is performed. Human annotation was used only for benchmark construction, not for evaluating model outputs."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The benchmark is used as a fixed evaluation set with uniform hyperparameters (temperature 0.1, top-p 0.001) applied to all models. No tuning is done on the benchmark data, so it functions as a clean held-out test set."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Table 3 provides per-task breakdowns for all 7 tasks across all 33 models, and Figure 3 provides stage-wise and orientation-wise aggregations."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "The paper discusses which models score low on which tasks in aggregate (e.g., 'GLM4-9B' scoring 38.44 on Attitude Classification) but provides no qualitative error analysis or concrete examples of model failures."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper reports several negative findings: DeepSeek-R1 performing worse than DeepSeek-V3 despite being newer (Section 4.2.4), Qwen3-8B and Qwen3-14B falling behind smaller Qwen3-4B-Instruct (Section 4.2.3), and GPT-4o showing weakness on merchant-oriented tasks (Section 4.3)."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The abstract claims stage/orientation-specific strengths and weaknesses, which is supported by Table 3 and Figure 3. The claim that 'no single model consistently excels across all stages' is directly supported by the per-task results."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The paper makes multiple unsupported causal claims: 'likely due to its optimization for complex reasoning and tool use' (Section 4.2.1), 'likely benefiting from its large model capacity' (Section 4.2.4), 'likely due to its training on fine-grained dialogue understanding' (Section 4.2.4). These are speculative explanations without causal study design."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The title claims to benchmark LLMs 'in E-commerce' generally, but the data comes from a single platform (Xiaohongshu, as indicated by author affiliations). The paper does not acknowledge that results from one e-commerce platform's data may not generalize to other platforms, markets, or cultural contexts."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper attributes performance differences to model-specific factors (training data, instruction tuning) without considering alternative explanations such as prompt sensitivity, language bias from translation, or task-specific confounds."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper measures accuracy on simplified tasks (e.g., classification, matching) and frames this as evaluating 'Perception, Planning, and Action' capabilities of e-commerce agents. The gap between controlled benchmark performance and real-world agent capability in live e-commerce settings is not discussed."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "Closed-source APIs are listed as 'GPT-4o', 'Gemini 2.5-Pro', 'Claude 3.7', 'Claude Sonnet 4' without snapshot dates or API version identifiers. The schema requires specific versions, not marketing names."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Full system prompts and user prompt templates for all 7 tasks are provided in Appendix A.2 (Figures 9-15), with sufficient detail to reconstruct the inputs sent to models."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 4.1.3 reports: batch size 32, input length 4096 tokens, temperature 0.1, top-p 0.001, repetition penalty 1.05, max generation 512 tokens."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No agentic scaffolding is used. The models are evaluated via direct prompting without tools, memory, or multi-step workflows."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 3.1 describes a detailed multi-stage pipeline including task-specific filtering (Figure 5), global filtering (Figure 6), consistency checking (Figure 7), translation (Figure 8), and security screening, with specific criteria at each stage."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "A dedicated 'Limitations' section appears after the Conclusion, discussing the lack of error propagation evaluation and limited coverage of e-commerce domains."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The limitations mention specific threats: the benchmark 'does not capture error propagation across stages, which may occur in real-world deployments' and covers 'representative but finite e-commerce scenarios' that 'do not fully encompass all e-commerce domains.'"
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "The Limitations section explicitly states what the benchmark does not cover: cross-stage error propagation and domains beyond the seven covered tasks, noting room for future expansion."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "The paper states data is released at the GitHub repository (footnote 1), and Table 1 marks Availability with a checkmark."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 3.1.1 describes data collection from 'real e-commerce service scenarios, covering both customer and merchant orientations.' The pipeline including annotation, filtering, and verification is described in detail."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "Annotators are described as 'employees of our company with relevant experience in e-commerce operations' (Appendix A.3), but the sampling strategy for selecting which business data to include from the platform is not described — there is no information on how scenarios were selected from the universe of available interactions."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "The pipeline stages are described (Figure 2, Section 3.1), but no intermediate counts are provided. The paper does not report how many samples existed before filtering, how many were removed at each stage, or the rejection rates for quality checks."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding source, acknowledgments section, or grant information is disclosed anywhere in the paper."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are clearly listed: The University of Tokyo, Zhejiang University, and Xiaohongshu Inc. The corresponding author's email is at xiaohongshu.com."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "Three authors are from Xiaohongshu Inc., which is the apparent source of the benchmark data. The company has an interest in their platform's data being seen as a valuable source for AI evaluation, though they are not evaluating their own model."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests statement or financial interest declarations are present in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No training data cutoff dates are stated for any of the 33 evaluated models, including closed-source APIs like GPT-4o and Gemini 2.5-Pro."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No discussion of whether any model's training data might contain similar e-commerce conversations or the specific tasks used in the benchmark."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "Although the benchmark data is newly created from proprietary business data (reducing contamination risk), the paper does not discuss contamination at all. The data was translated from Chinese to English, which could mitigate overlap, but this is not framed as a contamination prevention measure."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in the study. Annotators are company employees performing work duties, not research subjects."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in the study."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in the study."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in the study."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in the study."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in the study."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in the study."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No inference costs, API costs, latency, or per-example costs are reported despite evaluating 33 models (including paid APIs) across 4,804 samples."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "The paper states '8 NVIDIA H800 GPUs' but does not quantify total GPU hours, wall-clock time, or API spend for the experiments."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Section 4.1.3 states 'All experiments are conducted on 8 NVIDIA H800 GPUs with a single run.' No seed sensitivity analysis is performed."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "Section 4.1.3 explicitly states 'with a single run,' making the number of runs clear."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The paper uses fixed hyperparameters (temperature 0.1, top-p 0.001, repetition penalty 1.05) but does not justify these choices or report whether alternatives were explored."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": true,
    319         "justification": "The paper applies uniform settings across all models (Section 4.1.3: 'These settings are applied consistently across all evaluated models to provide a fair comparison'), so there is no cherry-picking of per-model configurations."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The paper compares 33 models across 7 tasks, making numerous comparative claims, but no statistical tests are performed at all, let alone corrections for multiple comparisons."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": false,
    328         "answer": false,
    329         "justification": "The paper evaluates published third-party models on a benchmark; it does not propose or evaluate its own model, so the Lucic et al. bias of re-implementing baselines does not apply."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "Models ranging from 1B to 200B+ parameters are compared without normalizing for compute. The paper does not discuss performance per unit of compute or compare models at matched compute budgets."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "The paper defines Perception, Planning, and Action stages and assigns tasks to them, but does not validate that these tasks actually measure the claimed constructs. For example, whether 'Query Match' truly measures 'Perception' capability rather than simple retrieval/matching is not discussed."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": false,
    343         "answer": false,
    344         "justification": "No scaffolding is used; models are evaluated via direct prompting with uniform settings."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether models' training data might include similar e-commerce interactions from before the benchmark's creation date."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether the evaluation setup (e.g., providing question lists or response options) gives models information that would not be available in a real deployment."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of whether training data for any evaluated model might contain similar customer service conversations from the same or similar e-commerce platforms."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No leakage detection or prevention methods are applied. Although the data originates from proprietary sources and is translated, this is not framed or analyzed as a contamination mitigation strategy."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "No single model consistently excels across all stages (Perception, Planning, Action) or orientations (customer-oriented, merchant-oriented).",
    373       "evidence": "Table 3 and Figure 3 show varying performance across models: Claude Sonnet 4 leads among closed-source models (84.21) but Gemini 2.5-Pro leads on Solution Decision (87.06); GPT-4o shows relative weakness on merchant-oriented tasks while DeepSeek-V3 excels there.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Qwen3-235B-A22B-Instruct achieves the best overall performance (85.61) among all evaluated models.",
    378       "evidence": "Table 3 shows Qwen3-235B-A22B-Instruct at 85.61 average across 7 tasks, the highest value in the table.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Recent instruction-tuned variants consistently outperform their base counterparts.",
    383       "evidence": "Section 4.2.4: Qwen3-235B-A22B-Instruct (85.61) outperforms Qwen3-235B-A22B (84.43); Qwen3-30B-A3B-Instruct (83.63) outperforms Qwen3-30B-A3B (82.05). Section 4.2.2: Qwen3-4B-Instruct (82.26) substantially outperforms Qwen3-4B (78.13).",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Stage-wise evaluation reveals weaknesses hidden by overall average scores.",
    388       "evidence": "Figure 3 shows models with similar overall averages have distinct stage-specific profiles. GPT-4o performs well overall but is weaker on merchant-oriented tasks; Claude Sonnet 4 excels in Planning but lags in Action.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Smaller models (4B) can achieve performance competitive with much larger models when properly instruction-tuned.",
    393       "evidence": "Table 3 shows Qwen3-4B-Instruct (82.26) outperforming Qwen3-8B (81.01) and Qwen3-14B (81.70), though this is based on single-run results without variance estimates.",
    394       "supported": "weak"
    395     },
    396     {
    397       "claim": "DeepSeek-V3 performs better than DeepSeek-R1 in e-commerce scenarios.",
    398       "evidence": "Table 3: DeepSeek-V3 (84.23) vs DeepSeek-R1 (82.17). The difference is 2.06 points but based on a single run with no statistical test.",
    399       "supported": "weak"
    400     }
    401   ],
    402   "red_flags": [
    403     {
    404       "flag": "Single-run evaluation with no variance estimates",
    405       "detail": "All 33 models are evaluated in a single run with no repetition. Performance differences of 1-2 points between models (e.g., DeepSeek-V3 vs DeepSeek-R1) could easily fall within run-to-run variance, yet are interpreted as meaningful."
    406     },
    407     {
    408       "flag": "No statistical tests for comparative claims",
    409       "detail": "The paper makes dozens of comparative claims about model superiority based purely on raw accuracy differences, with no statistical significance tests. Many claimed differences are small (e.g., 84.21 vs 84.01 between Claude Sonnet 4 and Gemini 2.5-Pro)."
    410     },
    411     {
    412       "flag": "Company employees creating benchmark from own platform data",
    413       "detail": "Three authors from Xiaohongshu Inc. created a benchmark using data from their own platform. The data source is never named in the paper despite being identifiable from affiliations. Potential bias in task selection, data curation, and what constitutes 'correct' answers."
    414     },
    415     {
    416       "flag": "No contamination analysis",
    417       "detail": "33 pre-trained LLMs are evaluated on e-commerce tasks without any discussion of whether similar customer service data appeared in their training sets. Many models are trained on web data that likely includes e-commerce conversations."
    418     },
    419     {
    420       "flag": "Cosine similarity as evaluation metric for generation tasks",
    421       "detail": "Open-ended generation tasks (Query Rewrite, RAG-QA) are evaluated via cosine similarity with a single embedding model (Qwen3-Embedding-8B), which may not capture semantic correctness. This is validated neither against human judgments nor with alternative embedding models."
    422     },
    423     {
    424       "flag": "Speculative causal attributions",
    425       "detail": "The paper repeatedly attributes performance differences to model-specific factors ('likely due to its optimization for complex reasoning,' 'likely benefiting from its large model capacity') without evidence for these causal claims."
    426     }
    427   ],
    428   "cited_papers": [
    429     {
    430       "title": "τ-bench: A benchmark for tool-agent-user interaction in real-world domains",
    431       "authors": ["Shunyu Yao", "Noah Shinn", "Pedram Razavi", "Karthik Narasimhan"],
    432       "year": 2024,
    433       "arxiv_id": "2406.12045",
    434       "relevance": "Key related benchmark for evaluating LLM agents in realistic tool-use scenarios including retail domains."
    435     },
    436     {
    437       "title": "AgentBench: Evaluating LLMs as agents",
    438       "authors": ["Xiao Liu"],
    439       "year": 2024,
    440       "relevance": "Benchmark for evaluating LLMs as agents across multiple environments, relevant to agentic AI evaluation methodology."
    441     },
    442     {
    443       "title": "Large language model based multi-agents: A survey of progress and challenges",
    444       "authors": ["Taicheng Guo"],
    445       "year": 2024,
    446       "arxiv_id": "2402.01680",
    447       "relevance": "Survey of multi-agent LLM systems covering coordination, reasoning, and evaluation challenges."
    448     },
    449     {
    450       "title": "The rise and potential of large language model based agents: A survey",
    451       "authors": ["Zhiheng Xi"],
    452       "year": 2023,
    453       "arxiv_id": "2309.07864",
    454       "relevance": "Comprehensive survey of LLM-based agents covering capabilities, architectures, and evaluation approaches."
    455     },
    456     {
    457       "title": "The Llama 3 herd of models",
    458       "authors": ["Aaron Grattafiori"],
    459       "year": 2024,
    460       "arxiv_id": "2407.21783",
    461       "relevance": "Technical report for the Llama 3 model family, one of the major open-source LLMs evaluated in benchmarks."
    462     },
    463     {
    464       "title": "DeepSeek-V3 technical report",
    465       "authors": ["DeepSeek-AI"],
    466       "year": 2025,
    467       "arxiv_id": "2412.19437",
    468       "relevance": "Technical report for DeepSeek-V3, a major open-source model relevant to LLM capability evaluation."
    469     },
    470     {
    471       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    472       "authors": ["DeepSeek-AI"],
    473       "year": 2025,
    474       "arxiv_id": "2501.12948",
    475       "relevance": "Describes reinforcement learning approach to improving LLM reasoning, relevant to agent capability research."
    476     },
    477     {
    478       "title": "GPT-4o system card",
    479       "authors": ["OpenAI"],
    480       "year": 2024,
    481       "arxiv_id": "2410.21276",
    482       "relevance": "System card for GPT-4o, a key closed-source model evaluated across benchmarks."
    483     },
    484     {
    485       "title": "WebShop: Towards scalable real-world web interaction with grounded language agents",
    486       "authors": ["Shunyu Yao", "Howard Chen", "John Yang", "Karthik Narasimhan"],
    487       "year": 2022,
    488       "relevance": "Early benchmark for evaluating LLM agents in web-based shopping environments."
    489     },
    490     {
    491       "title": "Large language model agent: A survey on methodology, applications and challenges",
    492       "authors": ["Junyu Luo"],
    493       "year": 2025,
    494       "arxiv_id": "2503.21460",
    495       "relevance": "Recent comprehensive survey of LLM agent methodology and evaluation, relevant to understanding the agentic AI landscape."
    496     },
    497     {
    498       "title": "A survey on code generation with LLM-based agents",
    499       "authors": ["Yihong Dong"],
    500       "year": 2025,
    501       "arxiv_id": "2508.00083",
    502       "relevance": "Survey of LLM-based code generation agents, relevant to understanding agent capabilities in software engineering."
    503     },
    504     {
    505       "title": "A survey of large language models",
    506       "authors": ["Wayne Xin Zhao"],
    507       "year": 2025,
    508       "arxiv_id": "2303.18223",
    509       "relevance": "Comprehensive survey of LLMs covering training, evaluation, and capabilities, relevant to overall LLM assessment methodology."
    510     }
    511   ]
    512 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs