ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25540B)


      1 {
      2   "paper": {
      3     "title": "Jenius Agent: Towards Experience-Driven Accuracy Optimization in Real-World Scenarios",
      4     "authors": ["Defei Xia", "Bingfeng Pi", "Shenbin Zhang", "Song Hua", "Yunfei Wei", "Lei Zuo"],
      5     "year": 2026,
      6     "venue": "arXiv (submitted to MLSys 2025)",
      7     "arxiv_id": "2601.01857",
      8     "doi": "10.48550/arXiv.2601.01857"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval", "case-study"],
     13   "key_findings": "Jenius-Agent integrates adaptive prompt generation, context-aware tool orchestration, and hierarchical memory management into a ReAct-style agent. On their custom Jenius-bench (850 multi-turn tasks), the full system achieves 76.5% task completion vs 56.6% baseline (35% relative gain). Token consumption is reduced by over 60%. On APIGen, gains are marginal (81.5% to 85.0%), suggesting the benchmark saturates. The system is deployed at jenius.cn with 265 DAU after 6 months.",
     14   "claims": [
     15     {
     16       "claim": "Up to 35% relative gain in task completion rate over the base agent on Jenius-bench",
     17       "evidence": "Table 3: Base TCR=0.5659, Jenius TCR=0.7647 (Section 4.4.1). This is a 35% relative improvement.",
     18       "supported": "strong"
     19     },
     20     {
     21       "claim": "Over 60% reduction in token consumption compared to baseline",
     22       "evidence": "Figure 6 and Section 4.4.3: Jenius-bench usage decreases from 9.27M to 3.65M tokens.",
     23       "supported": "strong"
     24     },
     25     {
     26       "claim": "Adaptive prompt generation is the single largest contributor, improving TCR from 0.5659 to 0.7271 (+16 percentage points)",
     27       "evidence": "Table 3: Base→B-P comparison. Subsequent modules (tool orchestration, memory) add smaller incremental gains.",
     28       "supported": "moderate"
     29     },
     30     {
     31       "claim": "APIGen benchmark saturates, showing only marginal improvements from optimization modules",
     32       "evidence": "Table 2: Base TCR=0.8150, Jenius TCR=0.8500. Section 4.4.1 attributes this to pre-training coverage.",
     33       "supported": "moderate"
     34     },
     35     {
     36       "claim": "8-10% improvement in correctness and fluency over baseline on CRCFF metrics",
     37       "evidence": "Table 4: Correctness improves from 0.6741 to 0.7580 (Qwen) and 0.7890 to 0.8350 (DeepSeek).",
     38       "supported": "weak"
     39     },
     40     {
     41       "claim": "System deployed at scale with 265 DAU after 6 months",
     42       "evidence": "Section 5.2: DAU grew from 42 to 265, MAU of 1,277 in first month, spanning 34 countries.",
     43       "supported": "moderate"
     44     }
     45   ],
     46   "checklist": {
     47     "artifacts": {
     48       "code_released": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No source code repository URL is provided. The paper mentions jenius.cn as a deployed product but releases no code."
     52       },
     53       "data_released": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Jenius-bench is described in detail but no download link or repository is provided. APIGen is public, but the custom 800-sample subset and noise-injection setup are not released."
     57       },
     58       "environment_specified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No environment specification, requirements file, or dependency list is provided. The paper mentions Alibaba Cloud and Kubernetes but no software versions."
     62       },
     63       "reproduction_instructions": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No reproduction instructions are provided. The system is described architecturally but there are no steps to replicate experiments."
     67       }
     68     },
     69     "statistical_methodology": {
     70       "confidence_intervals_or_error_bars": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "Tables 2-4 report only point estimates. No confidence intervals, error bars, or ± notation anywhere in the results."
     74       },
     75       "significance_tests": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "The paper claims improvements across multiple configurations but performs no statistical significance tests. All comparisons are raw number differences."
     79       },
     80       "effect_sizes_reported": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Percentage improvements are reported with baseline context: e.g., TCR from 0.5659 to 0.7271 (↑16%), correctness from 0.6741 to 0.7580 (↑5.2%), and token reduction from 9.27M to 3.65M (>60%)."
     84       },
     85       "sample_size_justified": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "APIGen uses 800 sampled instances and Jenius-bench has 850 samples. No justification is given for these sizes or any power analysis."
     89       },
     90       "variance_reported": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No variance, standard deviation, or spread measures are reported. All results appear to be single-run numbers."
     94       }
     95     },
     96     "evaluation_design": {
     97       "baselines_included": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The paper compares four progressively enhanced configurations: Base (ReAct), B-P (+ prompt), B-PT (+ tool), and Jenius (full system). Section 4.3."
    101       },
    102       "baselines_contemporary": {
    103         "applies": true,
    104         "answer": false,
    105         "justification": "All baselines are internal ablations of the same system. No comparison against any external agent framework (e.g., AutoGen, LangChain, or other published agent systems)."
    106       },
    107       "ablation_study": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The four configurations (Base, B-P, B-PT, Jenius) form a progressive ablation, isolating the contribution of each module. Tables 2-3."
    111       },
    112       "multiple_metrics": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper uses 4T metrics (TCR, TFR, TIR, TPS) for execution fidelity, CRCFF (5 dimensions) for output quality, and token consumption. Section 4.1."
    116       },
    117       "human_evaluation": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "Output quality is evaluated entirely by LLM judges (Qwen and DeepSeek). No human evaluation of system outputs is performed. The Jenius-bench annotations are human-curated ground truth, but evaluation of the system's outputs is automated."
    121       },
    122       "held_out_test_set": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No mention of train/dev/test splits. The paper evaluates on the full APIGen sample and Jenius-bench without discussing whether any data was used for development or tuning."
    126       },
    127       "per_category_breakdown": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "Jenius-bench covers 38 tool categories and APIGen has 21 categories, but no per-category performance breakdown is provided. Only aggregate results are reported."
    131       },
    132       "failure_cases_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Section 5.3 discusses several real-world failure modes: spurious tool calls, URL reader failures, inappropriate triggering of expensive tools, and retry loops. Section 4.4.1 discusses TFR and TIR as failure metrics."
    136       },
    137       "negative_results_reported": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper reports that full Jenius increased TFR on Jenius-bench from 0.0329 to 0.0753 (Section 4.4.1, Table 3), and that memory management showed marginal gains on DeepSeek evaluator (Table 4). Section 6 acknowledges limitations."
    141       }
    142     },
    143     "claims_and_evidence": {
    144       "abstract_claims_supported": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Abstract claims of 'up to 35% relative gain' and 'reduced token consumption' are supported by Tables 2-3 and Figure 6. The abstract appropriately hedges with 'up to'."
    148       },
    149       "causal_claims_justified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Causal claims (each module 'improves' performance) are supported by controlled ablation: each configuration adds exactly one module to the previous, enabling attribution. Section 4.3."
    153       },
    154       "generalization_bounded": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "The title claims 'Real-World Scenarios' broadly, but evaluation is limited to their own Jenius-bench and APIGen. The paper does not bound claims to these specific benchmarks, and the abstract frames this as a general 'scalable solution for robust, protocol-compatible autonomous agents.'"
    158       },
    159       "alternative_explanations_discussed": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "No alternative explanations for the observed improvements are discussed. For example, the prompt optimization module could simply be providing more information rather than 'adaptive' optimization. The improvements could stem from better prompt engineering rather than the proposed architecture."
    163       },
    164       "proxy_outcome_distinction": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "The paper uses TCR and CRCFF (LLM-judged scores) as proxies for agent quality and 'accuracy optimization' but does not discuss the gap between these metrics and actual user satisfaction or real-world task success. Section 5 mentions user engagement but does not connect deployment metrics to evaluation metrics."
    168       }
    169     },
    170     "setup_transparency": {
    171       "model_versions_specified": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "The paper mentions using Qwen3-Embedding for tool retrieval and Qwen/DeepSeek as LLM evaluators, but never specifies which LLM powers the agent itself, nor provides model versions/snapshot dates for any model used."
    175       },
    176       "prompts_provided": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper describes prompt design principles (Section 3.1) at a high level but does not provide actual prompt text used in experiments. Only design patterns and categories are described."
    180       },
    181       "hyperparameters_reported": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No hyperparameters are reported: no temperature, top-p, max tokens, embedding dimensions, threshold K for memory summarization, or M for top-M retrieval."
    185       },
    186       "scaffolding_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "The three-module scaffolding is described in detail: adaptive prompt generation (Section 3.1), tool orchestration with embedding similarity and inflection-point filtering (Section 3.2), and hierarchical memory management with dialogue-level alignment and session-level summarization (Section 3.3)."
    190       },
    191       "data_preprocessing_documented": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "For Jenius-bench: the paper says 850 'human-curated samples' with 'manual review and annotation by domain experts' but does not describe the curation pipeline, filtering criteria, or annotation guidelines. For APIGen: 800 instances were 'sampled' with no description of sampling method."
    195       }
    196     },
    197     "limitations_and_scope": {
    198       "limitations_section_present": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "There is no dedicated limitations section. Section 6 (Discussion and Conclusion) briefly mentions 'incomplete capture of hidden reasoning steps' and 'multiple valid tool-use paths' but this is embedded in the conclusion, not a substantive limitations discussion."
    202       },
    203       "threats_to_validity_specific": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No specific threats to validity are discussed. The brief mention in Section 6 is generic ('highlighting limitations') without identifying specific threats like evaluator bias, benchmark representativeness, or internal validity concerns."
    207       },
    208       "scope_boundaries_stated": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "No explicit scope boundaries are stated. The paper does not clarify what the results do NOT show, what task types are excluded, or what settings the framework has not been tested in."
    212       }
    213     },
    214     "data_integrity": {
    215       "raw_data_available": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Neither Jenius-bench data nor experimental results (individual task outcomes) are available for verification."
    219       },
    220       "data_collection_described": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "Jenius-bench is described as 'derived from real user-agent interactions' with 'manual review,' but the collection procedure, time period, and selection criteria are not specified."
    224       },
    225       "recruitment_methods_described": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "No human participants in the evaluation. Jenius-bench is derived from logged interactions, not a participant study."
    229       },
    230       "data_pipeline_documented": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "The pipeline from raw user interactions to the final 850-sample benchmark is not documented. No filtering stages, exclusion counts, or annotation procedures are described."
    234       }
    235     },
    236     "conflicts_of_interest": {
    237       "funding_disclosed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No funding disclosure or acknowledgments section is present in the paper."
    241       },
    242       "affiliations_disclosed": {
    243         "applies": true,
    244         "answer": true,
    245         "justification": "All authors are listed as affiliated with 'Tianju Dihe (Suzhou) Technology Co., Ltd.' which operates the Jenius product being evaluated."
    246       },
    247       "funder_independent_of_outcome": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "All authors work at the company that develops and commercializes Jenius. The funder (the company itself) has a direct financial interest in positive results."
    251       },
    252       "financial_interests_declared": {
    253         "applies": true,
    254         "answer": false,
    255         "justification": "No competing interests statement is present. The authors work for the company whose product they evaluate, but this conflict is not explicitly acknowledged."
    256       }
    257     },
    258     "contamination": {
    259       "training_cutoff_stated": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "The underlying LLM powering the agent is not even named, let alone its training cutoff. APIGen was published in 2024 and could be in training data."
    263       },
    264       "train_test_overlap_discussed": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "No discussion of whether the agent's LLM may have seen APIGen examples or similar tasks during training."
    268       },
    269       "benchmark_contamination_addressed": {
    270         "applies": true,
    271         "answer": false,
    272         "justification": "APIGen is a public benchmark (NeurIPS 2024). No contamination analysis is performed. Jenius-bench is proprietary so contamination risk is lower, but this is not discussed."
    273       }
    274     },
    275     "human_studies": {
    276       "pre_registered": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in the study."
    280       },
    281       "irb_or_ethics_approval": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in the study."
    285       },
    286       "demographics_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants in the study."
    290       },
    291       "inclusion_exclusion_criteria": {
    292         "applies": false,
    293         "answer": false,
    294         "justification": "No human participants in the study."
    295       },
    296       "randomization_described": {
    297         "applies": false,
    298         "answer": false,
    299         "justification": "No human participants in the study."
    300       },
    301       "blinding_described": {
    302         "applies": false,
    303         "answer": false,
    304         "justification": "No human participants in the study."
    305       },
    306       "attrition_reported": {
    307         "applies": false,
    308         "answer": false,
    309         "justification": "No human participants in the study."
    310       }
    311     },
    312     "cost_and_practicality": {
    313       "inference_cost_reported": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "Token consumption is reported in Figure 6: from 9.27M (Base) to 3.65M (Jenius) on Jenius-bench. Section 4.4.3 discusses token efficiency in detail."
    317       },
    318       "compute_budget_stated": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "No total compute budget, GPU hours, API cost in dollars, or hardware specifications for experiments are stated."
    322       }
    323     },
    324     "experimental_rigor": {
    325       "seed_sensitivity_reported": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "No mention of multiple random seeds. All results appear to be single-run."
    329       },
    330       "number_of_runs_stated": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "The number of experimental runs is never stated. Results are presented without indicating whether they are from single or multiple runs."
    334       },
    335       "hyperparameter_search_budget": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "No hyperparameter search budget is reported. The inflection-point threshold N=10 was chosen empirically (Section 3.2) but the search process is not documented."
    339       },
    340       "best_config_selection_justified": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "The progressive ablation design (Base→B-P→B-PT→Jenius) shows all configurations, but no justification is given for how specific sub-choices (e.g., N=10, threshold K) were selected."
    344       },
    345       "multiple_comparison_correction": {
    346         "applies": false,
    347         "answer": false,
    348         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    349       },
    350       "self_comparison_bias_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The authors evaluate their own system (Jenius) on their own benchmark (Jenius-bench) without acknowledging this bias. Section 4.4 notes the within-framework comparison design but frames it positively rather than as a limitation."
    354       },
    355       "compute_budget_vs_performance": {
    356         "applies": true,
    357         "answer": true,
    358         "justification": "Figure 6 and Section 4.4.3 report token consumption alongside performance for each configuration, enabling performance-per-token analysis."
    359       },
    360       "benchmark_construct_validity": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No discussion of whether Jenius-bench or APIGen actually measure what they claim. The paper does not question construct validity of TCR as a proxy for agent quality or CRCFF as output quality."
    364       },
    365       "scaffold_confound_addressed": {
    366         "applies": false,
    367         "answer": false,
    368         "justification": "The paper evaluates Jenius as a bundled system; the scaffold IS the thing being tested. No cross-model comparison is made that would require isolating scaffold effects."
    369       }
    370     },
    371     "data_leakage": {
    372       "temporal_leakage_addressed": {
    373         "applies": true,
    374         "answer": false,
    375         "justification": "No discussion of temporal leakage. APIGen was published in 2024 and could be in the training data of models used. No cutoff dates are stated."
    376       },
    377       "feature_leakage_addressed": {
    378         "applies": true,
    379         "answer": false,
    380         "justification": "No discussion of whether the evaluation setup leaks information, e.g., whether tool descriptions in Jenius-bench contain hints toward the correct answer."
    381       },
    382       "non_independence_addressed": {
    383         "applies": true,
    384         "answer": false,
    385         "justification": "No discussion of independence between Jenius-bench tasks, or whether some tasks share structural similarities that could inflate performance."
    386       },
    387       "leakage_detection_method": {
    388         "applies": true,
    389         "answer": false,
    390         "justification": "No leakage detection or prevention method is applied to either benchmark."
    391       }
    392     }
    393   },
    394   "red_flags": [
    395     {
    396       "flag": "Company evaluating its own product",
    397       "detail": "All authors work at Tianju Dihe, which develops and commercializes Jenius. They evaluate their own system on their own proprietary benchmark (Jenius-bench) with no external validation, no independent evaluators, and no competing interests disclosure."
    398     },
    399     {
    400       "flag": "Proprietary benchmark with no release",
    401       "detail": "Jenius-bench (850 tasks) is the primary evaluation dataset but is not released. The data collection pipeline, annotation guidelines, and curation criteria are undocumented. Results cannot be independently verified."
    402     },
    403     {
    404       "flag": "No external baselines",
    405       "detail": "All comparisons are internal ablations (Base, B-P, B-PT, Jenius). No comparison against any published agent framework (AutoGen, LangChain, etc.) despite these being discussed in Related Work."
    406     },
    407     {
    408       "flag": "LLM-as-judge without validation",
    409       "detail": "CRCFF quality metrics are evaluated entirely by LLM judges (Qwen and DeepSeek). No human validation of the LLM judges' reliability or correlation with human judgment is provided."
    410     },
    411     {
    412       "flag": "No statistical rigor",
    413       "detail": "All results are single-run point estimates with no error bars, confidence intervals, significance tests, or variance across seeds. Marginal differences (e.g., 0.7494 vs 0.7647) may not be meaningful."
    414     },
    415     {
    416       "flag": "Undisclosed base model",
    417       "detail": "The LLM powering the agent is never identified. Without knowing the model, it is impossible to assess contamination risk, reproduce results, or compare fairly with other systems."
    418     },
    419     {
    420       "flag": "Tiny deployment scale presented as validation",
    421       "detail": "Section 5 presents 265 DAU and 1,277 MAU as evidence of real-world effectiveness, but these are very small numbers. No user satisfaction data, A/B test results, or outcome metrics from deployment are provided."
    422     }
    423   ],
    424   "cited_papers": [
    425     {
    426       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    427       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"],
    428       "year": 2023,
    429       "arxiv_id": "2210.03629",
    430       "relevance": "Foundational agent reasoning paradigm (ReAct) that Jenius builds upon."
    431     },
    432     {
    433       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversations",
    434       "authors": ["Qingyun Wu"],
    435       "relevance": "Major multi-agent framework discussed as a related approach with different execution assumptions."
    436     },
    437     {
    438       "title": "Why Do Multi-Agent LLM Systems Fail?",
    439       "authors": ["Mert Cemri"],
    440       "year": 2025,
    441       "arxiv_id": "2503.13657",
    442       "relevance": "Directly relevant to understanding agent failure modes, which Jenius claims to address."
    443     },
    444     {
    445       "title": "The Llama 3 Herd of Models",
    446       "authors": ["Abhimanyu Dubey"],
    447       "year": 2024,
    448       "relevance": "Major open LLM family relevant to LLM-based agent capability."
    449     },
    450     {
    451       "title": "Toolformer: Language Models Can Teach Themselves to Use Tools",
    452       "authors": ["Timo Schick"],
    453       "year": 2023,
    454       "relevance": "Foundational work on LLM tool use that Jenius's tool orchestration extends."
    455     },
    456     {
    457       "title": "APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets",
    458       "authors": ["Zuxin Liu"],
    459       "year": 2024,
    460       "relevance": "Public benchmark used in this paper's evaluation."
    461     },
    462     {
    463       "title": "MCP-Zero: Proactive Toolchain Construction for LLM Agents from Scratch",
    464       "authors": ["Xiang Fei"],
    465       "year": 2025,
    466       "arxiv_id": "2506.01056",
    467       "relevance": "Adaptive tool discovery approach related to Jenius's tool orchestration module."
    468     },
    469     {
    470       "title": "DSPy: Compiling Declarative Language Model Calls into State-of-the-Art Pipelines",
    471       "authors": ["Omar Khattab"],
    472       "relevance": "Modular prompt compilation framework relevant to prompt optimization research."
    473     },
    474     {
    475       "title": "The Rise and Potential of Large Language Model Based Agents: A Survey",
    476       "authors": ["Zhiheng Xi"],
    477       "year": 2025,
    478       "relevance": "Comprehensive survey of LLM-based agents relevant to the survey scope."
    479     },
    480     {
    481       "title": "The Landscape of Emerging AI Agent Architectures for Reasoning, Planning, and Tool Calling: A Survey",
    482       "authors": ["Tula Masterman"],
    483       "year": 2024,
    484       "arxiv_id": "2404.11584",
    485       "relevance": "Survey of agent architectures for reasoning and tool calling."
    486     },
    487     {
    488       "title": "From RAG to Multi-Agent Systems: A Survey of Modern Approaches in LLM Development",
    489       "authors": ["Gabriel de Araujo e Aquino"],
    490       "year": 2025,
    491       "relevance": "Survey covering RAG and multi-agent system evolution relevant to agent development."
    492     }
    493   ]
    494 }

Impressum · Datenschutz