scan-v5.json (24187B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Jenius Agent: Towards Experience-Driven Accuracy Optimization in Real-World Scenarios", 6 "authors": [ 7 "Defei Xia", 8 "Bingfeng Pi", 9 "Shenbin Zhang", 10 "Song Hua", 11 "Yunfei Wei" 12 ], 13 "year": 2026, 14 "venue": "arXiv.org", 15 "arxiv_id": "2601.01857", 16 "doi": "10.48550/arXiv.2601.01857" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": false, 23 "justification": "The abstract claims 'reduced tool invocation failures' but Task Failure Rate (TFR) on Jenius-bench actually increased from 0.0329 (Base) to 0.0753 (Jenius). The claim of 'reduced response latency' is never measured in the paper.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "The paper conducts a progressive four-variant ablation (Base→B-P→B-PT→Jenius) isolating contributions of each module, providing adequate support for within-framework causal attribution.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper frames results as applicable to 'real-world scenarios' broadly, but the primary novel benchmark (Jenius-bench) is derived entirely from their own production system and domain; gains on APIGen are marginal (0.8150→0.8500).", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "No alternative explanations are discussed; the paper does not consider whether gains could reflect Jenius-bench's construction bias toward their own system's tool ontology rather than genuine capability improvement.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": false, 47 "justification": "CRCFF evaluation uses LLM-as-judge (Qwen/DeepSeek) as a proxy for actual response quality, but this limitation — including potential self-serving evaluator bias — is not acknowledged.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": false, 55 "justification": "There is no dedicated limitations or threats-to-validity section; limitations are briefly mentioned in one clause of the conclusion ('incomplete capture of hidden reasoning steps') without dedicated treatment.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": false, 61 "justification": "No specific threats are discussed; the paper does not address Jenius-bench's construction from their own production logs (selection bias), the unspecified backbone LLM, or LLM evaluator self-serving bias.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": false, 67 "justification": "The paper does not state what results do not show; it claims 'generalizability' without bounding that claim to its tested setting of one proprietary multi-turn benchmark and one public single-turn benchmark.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "No funding disclosure is present anywhere in the paper.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "All authors are identified as affiliated with Tianju Dihe (Suzhou) Technology Co., Ltd., the company that operates the Jenius system being evaluated.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": true, 86 "answer": false, 87 "justification": "All authors are employees of the company whose commercial product (Jenius, deployed at jenius.cn) is the primary subject of evaluation — they have direct interest in positive results.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests or financial interests statement is included in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": false, 101 "justification": "The title's central term 'experience-driven' is never defined in the paper; the framework is described but the specific meaning of learning from 'experience' (vs. static design) is not operationalized.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper clearly states five contributions: system-level execution abstraction, modular optimization framework, task grounding improvements, an evaluation framework (4T + CRCFF), and experimental validation.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "The related work section engages substantively with prior approaches in prompt engineering (DSPy, Reflect-Retry-Reward), tool selection (MCP-Zero, BioMedTools), and memory management (StateFlow, Recursively Summarizing), noting specific gaps each leaves.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": false, 124 "justification": "No source code is released; the system is deployed at jenius.cn as a commercial product but no framework code or implementation is made available.", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": false, 130 "justification": "Jenius-bench (850 samples, 38 categories) is described as novel but is not publicly released; it contains 'real user-agent interactions' from their production system.", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "No requirements file, Dockerfile, or dependency specification is provided; the paper mentions Alibaba Cloud and Kubernetes infrastructure but gives no reproducible environment spec.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "No step-by-step reproduction instructions are provided; the benchmark is proprietary and the agent LLM backbone is unspecified.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "No confidence intervals or error bars are reported for any results in Tables 2, 3, or 4.", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No statistical significance tests are used despite comparative claims across four agent configurations on two benchmarks.", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Effect sizes are reported as percentage improvements (e.g., B-P improves TCR by 16%, Jenius achieves 35% relative gain over Base, token reduction >60%) with baseline context.", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "APIGen sample of 800 is justified only as 'computational tractability'; no power analysis or principled sample size determination is provided for either dataset.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "No variance, standard deviation, or run-to-run variability is reported for any metric in the evaluation tables.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "A standard ReAct-style Base agent serves as the canonical baseline against which all three module additions are compared.", 183 "source": "haiku" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "The ReAct baseline is a well-established and appropriate contemporary baseline for agentic systems; the paper's goal is within-framework ablation rather than cross-system comparison.", 189 "source": "haiku" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": true, 194 "justification": "A four-configuration progressive ablation (Base, B-P, B-PT, Jenius) isolates the contribution of each module on both benchmarks.", 195 "source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "Evaluation uses 4T execution fidelity metrics (TCR, TFR, TIR, TPS), five CRCFF output quality metrics, and token consumption, evaluated across two LLM judges.", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 "applies": true, 205 "answer": false, 206 "justification": "Output quality is evaluated solely by LLM-as-judge (Qwen-3 and DeepSeek); no human evaluation of system outputs is conducted.", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": false, 212 "justification": "Jenius-bench is derived from their own production system logs; there is no held-out test split described, and benchmark construction and system design are not cleanly separated.", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": false, 218 "justification": "Despite 38 tool categories in Jenius-bench and 21 in APIGen, no per-category performance breakdown is provided.", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Section 5.3 discusses concrete failure cases from deployment (spurious tool calls, inappropriate PPT generation, URL extraction failures, blind retry loops).", 225 "source": "haiku" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper does not explicitly report that TFR worsened with B-P and Jenius compared to Base on Jenius-bench (0.0329→0.0859→0.0753); this is a negative result that is not highlighted or analyzed.", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": false, 238 "justification": "The backbone LLM powering the agent itself is never specified anywhere in the paper — a critical omission that makes results unreproducible; only the embedding model (Qwen3 Embedding) and evaluators (Qwen-3, DeepSeek) are named.", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": false, 244 "justification": "No actual prompts or system instructions are provided; the paper describes prompt generation principles but provides no concrete prompt templates or fill values.", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": false, 250 "justification": "Penalty weights λw=λm=1 and context summarization threshold K are mentioned but temperature, top-p, and other LLM hyperparameters are absent; M for top-M tool retrieval is referenced but not given.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": true, 255 "answer": true, 256 "justification": "The three-module architecture (adaptive prompt generation, tool orchestration, hierarchical memory) is described in substantial detail including algorithms, formulas, and data flow diagrams.", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": false, 262 "justification": "Jenius-bench's manual review process is described qualitatively but specific preprocessing steps, filtering criteria, and inter-annotator agreement metrics are not documented.", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": false, 270 "justification": "Neither Jenius-bench nor the raw evaluation results are publicly available; the benchmark is described as novel but not released.", 271 "source": "haiku" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "Jenius-bench is described as derived from real user-agent interactions with manual review by domain experts; domains and tool category counts are specified.", 277 "source": "haiku" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "Jenius-bench is derived from production logs, not recruited participants; NA for standard benchmark evaluation context.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": false, 288 "justification": "The paper describes that trajectories come from real interactions and undergo manual review but does not document the full pipeline from raw logs to final benchmark instances.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "The backbone LLM is never identified, making it impossible to state its training cutoff; this omission prevents any contamination analysis.", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": true, 301 "answer": false, 302 "justification": "No discussion of whether the unspecified agent LLM could have been trained on APIGen data (a public dataset with 60K samples) is included.", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": true, 307 "answer": false, 308 "justification": "APIGen is a public benchmark that could be in training data; contamination is not addressed despite comparative claims on this dataset.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human subjects experiment conducted; deployment usage data is observational.", 317 "source": "haiku" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No formal human subjects study; NA.", 323 "source": "haiku" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No formal human participants; geographic distribution of production users is reported but not in a human-subjects research context.", 329 "source": "haiku" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "NA — no human subjects experiment.", 335 "source": "haiku" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "NA — no human subjects experiment.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "NA — no human subjects experiment.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "NA — no human subjects experiment.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": true, 360 "justification": "Token consumption (input + output tokens) is measured and reported for all four agent variants on both benchmarks as a proxy for inference cost.", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": false, 366 "justification": "No total computational budget, GPU hours, or monetary cost for running the experiments is stated.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "Jenius achieves up to 35% relative gain in task completion rate over the base agent", 375 "evidence": "TCR improves from 0.5659 (Base) to 0.7647 (Jenius) on Jenius-bench = 35.1% relative gain (Table 3)", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "The framework reduces token consumption by over 60%", 380 "evidence": "Token usage drops from 9.27M (Base) to 3.65M (Jenius) on Jenius-bench, a 60.6% reduction (Figure 6)", 381 "supported": "strong" 382 }, 383 { 384 "claim": "The framework reduces tool invocation failures", 385 "evidence": "TFR (task failure rate) actually increased from 0.0329 (Base) to 0.0753 (Jenius) on Jenius-bench; only TIR (partial completion) improved", 386 "supported": "unsupported" 387 }, 388 { 389 "claim": "Adaptive prompt generation is the dominant contributor, improving TCR by 16%", 390 "evidence": "B-P achieves TCR=0.7271 vs Base=0.5659 on Jenius-bench, a 16pp absolute gain per Table 3", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "The 4T + CRCFF evaluation framework captures execution-level failures that output-only metrics miss", 395 "evidence": "Conceptually motivated by comparison with APIGen's limitations; not validated against external ground truth or human judgment", 396 "supported": "weak" 397 }, 398 { 399 "claim": "The system improves response latency", 400 "evidence": "Latency is never measured in the paper; only token counts are reported as a proxy", 401 "supported": "unsupported" 402 } 403 ], 404 "methodology_tags": [ 405 "benchmark-eval", 406 "case-study" 407 ], 408 "key_findings": "Jenius-Agent achieves a 35% relative improvement in task completion rate over a base ReAct agent on the proprietary Jenius-bench dataset, driven primarily by adaptive prompt generation (+16pp TCR). Token efficiency improves by over 60% across modules. However, task failure rate (TFR) actually worsens with the full system (0.0329→0.0753), gains on the public APIGen benchmark are marginal (0.8150→0.8500), and neither the agent backbone LLM nor the benchmark are disclosed, making independent reproduction impossible.", 409 "red_flags": [ 410 { 411 "flag": "Unspecified backbone LLM", 412 "detail": "The LLM powering the agent is never named or versioned anywhere in the paper, making results irreproducible and contamination analysis impossible." 413 }, 414 { 415 "flag": "Proprietary benchmark not released", 416 "detail": "Jenius-bench is the primary evaluation benchmark but is derived from the authors' own production logs and not publicly released, preventing independent verification." 417 }, 418 { 419 "flag": "TFR worsened, not highlighted", 420 "detail": "Task Failure Rate increases from 0.0329 (Base) to 0.0753 (Jenius) on Jenius-bench — the opposite of the abstract's claim of 'reduced tool invocation failures' — and this is not acknowledged as a negative result." 421 }, 422 { 423 "flag": "Self-evaluation on own system", 424 "detail": "All authors are employees of the company whose commercial product is being evaluated; Jenius-bench is constructed from that same system's production logs, creating circular self-validation." 425 }, 426 { 427 "flag": "LLM-as-judge self-serving risk", 428 "detail": "Output quality is evaluated by Qwen-3 and DeepSeek, which may be the same model family as the unspecified agent LLM, introducing potential evaluator-generator alignment bias." 429 }, 430 { 431 "flag": "No statistical significance testing", 432 "detail": "All comparative claims are made without confidence intervals, error bars, or significance tests across five metrics and two benchmarks." 433 }, 434 { 435 "flag": "Claim without measurement", 436 "detail": "The abstract claims 'reduced response latency' but no latency measurements are reported anywhere in the paper." 437 } 438 ], 439 "cited_papers": [ 440 { 441 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 442 "relevance": "Foundational baseline agent framework; the paper's Base agent implements ReAct-style observe-think-act loop" 443 }, 444 { 445 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversations", 446 "relevance": "Prior multi-agent framework compared in design space discussion" 447 }, 448 { 449 "title": "APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets", 450 "relevance": "Public single-turn tool-use benchmark used for evaluation" 451 }, 452 { 453 "title": "DSPy: Compiling Declarative Language Model Calls into State-of-the-Art Pipelines", 454 "relevance": "Prior prompt optimization framework compared in related work" 455 }, 456 { 457 "title": "MCP-Zero: Proactive Toolchain Construction for LLM Agents from Scratch", 458 "relevance": "Prior tool discovery method for comparison in tool selection module design" 459 }, 460 { 461 "title": "StateFlow: Enhancing LLM Task-Solving through State-Driven Workflows", 462 "relevance": "Prior memory and state management approach in agent systems" 463 }, 464 { 465 "title": "A Survey on Large Language Model Based Autonomous Agents", 466 "relevance": "Survey paper covering agent architecture landscape that this work situates within" 467 }, 468 { 469 "title": "Recursively Summarizing Enables Long-Term Dialogue Memory in Large Language Models", 470 "relevance": "Prior hierarchical memory compression approach compared to Jenius memory module" 471 } 472 ], 473 "engagement_factors": { 474 "practical_relevance": { 475 "score": 2, 476 "justification": "The system is deployed in production with real users and reports concrete operational lessons, but the closed-source nature limits practitioner reuse." 477 }, 478 "surprise_contrarian": { 479 "score": 0, 480 "justification": "All findings confirm expected directional improvements from modular agent optimization; no surprising or counterintuitive results are highlighted." 481 }, 482 "fear_safety": { 483 "score": 0, 484 "justification": "Safety is mentioned in the prompt moderation layer but no AI risk concerns are raised as primary findings." 485 }, 486 "drama_conflict": { 487 "score": 0, 488 "justification": "No controversy or conflict with prior work; the paper positions itself as complementary to existing frameworks." 489 }, 490 "demo_ability": { 491 "score": 2, 492 "justification": "The system is live at jenius.cn and can be tried directly, though the evaluation benchmark is not accessible." 493 }, 494 "brand_recognition": { 495 "score": 0, 496 "justification": "Authors are from a small Chinese tech company (Tianju Dihe); no well-known lab affiliation." 497 } 498 }, 499 "hn_data": { 500 "threads": [], 501 "top_points": 0, 502 "total_points": 0, 503 "total_comments": 0 504 } 505 }