ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (24187B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Jenius Agent: Towards Experience-Driven Accuracy Optimization in Real-World Scenarios",
      6     "authors": [
      7       "Defei Xia",
      8       "Bingfeng Pi",
      9       "Shenbin Zhang",
     10       "Song Hua",
     11       "Yunfei Wei"
     12     ],
     13     "year": 2026,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2601.01857",
     16     "doi": "10.48550/arXiv.2601.01857"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The abstract claims 'reduced tool invocation failures' but Task Failure Rate (TFR) on Jenius-bench actually increased from 0.0329 (Base) to 0.0753 (Jenius). The claim of 'reduced response latency' is never measured in the paper.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper conducts a progressive four-variant ablation (Base→B-P→B-PT→Jenius) isolating contributions of each module, providing adequate support for within-framework causal attribution.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper frames results as applicable to 'real-world scenarios' broadly, but the primary novel benchmark (Jenius-bench) is derived entirely from their own production system and domain; gains on APIGen are marginal (0.8150→0.8500).",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No alternative explanations are discussed; the paper does not consider whether gains could reflect Jenius-bench's construction bias toward their own system's tool ontology rather than genuine capability improvement.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "CRCFF evaluation uses LLM-as-judge (Qwen/DeepSeek) as a proxy for actual response quality, but this limitation — including potential self-serving evaluator bias — is not acknowledged.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated limitations or threats-to-validity section; limitations are briefly mentioned in one clause of the conclusion ('incomplete capture of hidden reasoning steps') without dedicated treatment.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No specific threats are discussed; the paper does not address Jenius-bench's construction from their own production logs (selection bias), the unspecified backbone LLM, or LLM evaluator self-serving bias.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not state what results do not show; it claims 'generalizability' without bounding that claim to its tested setting of one proprietary multi-turn benchmark and one public single-turn benchmark.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding disclosure is present anywhere in the paper.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All authors are identified as affiliated with Tianju Dihe (Suzhou) Technology Co., Ltd., the company that operates the Jenius system being evaluated.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "All authors are employees of the company whose commercial product (Jenius, deployed at jenius.cn) is the primary subject of evaluation — they have direct interest in positive results.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is included in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "The title's central term 'experience-driven' is never defined in the paper; the framework is described but the specific meaning of learning from 'experience' (vs. static design) is not operationalized.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper clearly states five contributions: system-level execution abstraction, modular optimization framework, task grounding improvements, an evaluation framework (4T + CRCFF), and experimental validation.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The related work section engages substantively with prior approaches in prompt engineering (DSPy, Reflect-Retry-Reward), tool selection (MCP-Zero, BioMedTools), and memory management (StateFlow, Recursively Summarizing), noting specific gaps each leaves.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No source code is released; the system is deployed at jenius.cn as a commercial product but no framework code or implementation is made available.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "Jenius-bench (850 samples, 38 categories) is described as novel but is not publicly released; it contains 'real user-agent interactions' from their production system.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No requirements file, Dockerfile, or dependency specification is provided; the paper mentions Alibaba Cloud and Kubernetes infrastructure but gives no reproducible environment spec.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are provided; the benchmark is proprietary and the agent LLM backbone is unspecified.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "No confidence intervals or error bars are reported for any results in Tables 2, 3, or 4.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are used despite comparative claims across four agent configurations on two benchmarks.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Effect sizes are reported as percentage improvements (e.g., B-P improves TCR by 16%, Jenius achieves 35% relative gain over Base, token reduction >60%) with baseline context.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "APIGen sample of 800 is justified only as 'computational tractability'; no power analysis or principled sample size determination is provided for either dataset.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or run-to-run variability is reported for any metric in the evaluation tables.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "A standard ReAct-style Base agent serves as the canonical baseline against which all three module additions are compared.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "The ReAct baseline is a well-established and appropriate contemporary baseline for agentic systems; the paper's goal is within-framework ablation rather than cross-system comparison.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "A four-configuration progressive ablation (Base, B-P, B-PT, Jenius) isolates the contribution of each module on both benchmarks.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Evaluation uses 4T execution fidelity metrics (TCR, TFR, TIR, TPS), five CRCFF output quality metrics, and token consumption, evaluated across two LLM judges.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "Output quality is evaluated solely by LLM-as-judge (Qwen-3 and DeepSeek); no human evaluation of system outputs is conducted.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": false,
    212           "justification": "Jenius-bench is derived from their own production system logs; there is no held-out test split described, and benchmark construction and system design are not cleanly separated.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": false,
    218           "justification": "Despite 38 tool categories in Jenius-bench and 21 in APIGen, no per-category performance breakdown is provided.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section 5.3 discusses concrete failure cases from deployment (spurious tool calls, inappropriate PPT generation, URL extraction failures, blind retry loops).",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": false,
    230           "justification": "The paper does not explicitly report that TFR worsened with B-P and Jenius compared to Base on Jenius-bench (0.0329→0.0859→0.0753); this is a negative result that is not highlighted or analyzed.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "The backbone LLM powering the agent itself is never specified anywhere in the paper — a critical omission that makes results unreproducible; only the embedding model (Qwen3 Embedding) and evaluators (Qwen-3, DeepSeek) are named.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "No actual prompts or system instructions are provided; the paper describes prompt generation principles but provides no concrete prompt templates or fill values.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "Penalty weights λ_w = λ_m = 1 and context summarization threshold K are mentioned but temperature, top-p, and other LLM hyperparameters are absent; M for top-M tool retrieval is referenced but not given.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The three-module architecture (adaptive prompt generation, tool orchestration, hierarchical memory) is described in substantial detail including algorithms, formulas, and data flow diagrams.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": false,
    262           "justification": "Jenius-bench's manual review process is described qualitatively but specific preprocessing steps, filtering criteria, and inter-annotator agreement metrics are not documented.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Neither Jenius-bench nor the raw evaluation results are publicly available; the benchmark is described as novel but not released.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Jenius-bench is described as derived from real user-agent interactions with manual review by domain experts; domains and tool category counts are specified.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "Jenius-bench is derived from production logs, not recruited participants; NA for standard benchmark evaluation context.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": false,
    288           "justification": "The paper describes that trajectories come from real interactions and undergo manual review but does not document the full pipeline from raw logs to final benchmark instances.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "The backbone LLM is never identified, making it impossible to state its training cutoff; this omission prevents any contamination analysis.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No discussion of whether the unspecified agent LLM could have been trained on APIGen data (a public dataset with 60K samples) is included.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "APIGen is a public benchmark that could be in training data; contamination is not addressed despite comparative claims on this dataset.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human subjects experiment conducted; deployment usage data is observational.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No formal human subjects study; NA.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No formal human participants; geographic distribution of production users is reported but not in a human-subjects research context.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "NA — no human subjects experiment.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "NA — no human subjects experiment.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "NA — no human subjects experiment.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "NA — no human subjects experiment.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Token consumption (input + output tokens) is measured and reported for all four agent variants on both benchmarks as a proxy for inference cost.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No total computational budget, GPU hours, or monetary cost for running the experiments is stated.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Jenius achieves up to 35% relative gain in task completion rate over the base agent",
    375       "evidence": "TCR improves from 0.5659 (Base) to 0.7647 (Jenius) on Jenius-bench = 35.1% relative gain (Table 3)",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "The framework reduces token consumption by over 60%",
    380       "evidence": "Token usage drops from 9.27M (Base) to 3.65M (Jenius) on Jenius-bench, a 60.6% reduction (Figure 6)",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "The framework reduces tool invocation failures",
    385       "evidence": "TFR (task failure rate) actually increased from 0.0329 (Base) to 0.0753 (Jenius) on Jenius-bench; only TIR (partial completion) improved",
    386       "supported": "unsupported"
    387     },
    388     {
    389       "claim": "Adaptive prompt generation is the dominant contributor, improving TCR by 16%",
    390       "evidence": "B-P achieves TCR=0.7271 vs Base=0.5659 on Jenius-bench, a 16pp absolute gain per Table 3",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "The 4T + CRCFF evaluation framework captures execution-level failures that output-only metrics miss",
    395       "evidence": "Conceptually motivated by comparison with APIGen's limitations; not validated against external ground truth or human judgment",
    396       "supported": "weak"
    397     },
    398     {
    399       "claim": "The system improves response latency",
    400       "evidence": "Latency is never measured in the paper; only token counts are reported as a proxy",
    401       "supported": "unsupported"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "case-study"
    407   ],
    408   "key_findings": "Jenius-Agent achieves a 35% relative improvement in task completion rate over a base ReAct agent on the proprietary Jenius-bench dataset, driven primarily by adaptive prompt generation (+16pp TCR). Token consumption drops by over 60% with the full system (9.27M→3.65M on Jenius-bench). However, task failure rate (TFR) actually worsens with the full system (0.0329→0.0753), gains on the public APIGen benchmark are marginal (0.8150→0.8500), and the agent's backbone LLM is never identified while Jenius-bench is not released, making independent reproduction impossible.",
    409   "red_flags": [
    410     {
    411       "flag": "Unspecified backbone LLM",
    412       "detail": "The LLM powering the agent is never named or versioned anywhere in the paper, making results irreproducible and contamination analysis impossible."
    413     },
    414     {
    415       "flag": "Proprietary benchmark not released",
    416       "detail": "Jenius-bench is the primary evaluation benchmark but is derived from the authors' own production logs and not publicly released, preventing independent verification."
    417     },
    418     {
    419       "flag": "TFR worsened, not highlighted",
    420       "detail": "Task Failure Rate increases from 0.0329 (Base) to 0.0753 (Jenius) on Jenius-bench — the opposite of the abstract's claim of 'reduced tool invocation failures' — and this is not acknowledged as a negative result."
    421     },
    422     {
    423       "flag": "Self-evaluation on own system",
    424       "detail": "All authors are employees of the company whose commercial product is being evaluated; Jenius-bench is constructed from that same system's production logs, creating circular self-validation."
    425     },
    426     {
    427       "flag": "LLM-as-judge self-serving risk",
    428       "detail": "Output quality is evaluated by Qwen-3 and DeepSeek, which may belong to the same model family as the unspecified agent LLM, introducing potential evaluator-generator alignment bias."
    429     },
    430     {
    431       "flag": "No statistical significance testing",
    432       "detail": "All comparative claims are made without confidence intervals, error bars, or significance tests for any reported metric on either benchmark."
    433     },
    434     {
    435       "flag": "Claim without measurement",
    436       "detail": "The abstract claims 'reduced response latency' but no latency measurements are reported anywhere in the paper."
    437     }
    438   ],
    439   "cited_papers": [
    440     {
    441       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    442       "relevance": "Foundational baseline agent framework; the paper's Base agent implements ReAct-style observe-think-act loop"
    443     },
    444     {
    445       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversations",
    446       "relevance": "Prior multi-agent framework compared in design space discussion"
    447     },
    448     {
    449       "title": "APIGen: Automated Pipeline for Generating Verifiable and Diverse Function-Calling Datasets",
    450       "relevance": "Public single-turn tool-use benchmark used for evaluation"
    451     },
    452     {
    453       "title": "DSPy: Compiling Declarative Language Model Calls into State-of-the-Art Pipelines",
    454       "relevance": "Prior prompt optimization framework compared in related work"
    455     },
    456     {
    457       "title": "MCP-Zero: Proactive Toolchain Construction for LLM Agents from Scratch",
    458       "relevance": "Prior tool discovery method for comparison in tool selection module design"
    459     },
    460     {
    461       "title": "StateFlow: Enhancing LLM Task-Solving through State-Driven Workflows",
    462       "relevance": "Prior memory and state management approach in agent systems"
    463     },
    464     {
    465       "title": "A Survey on Large Language Model Based Autonomous Agents",
    466       "relevance": "Survey paper covering agent architecture landscape that this work situates within"
    467     },
    468     {
    469       "title": "Recursively Summarizing Enables Long-Term Dialogue Memory in Large Language Models",
    470       "relevance": "Prior hierarchical memory compression approach compared to Jenius memory module"
    471     }
    472   ],
    473   "engagement_factors": {
    474     "practical_relevance": {
    475       "score": 2,
    476       "justification": "The system is deployed in production with real users and reports concrete operational lessons, but the closed-source nature limits practitioner reuse."
    477     },
    478     "surprise_contrarian": {
    479       "score": 0,
    480       "justification": "The paper frames all findings as expected directional improvements from modular agent optimization; no surprising or counterintuitive results are highlighted, and the TFR regression on Jenius-bench goes unremarked."
    481     },
    482     "fear_safety": {
    483       "score": 0,
    484       "justification": "Safety is mentioned in the prompt moderation layer but no AI risk concerns are raised as primary findings."
    485     },
    486     "drama_conflict": {
    487       "score": 0,
    488       "justification": "No controversy or conflict with prior work; the paper positions itself as complementary to existing frameworks."
    489     },
    490     "demo_ability": {
    491       "score": 2,
    492       "justification": "The system is live at jenius.cn and can be tried directly, though the evaluation benchmark is not accessible."
    493     },
    494     "brand_recognition": {
    495       "score": 0,
    496       "justification": "Authors are from a small Chinese tech company (Tianju Dihe); no well-known lab affiliation."
    497     }
    498   },
    499   "hn_data": {
    500     "threads": [],
    501     "top_points": 0,
    502     "total_points": 0,
    503     "total_comments": 0
    504   }
    505 }

Impressum · Datenschutz