ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28593B)


      1 {
      2   "paper": {
      3     "title": "PersonaDual: Balancing Personalization and Objectivity via Adaptive Reasoning",
      4     "authors": [
      5       "Xiaoyou Liu",
      6       "Xinyi Mou",
      7       "Shengbin Yue",
      8       "Liang Wang",
      9       "Yuqing Wang",
     10       "Qiexiang Wang",
     11       "Tianrui Qin",
     12       "Wangchunshu Zhou",
     13       "Zhongyu Wei"
     14     ],
     15     "year": 2026,
     16     "venue": "arXiv",
     17     "arxiv_id": "2601.08679",
     18     "doi": "10.48550/arXiv.2601.08679"
     19   },
     20   "scan_version": 2,
     21   "active_modules": ["experimental_rigor", "data_leakage"],
     22   "methodology_tags": ["benchmark-eval"],
     23   "key_findings": "PersonaDual adaptively switches between objective and personalized reasoning modes via prefix-controlled generation, trained with SFT then DualGRPO reinforcement learning. Under unaligned personas, it achieves 54.0% objective accuracy, close to the 54.7% no-persona upper bound. Under aligned personas, it exceeds the upper bound by 2.8 percentage points (57.5% vs. 54.7%). The framework generalizes across Qwen3-8B and Llama-3.1-8B backbones, and the DualGRPO components (forced prefix sampling + dual-mode advantage) are shown to be tightly coupled—removing either degrades performance.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The custom training dataset PersonaDualData (8,000 SFT + 9,998 RL examples) is described but no download link is provided. Evaluation uses public benchmarks, but the constructed training data is not released."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Table 6 mentions '8 NVIDIA A800', 'bfloat16', and 'DeepSpeed ZeRO-3' but provides no library versions, requirements.txt, or environment setup sufficient to recreate the software environment."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No reproduction instructions, README, or runnable scripts are provided."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Table 1 states 'Reported numbers are averaged over three runs' but no confidence intervals, error bars, or standard deviations are reported in any table or figure."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Claims like 'PersonaDual achieves the best overall performance' are based solely on comparing point estimates. No statistical significance tests (t-tests, bootstrap, etc.) are applied."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The paper consistently reports improvements with baseline context, e.g., '54.0% vs. 54.7% no-persona upper bound' (Section 5.1), '2.8% over the no-personalization upper bound' (Section 5.1), and absolute numbers in Table 1 allowing effect size computation."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "Training dataset sizes (8,000 SFT, 9,998 RL) and benchmark sample sizes (500-1,500 per benchmark, Table 7) are stated but not justified. No power analysis or rationale for why these sizes are sufficient."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "Despite stating 'averaged over three runs' in Table 1, no standard deviation, variance, or spread measure is reported for any result."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 1 includes extensive baselines across three categories: general-purpose models (Qwen3-8B, Llama-3.1-8B, CoT, G-SFT-RL), personalization-oriented models (Personal-Prompt, P-SFT-RL, ALIGNXPERT-ICA, ALIGNXPERT-PBA), and dual-mode models (PersonaDual-Prompt, PersonaDual-Router)."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Baselines include Qwen3-8B (Yang et al., 2025), Llama-3.1-8B (Dubey et al., 2024), DeepSeek-R1 (Guo et al., 2025), and ALIGNXPERT (Li et al., 2025a), all published 2024-2025."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Table 2 presents stage-wise ablation (SFT vs. full RL) and component ablation (w/o DualAdv, w/o DualAdv + PfxSmp), demonstrating each component's contribution."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "All benchmarks are evaluated using a single metric: accuracy. TriviaQA uses exact match and PubMedQA/MATH500 use GPT-4o-mini judge, but the final reported metric is always accuracy. No complementary metrics (e.g., F1, calibration, latency) are reported."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No human evaluation is conducted. All evaluation is automated—MCQ accuracy, exact match, or GPT-4o-mini as judge. For a paper about personalization quality, human judgment would be relevant."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Evaluation uses standard benchmark test sets. Table 5 footnote states 'samples overlapping with PubMedQA are filtered out from UltraMedical, and samples overlapping with TriviaQA are filtered out from FLAN' to prevent train-test leakage."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Table 1 provides per-benchmark breakdowns for all 5 objective and 2 personalized benchmarks. Table 3 provides per-task mode deviation ratios. Figure 3b shows mode proportions per setting."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 5.4 analyzes mode deviation failures in multi-turn dialogue, showing 84.3% mode-alignment rate when starting with general mode. Table 3 breaks down deviation ratios by task type, with PubMedQA showing 68.6% deviation."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Section 5.4 reports mode alignment drops in multi-turn settings. Table 9 (Appendix D) acknowledges performance degradation on MATH500 with the Llama backbone. The paper notes 'switching from objective to personalized reasoning is a more challenging scenario.'"
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Abstract claims 'near interference-free performance' (54.0% vs 54.7% upper bound, Table 1), 'improving objective QA accuracy by nearly 3%' (57.5% vs 54.7% = 2.8%, Table 1), and personalization improvements (77.3% personalized avg, Table 1) are all supported by the experimental results."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Causal claims like 'DualGRPO specifically enhances the model's ability to identify beneficial persona cues' (Section 5.2) are supported by controlled ablation experiments in Table 2, which remove individual components while holding others constant."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The Limitations section explicitly bounds scope: 'limited by the availability of persona-related benchmarks, and currently relies on PersonaFeedback and FSPO-roleplay' and 'mainly trained and evaluated in English.' Results are reported per-benchmark without overclaiming generality."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper does not discuss alternative explanations for the observed improvements. For example, whether the SFT data quality (expert model-generated trajectories) rather than the dual-mode architecture drives gains, or whether the aligned persona condition leaks answer-relevant information via GPT-4o-generated personas."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Claims are stated at the measurement level—accuracy on specific benchmarks—rather than mapping to broader abstract constructs. The paper says 'performance close to the no-persona setting' and 'improving objective QA accuracy by nearly 3%,' which match the granularity of what was measured."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Models are identified by marketing names only: 'Qwen3-8B-Instruct,' 'Llama-3.1-8B-Instruct,' 'DeepSeek-R1,' 'GPT-4o,' 'GPT-4o-mini.' No snapshot dates, API versions, or model checksums are provided."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "Prompts used for data generation with the expert model (DeepSeek-R1) and for baselines like Personal-Prompt are described only in natural language (e.g., 'instructed to generate reasoning steps grounded in factual evidence'). Actual prompt text is not provided."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Table 6 provides detailed hyperparameters for both SFT and RL stages: learning rates (5e-5, 1e-6), batch sizes (16, 64), epochs (1, 5), KL penalty (0.04), temperature (0.6), rollouts per sample (8), prefix advantage weight (2.0), and more."
    163       },
    164       "scaffolding_described": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "No agentic scaffolding is used. PersonaDual is a fine-tuned model with prefix-controlled mode switching, not an agentic system."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Appendix B.1 describes the data construction pipeline: gain-based selection for SFT mode assignment, filtering of overlapping samples with test benchmarks (Table 5 footnote), balanced persona conditions for RL data, and dataset composition (Table 5)."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "A dedicated 'Limitations' section is present after Section 6, discussing benchmark availability and language scope."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The Limitations section identifies study-specific threats: reliance on only two personalization benchmarks (PersonaFeedback and FSPO-roleplay) and English-only evaluation, both concrete to this work."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "The paper explicitly states what was NOT tested: 'Expanding to more diverse personalization benchmarks remains an important direction' and 'extending the framework to multilingual settings is left for future exploration.'"
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "The PersonaDualData training set and per-run experimental outputs are not available for verification. Only aggregated results are reported."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section 4 and Appendix B.1 describe data construction: sampling from UltraMedical, FLAN, and AlignX; expert model (DeepSeek-R1) generates reasoning trajectories; gain-based selection for mode assignment. Table 5 shows composition."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": false,
    205         "answer": false,
    206         "justification": "No human participants. Data sources are standard public benchmarks and datasets (UltraMedical, FLAN, AlignX, PersonaHub)."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Appendix B.1 documents the pipeline: source datasets → expert model generation → gain-based mode selection → overlap filtering → final SFT/RL splits. Table 5 shows counts per source and mode."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding source is mentioned anywhere in the paper. Authors are affiliated with Fudan University and OPPO, a major technology company, but no funding or grant acknowledgments are provided."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations are clearly listed: Fudan University, Shanghai Innovation Institute, and OPPO."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "Funding is not disclosed, so independence cannot be assessed. OPPO (a major tech company) is listed as an affiliation for four authors, creating a potential undisclosed conflict if OPPO funded or directed the work."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests or financial disclosures statement is present in the paper."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The paper fine-tunes Qwen3-8B-Instruct but does not state the training data cutoff date for the base model. This is needed to assess whether benchmark data was in pretraining."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "The paper filters its own SFT/RL training data to remove overlaps with PubMedQA and TriviaQA (Table 5 footnote), but does not discuss whether the Qwen3-8B base model's pretraining data overlaps with the evaluation benchmarks."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "MMLU-Pro, TriviaQA, PubMedQA, SuperGPQA, and MATH500 are all public benchmarks that may appear in Qwen3-8B's pretraining data. No contamination analysis is performed for the base model."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants. The Ethics Statements section confirms 'our experiments do not involve private user data.'"
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in this study."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No inference cost, latency, or tokens-per-query data is reported. The method uses 2n rollouts per sample during RL and dual-mode inference, but practical cost is not quantified."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "Table 6 states '8 NVIDIA A800' GPUs and DeepSpeed ZeRO-3 for both stages, but total GPU hours, wall-clock training time, and total compute budget are not reported."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Table 1 states 'averaged over three runs' but no per-seed results, standard deviations, or sensitivity analysis across seeds is provided. The reader cannot assess result stability."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "Table 1 explicitly states 'Reported numbers are averaged over three runs.'"
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Final hyperparameters are listed in Table 6, but no information is given about how they were selected—no search method, number of configurations tried, or compute spent on tuning."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "No description of how the final hyperparameter configuration was selected. Only the final values are reported without justification."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The paper makes many comparative claims across 7 benchmarks and 10+ baselines without any significance tests, let alone multiple comparison corrections."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "Multiple baselines (CoT, G-SFT-RL, Personal-Prompt, P-SFT-RL, PersonaDual-Prompt, PersonaDual-Router) are constructed or trained by the authors. No acknowledgment of the bias in evaluating one's own system against one's own baseline implementations."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "PersonaDual uses two-stage training (SFT + 5 epochs RL) while some baselines use only one stage. Performance is not compared at matched compute budgets, making it unclear how much improvement comes from the method vs. additional training."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "No discussion of whether the evaluation benchmarks (MMLU-Pro, TriviaQA, etc.) actually measure 'objectivity' or whether PersonaFeedback/FSPO-roleplay adequately capture 'personalization.' The Limitations section notes limited personalization benchmarks but does not discuss construct validity."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "No scaffolding is involved. All models are compared as fine-tuned or prompted models running directly on the same benchmarks."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether Qwen3-8B's pretraining data temporally overlaps with the evaluation benchmarks. Several benchmarks (TriviaQA 2017, PubMedQA 2019, MATH500 2021) predate the model and could be in training data."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "Aligned personas are 'generated by GPT-4o based on the question content' (Section 4). This means the persona encodes question-relevant information by design, but the paper does not discuss whether this leaks answer information beyond legitimate personalization context."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "The paper filters SFT/RL training data for overlaps with specific test sets (Table 5 footnote) but does not address independence between the base model's pretraining data and evaluation benchmarks."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": true,
    368         "justification": "Table 5 footnote describes a concrete prevention method: 'samples overlapping with PubMedQA are filtered out from UltraMedical, and samples overlapping with TriviaQA are filtered out from FLAN.' This is an overlap removal pipeline applied to the fine-tuning data."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "PersonaDual achieves near interference-free performance under unaligned personas, with 54.0% average objective accuracy vs. the 54.7% no-persona upper bound.",
    375       "evidence": "Table 1 shows PersonaDual's unaligned objective accuracy is 0.540 vs. no-persona upper bound of 0.547 across 5 benchmarks (Section 5.1, RQ1).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "PersonaDual improves objective QA accuracy by ~2.8 percentage points over the no-persona baseline when personas are aligned with the question.",
    380       "evidence": "Table 1 shows PersonaDual's aligned objective accuracy of 0.575 vs. 0.547 upper bound (Section 5.1, RQ1). Averaged over 3 runs.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "PersonaDual achieves balanced excellence across both general and personalized reasoning, outperforming specialized models in both domains.",
    385       "evidence": "Table 1: PersonaDual scores 0.540/0.575 on objective (exceeding G-SFT-RL's 0.527/0.559) and 0.773 on personalized (beating P-SFT-RL's 0.750) (Section 5.1, RQ2).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "DualGRPO's forced prefix sampling and dual-mode advantage decomposition are tightly coupled—removing either degrades performance.",
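    390       "evidence": "Table 2 ablation: full PersonaDual (54.0/57.5/77.2) outperforms both w/o DualAdv (53.3/55.9/75.5) and w/o DualAdv + PfxSmp (53.6/56.1/76.9); removing DualAdv alone scores below removing both components, consistent with the two being coupled. Section 5.2.",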
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "PersonaDual learns context-aware mode selection through training, resulting in more appropriate switching than prompt-based or router-based approaches.",
    395       "evidence": "Figure 3b shows PersonaDual selects general mode 62% of the time under unaligned personas on objective tasks, vs. 0% for PersonaDual-Prompt and 65.7% for PersonaDual-Router (Section 5.1, RQ3).",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "Personalized information generally reduces accuracy on objective QA (up to 8% drop) but improves personalized QA by 10-20%.",
    400       "evidence": "Figure 1 and Table 4 show performance drops on PubMedQA/TriviaQA with unaligned personas and large gains on PersonaFeedback/FSPO-roleplay with aligned personas (Section 1, Appendix A).",
    401       "supported": "strong"
    402     },
    403     {
    404       "claim": "PersonaDual generalizes to Llama-3.1-8B-Instruct as an alternative backbone.",
    405       "evidence": "Table 9 (Appendix D) shows PersonaDual achieves best overall performance (0.426/0.445 objective, 0.745 personalized) among all Llama-based variants.",
    406       "supported": "moderate"
    407     }
    408   ],
    409   "red_flags": [
    410     {
    411       "flag": "No variance despite multiple runs",
    412       "detail": "Results are stated as 'averaged over three runs' but no standard deviations or error bars are reported. Without variance, the reader cannot assess whether observed differences (e.g., 54.0% vs. 53.6%) are within noise."
    413     },
    414     {
    415       "flag": "Aligned persona may leak answer information",
    416       "detail": "Aligned personas are 'generated by GPT-4o based on the question content' (Section 4). A persona relevant to a PubMedQA question about cardiology might encode domain cues that help answer the question, confounding personalization benefit with information leakage."
    417     },
    418     {
    419       "flag": "All baselines author-implemented",
    420       "detail": "CoT, G-SFT-RL, Personal-Prompt, P-SFT-RL, PersonaDual-Prompt, and PersonaDual-Router are all constructed or trained by the authors. No independent baseline implementations are used, risking systematic underperformance of baselines (Lucic et al., 2018)."
    421     },
    422     {
    423       "flag": "Undisclosed corporate funding/conflict",
    424       "detail": "Four authors are affiliated with OPPO (a major tech company) but no funding sources or competing interests are declared. The corporate-academic collaboration has no transparency about financial arrangements or editorial independence."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "GPT-4 technical report",
    430       "authors": ["Josh Achiam"],
    431       "year": 2023,
    432       "arxiv_id": "2303.08774",
    433       "relevance": "Foundational LLM capability paper; PersonaDual uses GPT-4o for aligned persona generation and GPT-4o-mini for evaluation."
    434     },
    435     {
    436       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    437       "authors": ["Daya Guo"],
    438       "year": 2025,
    439       "arxiv_id": "2501.12948",
    440       "relevance": "Used as the expert model for generating reasoning trajectories in PersonaDual's training data construction."
    441     },
    442     {
    443       "title": "The Llama 3 Herd of Models",
    444       "authors": ["Aaron Grattafiori"],
    445       "year": 2024,
    446       "arxiv_id": "2407.21783",
    447       "relevance": "Llama-3.1-8B-Instruct used as both baseline and alternative backbone for PersonaDual generalization experiments."
    448     },
    449     {
    450       "title": "MMLU-Pro: A More Robust and Challenging Multi-Task Language Understanding Benchmark",
    451       "authors": ["Yubo Wang"],
    452       "year": 2024,
    453       "relevance": "One of five objective evaluation benchmarks used to assess PersonaDual's reasoning capability."
    454     },
    455     {
    456       "title": "From 1,000,000 Users to Every User: Scaling Up Personalized Preference for User-Level Alignment",
    457       "authors": ["Jia-Nan Li"],
    458       "year": 2025,
    459       "arxiv_id": "2503.15463",
    460       "relevance": "ALIGNXPERT system used as personalization-oriented baseline; AlignX dataset used in PersonaDual's training data."
    461     },
    462     {
    463       "title": "Qwen3 Technical Report",
    464       "authors": ["An Yang"],
    465       "year": 2025,
    466       "arxiv_id": "2505.09388",
    467       "relevance": "Qwen3-8B-Instruct is PersonaDual's primary backbone model."
    468     },
    469     {
    470       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    471       "authors": ["Lingjiao Chen"],
    472       "year": 2023,
    473       "arxiv_id": "2305.05176",
    474       "relevance": "Related work on adaptive inference architectures for LLMs, cascaded invocation via hierarchical reasoning."
    475     },
    476     {
    477       "title": "Mixtral of Experts",
    478       "authors": ["Albert Q Jiang"],
    479       "year": 2024,
    480       "arxiv_id": "2401.04088",
    481       "relevance": "Related work on routing-based mixture architectures for adaptive LLM inference."
    482     },
    483     {
    484       "title": "Discovering Language Model Behaviors with Model-Written Evaluations",
    485       "authors": ["Ethan Perez"],
    486       "year": 2023,
    487       "relevance": "Cited for evidence that preference-aligned models may exhibit sycophancy, motivating PersonaDual's objectivity preservation."
    488     },
    489     {
    490       "title": "Simple Synthetic Data Reduces Sycophancy in Large Language Models",
    491       "authors": ["Jerry Wei"],
    492       "year": 2023,
    493       "arxiv_id": "2308.03958",
    494       "relevance": "Evidence that personalized information can override internal knowledge and produce personalization-induced hallucinations."
    495     },
    496     {
    497       "title": "Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities",
    498       "authors": ["Gheorghe Comanici"],
    499       "year": 2025,
    500       "arxiv_id": "2507.06261",
    501       "relevance": "Major LLM family cited for incorporating memory mechanisms for personalization across sessions."
    502     },
    503     {
    504       "title": "FSPO: Few-Shot Preference Optimization of Synthetic Preference Data in LLMs Elicits Effective Personalization to Real Users",
    505       "authors": ["Anikait Singh"],
    506       "year": 2025,
    507       "arxiv_id": "2502.19312",
    508       "relevance": "Source of the FSPO-roleplay evaluation benchmark used to assess personalization capability."
    509     }
    510   ]
    511 }

Impressum · Datenschutz