scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30267B)
      1 {
      2   "paper": {
      3     "title": "RAL2M: Retrieval Augmented Learning-To-Match Against Hallucination in Compliance-Guaranteed Service Systems",
      4     "authors": [
      5       "Mengze Hong",
      6       "Di Jiang",
      7       "Jiangtao Wen",
      8       "Zhiyang Su",
      9       "Yawen Li",
     10       "Yanjie Sun",
     11       "Guan Wang",
     12       "Chen Jason Zhang"
     13     ],
     14     "year": 2026,
     15     "venue": "arXiv.org",
     16     "arxiv_id": "2601.02917",
     17     "doi": "10.48550/arXiv.2601.02917"
     18   },
     19   "scan_version": 2,
     20   "active_modules": ["experimental_rigor", "data_leakage"],
     21   "methodology_tags": ["benchmark-eval"],
     22   "key_findings": "RAL2M repositions LLMs as query-response matching judges rather than generators, eliminating generation hallucination by design. The proposed query-adaptive latent ensemble achieves 70.7% accuracy and 13.9% hallucination rate on a multi-domain QA benchmark, substantially outperforming majority voting (60.2%/49.2%), weighted voting, and a neural ensemble baseline. Analysis reveals heterogeneous inter-model dependencies (Mistral largely uncorrelated with other models) and that 98.5% of samples have at least one correct judge, suggesting the core challenge is recovering minority-correct judgments rather than simply adding more models.",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Section 5.1 footnote states 'Data and code are publicly available at GitHub Repo' with what appears to be a hyperlinked reference to a repository."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The English benchmark is claimed to be publicly available (Section 5.1 footnote). The five underlying QA datasets (HotpotQA, MS MARCO, CovidQA, ExpertQA, HAGRID) are all public. The Chinese dataset 'will be publicly released upon acceptance' (Appendix D footnote), so it does not count as released, but the main English dataset does."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper mentions 'single NVIDIA L20 GPU with 48GB of memory' and specific model names, but no requirements.txt, Dockerfile, or dependency specification is provided in the paper."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No step-by-step reproduction instructions are included in the paper. Hyperparameters are listed (Appendix A.3) and the prompt is given (Appendix A.4), but no commands, scripts, or README-style instructions for replicating the experiments are provided."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Despite running 5 seeds per ensemble model, Tables 3, 6, 7, and 10 report only point estimates with no confidence intervals, error bars, or ± notation."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper claims 'significantly outperforms strong baselines' (abstract) but no statistical significance tests (p-values, t-tests, etc.) are reported anywhere. All comparisons are based on raw number differences."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Results are reported with baseline context: 70.7% vs 64.2% (neural model), 60.2% (majority vote). Section 6.4.1 states '12.6% accuracy gain and a 26.4% hallucination reduction' from 10% to 90% data scaling. Sufficient context to assess magnitude."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The dataset contains 82,606 queries but no justification is given for this sample size. No power analysis or discussion of whether the sample is sufficient for the claims made."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "Section 5.4 states 'Each ensemble model was run with five different seeds and reported the average performance for robustness,' but no standard deviation, variance, or spread measure is reported in any results table."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Table 3 compares against multiple baselines: BGE retrieval (with keyphrase and intent augmentation), single LLM inference (average and GPT-5), LLM debate, majority voting, weighted voting, and a neural ensemble model."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Baselines include recent methods: FrugalGPT (2024), AutoMix (2024), LLM-TOPLA (2024), LLM-Blender (2023), GPT-5 for ablation. Table 1 surveys methods from 2023-2025."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Section 6.4 presents ablation studies: training data size scaling (Table 6, from 10% to 90%), and ensemble size scaling (Table 7, from 2 to 5 models). The neural model baseline also serves as an ablation isolating the latent dependency modeling contribution."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Four metrics are reported: Accuracy, Hallucination Rate (FPR), Precision, and F1 Score (Table 3, Appendix A.1)."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "Human annotators were used for data quality (Appendix B) but not for evaluating system outputs. All evaluation of matching decisions is automated against ground-truth labels."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 5.1 states 'the dataset is split 70% for training and 30% for testing.' Results are reported on the test split."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Figure 3 shows per-dataset accuracy and hallucination rate across all five QA datasets (CovidQA, ExpertQA, HAGRID, HotpotQA, MS MARCO). Table 4 breaks down single LLM performance per model."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 6.1 discusses LLM debate's excessive rejection (1% hallucination but extremely low F1). Section 6.2 notes retrieval augmentation can 'slightly distract models' (Qwen drops from 63.9% to 61.1%). Appendix E's toy example illustrates majority voting failure. Section 6.3 analyzes the 40% of samples where majority vote fails."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Intent augmentation 'biases the system toward accepting most candidates, resulting in extremely high hallucination rates' (98.8%, Table 3). LLM debate yields 'extremely low F1 scores due to excessive rejection.' Retrieval augmentation hurts Qwen performance (Table 4). GPT-5 'does not outperform smaller models' (Section 6.4)."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Abstract claims: 'eliminates generation hallucination' (supported by design — no open-ended generation), 'significantly outperforms strong baselines' (supported numerically by Table 3, 70.7% vs 64.2% next-best, though no statistical significance test is reported). The 'wisdom of the crowd' claim is supported by Figure 4 analysis."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Causal claims about component contributions (latent variables, query-adaptive weighting, dependency modeling) are supported by controlled ablation studies: training data scaling (Table 6), ensemble size with/without the proposed method (Table 7), and comparison against the neural model that lacks dependency modeling."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper's title targets 'Compliance-Guaranteed Service Systems' and its motivation cites healthcare and finance (Section 1), but it evaluates on general QA benchmarks (HotpotQA, MS MARCO, CovidQA, ExpertQA, HAGRID). The Chinese financial dataset is closer to the compliance domain but is proprietary and not the main evaluation. The compliance-guaranteed framing extends substantially beyond what general QA matching benchmarks demonstrate."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper does not discuss alternative explanations for the observed improvements. Possible confounds include the latent model having more parameters than baselines, training data distribution effects, or model selection effects. No robustness checks against alternative hypotheses are provided."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper measures matching accuracy and hallucination rate (defined as FPR in Appendix A.1) and reports them as such. The empirical measurements are precisely described and match the granularity of the empirical claims. The 'compliance-guaranteed' framing is about the system architecture (retrieval-based, no generation), not about inflating what the metrics measure."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section 5.4 and footnote 2 specify: 'Qwen2.5-7B, Mistral-7B-v0.2, Llama-3.1-8b, Gemma-2-9b, GLM-4-9b, and GPT-5.' The encoder is 'bge-large-en-v1.5.' These include version numbers and parameter counts."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Appendix A.4 provides the full judgment prompt text used for all LLM inferences, including the exact instruction text, role description, and output format specification."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Appendix A.3 reports: hidden dimension 512, dropout 0.3, learning rate 1e-3, 256 MC samples for training, 1024 for evaluation, 10 VI iterations for training, 60 for evaluation. Section 5.4 states temperature 0 for all LLMs."
    162       },
    163       "scaffolding_described": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "No agentic scaffolding is used. The system is a retrieval pipeline with direct LLM inference for binary judgments, followed by ensemble aggregation."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 5.1 describes query generation via 'intention-enhanced similar-question generation' under strict and relaxed constraints, LLM classification by two judges, filtering by five human annotators, yielding 82,606 queries. Table 2 shows per-dataset composition. Appendix B describes annotator training."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 8 ('Limitations') provides substantive discussion of two specific limitations: reliance on knowledge base completeness and computational cost of the LLM ensemble."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 8 discusses specific threats: (1) 'performance is directly constrained by the completeness and accuracy of this knowledge base' — the system cannot handle novel topics not in D1; (2) 'requires multiple LLM inferences per user query, which can be more computationally expensive.' These are specific to this approach, not generic."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "The limitations discuss method constraints (cannot synthesize new information, needs KB updates for evolving topics) but do not explicitly state what the experimental results do NOT show — e.g., no statement about generalization limits to untested domains, real-world deployment conditions, or adversarial queries."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 5.1 footnote states 'Data and code are publicly available at GitHub Repo.' The underlying five QA datasets (HotpotQA, MS MARCO, CovidQA, ExpertQA, HAGRID) are all publicly available."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 5.1 describes: 5 public QA collections extended to 10,020 QA pairs and 47,975 document chunks; query generation using intention-enhanced similar-question generation; LLM classification by two judges followed by five human annotators; 82,606 total queries with balanced positive/negative classes."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": false,
    204         "answer": false,
    205         "justification": "No human participants as study subjects. The data source is standard public benchmarks (HotpotQA, MS MARCO, etc.). Human annotators were recruited for data quality (Appendix B describes their qualifications) but are not study participants."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Section 5.1 documents the pipeline: 5 public QA collections → 10,020 QA pairs + 47,975 document chunks → query generation (aligned + misaligned) → LLM classification + human filtering → 82,606 queries → 70/30 train/test split. Distinct-N metrics (D5: 89.2%, D7: 73.6%) verify diversity."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding source, acknowledgments section, or grant information is disclosed anywhere in the paper."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Author affiliations are clearly listed: Hong Kong Polytechnic University, NYU Shanghai, HKUST, Beijing University of Posts and Telecommunications. All are academic institutions."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No funding is disclosed, so independence cannot be assessed. The paper uses open-source LLMs and does not evaluate any sponsor's product, but without funding disclosure the criterion is not satisfied."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests statement, patent disclosure, or financial interests declaration is present in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No training data cutoff dates are stated for any of the five LLMs used (Qwen2.5-7B, Mistral-7B-v0.2, Llama-3.1-8b, Gemma-2-9b, GLM-4-9b). These models likely have training data that includes the public QA datasets used."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No discussion of whether the LLMs may have seen HotpotQA (2018), MS MARCO (2016), CovidQA (2020), ExpertQA (2024), or HAGRID (2023) answers during pre-training. This is a significant risk since several datasets predate the models' training."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "The benchmark is constructed from public QA datasets published between 2016 and 2024, all likely within the training data of models released in 2024-2025. No contamination analysis or mitigation is discussed."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants as study subjects. Human annotators were used for data quality but are not study participants."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants as study subjects."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants as study subjects."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants as study subjects."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants as study subjects."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants as study subjects."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants as study subjects."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "Section 6.1 mentions single LLM inference processes 'approximately 23 queries per second,' but no latency or cost figures are provided for the full proposed ensemble method, which requires 5 LLM inferences plus the latent model per query."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "Section 5.4 states 'a single NVIDIA L20 GPU with 48GB of memory' but provides no total training time, GPU hours, or computational budget for the experiments."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": true,
    305         "justification": "Section 5.4 states 'Each ensemble model was run with five different seeds and reported the average performance for robustness.' Results are averaged across 5 seeds, though per-seed variation is not shown."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "Section 5.4 explicitly states 'five different seeds' for each ensemble model."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "Appendix A.3 states 'Hyperparameter tuning was performed to identify optimal configurations' and lists final values, but does not report the number of configurations tried, the search method, or computational cost of the search."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "Appendix A.3 lists the final hyperparameter values but does not describe how they were selected — no mention of validation set performance, selection criterion, or whether all configurations were evaluated."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The paper compares against 8+ baselines across multiple metrics and datasets without any statistical tests or correction for multiple comparisons."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The authors implement their own baselines (neural ensemble model, voting methods) and compare against their proposed method without acknowledging the inherent bias of evaluating one's own system against self-implemented baselines."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The proposed method requires 5 LLM inferences plus ensemble model inference per query, while single LLM baselines require only one inference. Table 7 varies ensemble size but does not report corresponding compute costs. The compute difference is acknowledged only qualitatively in limitations."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "The benchmark uses synthetically generated queries to evaluate matching accuracy, but no discussion of whether synthetic query matching performance transfers to real-world compliance system performance. No analysis of construct validity — whether the benchmark measures what the paper claims to evaluate."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": false,
    344         "answer": false,
    345         "justification": "No scaffolding is involved. All LLMs use the same zero-shot prompt and inference pipeline. Comparisons are between ensemble strategies, not between different model scaffolds."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "The benchmark uses QA pairs from datasets created 2016-2024. The LLMs were released in 2024-2025 and likely trained on data including these QA pairs. No discussion of temporal leakage."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "The judgment prompt provides the full QA pair and supporting document to each LLM. No discussion of whether this setup leaks answer information or whether models' prior knowledge of these QA pairs biases their judgments."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "Queries are synthetically generated from the same QA pairs used in both training and testing the matching system. No discussion of structural similarity or independence between train and test examples."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, decontamination, or temporal splits are used."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "RAL2M eliminates generation hallucination by repositioning LLMs as query-response matching judges rather than generators.",
    374       "evidence": "This is a design claim supported by the system architecture (Section 1, Figure 1): the system returns only verified responses from a knowledge base or a safe fallback, with no open-ended generation.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "The proposed latent ensemble achieves 70.7% accuracy and 13.9% hallucination rate, outperforming all baselines including the neural ensemble (64.2%/32.5%) and majority voting (60.2%/49.2%).",
    379       "evidence": "Table 3 (Section 6.1) reports these figures. However, no statistical significance tests or error bars are provided for the 5-seed runs.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "98.5% of samples have at least one correct LLM judge, showing the core challenge is recovering minority-correct judgments.",
    384       "evidence": "Figure 4 (Section 6.3) shows the distribution of correct judges across samples, with 98.5% having ≥1, 90.6% having ≥2, and 60.2% having ≥3 correct judges.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Modeling inter-model dependencies is critical for ensemble performance — naive majority voting can amplify errors from correlated models.",
    389       "evidence": "Table 5 shows heterogeneous dependencies (Mistral uncorrelated with others, Gemma-Qwen strongly correlated). Table 7 shows majority voting degrades severely with 3 models (48.4% accuracy, 87.2% hallucination) while the proposed method improves steadily.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "The method generalizes across languages, achieving 95% accuracy on a Chinese financial services dataset.",
    394       "evidence": "Table 10 (Appendix D) shows 95% accuracy, 94% recall, and 95% F1 on the Chinese dataset, outperforming all baselines. However, this is a single proprietary dataset with potentially different characteristics (Appendix D notes 'lower overall complexity').",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "GPT-5 does not outperform smaller open-source models at single-LLM judgment.",
    399       "evidence": "Table 3 shows GPT-5 at 55.7% accuracy (61.3% hallucination), comparable to the 55.5% LLM average. Section 6.4 notes 'judgment quality is only weakly correlated with model size.'",
    400       "supported": "moderate"
    401     }
    402   ],
    403   "red_flags": [
    404     {
    405       "flag": "No error bars despite multiple seeds",
    406       "detail": "The paper runs 5 seeds per ensemble model (Section 5.4) but reports only point estimates in all tables. With no variance information, readers cannot assess result stability or whether differences between methods are meaningful."
    407     },
    408     {
    409       "flag": "No statistical significance tests",
    410       "detail": "All comparative claims including 'significantly outperforms' (abstract) are based solely on comparing raw numbers. No p-values, confidence intervals, or statistical tests are provided for any comparison."
    411     },
    412     {
    413       "flag": "Synthetic evaluation may not reflect real usage",
    414       "detail": "All 82,606 queries are synthetically generated via LLM-based 'intention-enhanced similar-question generation' (Section 5.1). Real user queries in compliance systems may have different characteristics (ambiguity, domain jargon, adversarial intent) not captured by synthetic generation."
    415     },
    416     {
    417       "flag": "Contamination risk from public QA datasets",
    418       "detail": "The benchmark uses QA pairs from HotpotQA (2018), MS MARCO (2016), CovidQA (2020), ExpertQA (2024), and HAGRID (2023). The LLMs used were released in 2024-2025 and likely saw these QA pairs during pre-training, potentially biasing their matching judgments. No contamination analysis is performed."
    419     },
    420     {
    421       "flag": "Compliance framing exceeds experimental scope",
    422       "detail": "The paper frames itself as solving compliance-guaranteed service systems (title, introduction cites healthcare and finance regulations) but evaluates on general QA benchmarks. The gap between matching accuracy on HotpotQA/MS MARCO and real-world regulatory compliance is not addressed."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    428       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    429       "year": 2024,
    430       "relevance": "LLM cost optimization through cascading, directly compared as a baseline for LLM ensemble/routing strategies."
    431     },
    432     {
    433       "title": "AutoMix: Automatically mixing language models",
    434       "authors": ["Pranjal Aggarwal", "Aman Madaan", "Ankit Anand"],
    435       "year": 2024,
    436       "relevance": "Automated LLM mixing/routing method compared as a cascade baseline for multi-model strategies."
    437     },
    438     {
    439       "title": "LLM-Blender: Ensembling large language models with pairwise ranking and generative fusion",
    440       "authors": ["Dongfu Jiang", "Xiang Ren", "Bill Yuchen Lin"],
    441       "year": 2023,
    442       "relevance": "Foundational LLM ensemble method using pairwise ranking and fusion, directly compared in Table 1."
    443     },
    444     {
    445       "title": "Are more LLM calls all you need? towards the scaling properties of compound AI systems",
    446       "authors": ["Lingjiao Chen", "Jared Quincy Davis", "Boris Hanin"],
    447       "year": 2024,
    448       "relevance": "Studies scaling properties of compound AI systems with multiple LLM calls, directly relevant to ensemble and routing research."
    449     },
    450     {
    451       "title": "Harnessing multiple large language models: A survey on LLM ensemble",
    452       "authors": ["Zhijun Chen", "Jingzheng Li", "Pengpeng Chen"],
    453       "year": 2025,
    454       "arxiv_id": "2502.18036",
    455       "relevance": "Comprehensive survey on LLM ensemble methods covering strategies, architectures, and applications."
    456     },
    457     {
    458       "title": "ChatEval: Towards better LLM-based evaluators through multi-agent debate",
    459       "authors": ["Chi-Min Chan", "Weize Chen", "Yusheng Su"],
    460       "year": 2024,
    461       "relevance": "Multi-agent debate for LLM-based evaluation, used as the LLM debate baseline in this paper."
    462     },
    463     {
    464       "title": "Don't hallucinate, abstain: Identifying LLM knowledge gaps via multi-LLM collaboration",
    465       "authors": ["Shangbin Feng", "Weijia Shi", "Yike Wang"],
    466       "year": 2024,
    467       "relevance": "Multi-LLM approach to hallucination mitigation through abstention, directly relevant to the paper's hallucination reduction goals."
    468     },
    469     {
    470       "title": "PromptWizard: Optimizing prompts via task-aware, feedback-driven self-evolution",
    471       "authors": ["Eshaan Agarwal", "Raghav Magazine", "Joykirat Singh"],
    472       "year": 2025,
    473       "relevance": "Prompt optimization framework used in this paper's pipeline for judgment prompt tuning."
    474     },
    475     {
    476       "title": "JudgeBench: A benchmark for evaluating LLM-based judges",
    477       "authors": ["Sijun Tan", "Siyuan Zhuang", "Kyle Montgomery"],
    478       "year": 2025,
    479       "relevance": "Benchmark for evaluating LLM judges, directly relevant to using LLMs as judgment agents."
    480     },
    481     {
    482       "title": "LLM-TOPLA: Efficient LLM ensemble by maximising diversity",
    483       "authors": ["Selim Furkan Tekin", "Fatih Ilhan", "Tiansheng Huang"],
    484       "year": 2024,
    485       "relevance": "LLM ensemble method maximizing diversity, compared in Table 1 as a selection/regeneration baseline."
    486     },
    487     {
    488       "title": "Beyond majority voting: LLM aggregation by leveraging higher-order information",
    489       "authors": ["Rui Ai", "Yuqi Pan", "David Simchi-Levi"],
    490       "year": 2025,
    491       "arxiv_id": "2510.01499",
    492       "relevance": "Advanced LLM aggregation beyond majority voting, directly relevant to the ensemble aggregation challenge addressed in this paper."
    493     }
    494   ]
    495 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs