scan.json (27271B)
1 { 2 "paper": { 3 "title": "Local LLM Ensembles for Zero-shot Portuguese Named Entity Recognition", 4 "authors": [ 5 "João Lucas Luz Lima Sarcinelli", 6 "Diego Furtado Silva" 7 ], 8 "year": 2025, 9 "venue": "arXiv.org", 10 "arxiv_id": "2512.10043", 11 "doi": "10.48550/arXiv.2512.10043" 12 }, 13 "scan_version": 2, 14 "active_modules": ["experimental_rigor", "data_leakage"], 15 "methodology_tags": ["benchmark-eval"], 16 "key_findings": "A three-step ensemble pipeline (extraction, voting, disambiguation) using 5 locally-run LLMs (7-14B parameters) outperforms individual models on 4 of 5 Portuguese NER datasets in zero-shot settings. The voting step is critical to performance, while disambiguation can sometimes be simplified. Cross-dataset ensemble configurations generally transfer well, often matching or exceeding configurations selected on the target dataset, suggesting the pipeline is viable even without any annotated data for the target task.", 17 "checklist": { 18 "artifacts": { 19 "code_released": { 20 "applies": true, 21 "answer": true, 22 "justification": "GitHub repository provided in the abstract and Section 3: https://github.com/Joao-Luz/local-llm-ner-ensemble." 23 }, 24 "data_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "All five datasets used (HAREM, LeNER-Br, UlyssesNER-Br, GeoCorpus-2, MariNER) are publicly available with citations to their original publications in Table 1." 28 }, 29 "environment_specified": { 30 "applies": true, 31 "answer": false, 32 "justification": "No mention of Python version, CUDA version, GPU model, library versions, requirements.txt, or Dockerfile. The paper mentions 'consumer GPUs' in Section 7 but does not specify which hardware or software environment was used." 33 }, 34 "reproduction_instructions": { 35 "applies": true, 36 "answer": false, 37 "justification": "No step-by-step reproduction instructions are provided in the paper. 
A code repository is linked but the paper does not describe how to run the experiments." 38 } 39 }, 40 "statistical_methodology": { 41 "confidence_intervals_or_error_bars": { 42 "applies": true, 43 "answer": false, 44 "justification": "Table 3 reports only point estimates of micro-F1. Although Section 5.1 states results are averaged over three runs, no confidence intervals, error bars, or ± notation are provided." 45 }, 46 "significance_tests": { 47 "applies": true, 48 "answer": false, 49 "justification": "The paper claims ensembles outperform individual LLMs based on comparing F1 scores in Table 3 without any statistical significance tests (no p-values, t-tests, or other tests)." 50 }, 51 "effect_sizes_reported": { 52 "applies": true, 53 "answer": true, 54 "justification": "Table 3 reports absolute micro-F1 scores for both individual models and ensembles, providing sufficient context to assess the magnitude of improvement (e.g., ensemble 0.541 vs best individual 0.528 on LeNER-BR test)." 55 }, 56 "sample_size_justified": { 57 "applies": true, 58 "answer": false, 59 "justification": "The validation set size of 100 sentences per dataset (Section 4.3) is not justified via power analysis or other reasoning beyond 'simulating a low-resource scenario.'" 60 }, 61 "variance_reported": { 62 "applies": true, 63 "answer": false, 64 "justification": "Section 5.1 states 'the reported results are the average of three runs of each experiment' but no standard deviation, IQR, or other spread measure is reported anywhere. The reader cannot assess result stability." 65 } 66 }, 67 "evaluation_design": { 68 "baselines_included": { 69 "applies": true, 70 "answer": true, 71 "justification": "Table 3 includes individual LLM baselines (each model running the full pipeline alone) and a fully supervised RoBERTa model trained on full train splits." 
72 }, 73 "baselines_contemporary": { 74 "applies": true, 75 "answer": true, 76 "justification": "Baselines use contemporary models: LLaMA 3.1 (2024), Qwen2 (2024), Gemma 2 (2024), Phi3 (2024), Mistral v0.2 (2023), and RoBERTa. All are recent and appropriate." 77 }, 78 "ablation_study": { 79 "applies": true, 80 "answer": true, 81 "justification": "Section 5.2 and Table 4 present an ablation study removing the voting step and simplifying the disambiguation step, demonstrating the contribution of each pipeline component." 82 }, 83 "multiple_metrics": { 84 "applies": true, 85 "answer": false, 86 "justification": "Only micro-F1 is reported as the evaluation metric. No precision, recall, or other complementary metrics are provided." 87 }, 88 "human_evaluation": { 89 "applies": true, 90 "answer": false, 91 "justification": "Evaluation is entirely automated using micro-F1 against gold-standard annotations. No human evaluation of the NER outputs is included." 92 }, 93 "held_out_test_set": { 94 "applies": true, 95 "answer": true, 96 "justification": "Section 4.3 explicitly separates the train split (100-sentence sample for configuration selection) from the test split used for final evaluation. 'Testing is performed in the entire test set available for each dataset.'" 97 }, 98 "per_category_breakdown": { 99 "applies": true, 100 "answer": true, 101 "justification": "Table 3 provides per-dataset breakdowns across all 5 datasets. Individual model performance is also broken down per dataset." 102 }, 103 "failure_cases_discussed": { 104 "applies": true, 105 "answer": true, 106 "justification": "Section 5.1 discusses HAREM as the dataset where the ensemble underperformed individual models. Section 5.3 analyzes limitations of the heuristic selection method with Figure 3." 107 }, 108 "negative_results_reported": { 109 "applies": true, 110 "answer": true, 111 "justification": "HAREM underperformance is reported in Section 5.1. 
Section 5.3 shows the heuristic doesn't find the optimal configuration. The overall honest reporting of when ensembles fail to help is commendable." 112 } 113 }, 114 "claims_and_evidence": { 115 "abstract_claims_supported": { 116 "applies": true, 117 "answer": true, 118 "justification": "Abstract claims of outperforming individual LLMs in 4/5 datasets are supported by Table 3. The cross-dataset transferability claim is supported by Figure 4." 119 }, 120 "causal_claims_justified": { 121 "applies": true, 122 "answer": true, 123 "justification": "The paper makes causal claims through ablation (Section 5.2) that voting and disambiguation contribute to performance. The ablation design uses controlled removal of single components, which is adequate for these claims." 124 }, 125 "generalization_bounded": { 126 "applies": true, 127 "answer": false, 128 "justification": "The conclusion states 'our pipeline is not dependent on this language' without testing on any other language. The abstract frames contributions broadly ('scalable, low-resource, and zero-shot NER') based on only Portuguese data with 5 specific models." 129 }, 130 "alternative_explanations_discussed": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper offers hypotheses for why ensembles help on domain-specific datasets (Section 5.1) but does not systematically consider confounds or alternative explanations such as data characteristics, annotation quality differences across datasets, or model-specific biases." 134 }, 135 "proxy_outcome_distinction": { 136 "applies": true, 137 "answer": true, 138 "justification": "The paper measures micro-F1 on NER tasks and claims NER performance. The measurement matches the claim granularity — no proxy gap exists." 
139 } 140 }, 141 "setup_transparency": { 142 "model_versions_specified": { 143 "applies": true, 144 "answer": true, 145 "justification": "Table 2 footnotes link to specific HuggingFace models: Meta-Llama-3.1-8B-Instruct, Qwen2-7B-Instruct, gemma-2-9b-it, Phi3-Medium-128K-Instruct, Mistral-7B-Instruct-v0.2. These are specific versioned model identifiers." 146 }, 147 "prompts_provided": { 148 "applies": true, 149 "answer": false, 150 "justification": "The paper describes prompts in natural language only — e.g., 'zero-shot prompt strategy adapted from [22]', 'task definition entity type descriptions.' Actual prompt text is not provided in the paper. A code repository is linked which may contain the prompts, but the paper itself shows no prompt text." 151 }, 152 "hyperparameters_reported": { 153 "applies": true, 154 "answer": true, 155 "justification": "Section 4.3 reports temperature values for each model: t=0 for LLaMA3, Mistral, Phi3; t=1.0 for Gemma2; t=1.5 for Qwen2. Voting and disambiguation use t=0. The calibration procedure is described." 156 }, 157 "scaffolding_described": { 158 "applies": false, 159 "answer": false, 160 "justification": "No agentic scaffolding is used. The system is a three-step prompting pipeline (extraction, voting, disambiguation) without tools, retry logic, memory, or feedback mechanisms." 161 }, 162 "data_preprocessing_documented": { 163 "applies": true, 164 "answer": true, 165 "justification": "Section 3.1 describes entity validation (removing inconsistent types, matching to original sentence). Section 4.3 describes the 100-sentence sampling from train splits and the temperature calibration procedure using HAREM." 166 } 167 }, 168 "limitations_and_scope": { 169 "limitations_section_present": { 170 "applies": true, 171 "answer": true, 172 "justification": "Section 7 'Limitations' is a dedicated section discussing GPU memory requirements, processing time, and the computational burden of evaluating 7,000+ combinations." 
173 }, 174 "threats_to_validity_specific": { 175 "applies": true, 176 "answer": false, 177 "justification": "Section 7 discusses only practical/computational limitations (GPU memory, processing time). No specific threats to validity are discussed — e.g., the representativeness of the 100-sentence validation sample, the generalizability beyond Portuguese, or the stability of results across runs." 178 }, 179 "scope_boundaries_stated": { 180 "applies": true, 181 "answer": false, 182 "justification": "The paper does not explicitly state what the results do NOT show. The conclusion claims language independence ('our pipeline is not dependent on this language') without bounding this. No explicit statement of what populations, languages, model sizes, or settings are excluded from the claims." 183 } 184 }, 185 "data_integrity": { 186 "raw_data_available": { 187 "applies": true, 188 "answer": true, 189 "justification": "All five NER datasets are publicly available: HAREM [18], LeNER-Br [15], UlyssesNER-Br [2], GeoCorpus-2 [4], MariNER [19]. Each is cited with its original publication." 190 }, 191 "data_collection_described": { 192 "applies": true, 193 "answer": true, 194 "justification": "Section 4.1 and Table 1 describe the datasets used, including their domains, entity types, and provenance. Section 4.3 describes the sampling and splitting procedure." 195 }, 196 "recruitment_methods_described": { 197 "applies": false, 198 "answer": false, 199 "justification": "No human participants. All data comes from standard public NER benchmark datasets." 200 }, 201 "data_pipeline_documented": { 202 "applies": true, 203 "answer": true, 204 "justification": "The pipeline from data input to final evaluation is documented: dataset selection → 100-sentence sampling for validation → temperature calibration → model combination exhaustive search → test set evaluation. Entity validation steps described in Section 3.1." 
205 } 206 }, 207 "conflicts_of_interest": { 208 "funding_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "Section 8 discloses funding from CAPES (Coordenação de Aperfeiçoamento de Pessoal de Nível Superior – Brasil) with Finance Code 001." 212 }, 213 "affiliations_disclosed": { 214 "applies": true, 215 "answer": true, 216 "justification": "Authors are from Instituto de Ciências Matemáticas e Computação, Universidade de São Paulo. They do not evaluate their own products — all LLMs are from third parties (Meta, Google, Microsoft, Alibaba, Mistral AI)." 217 }, 218 "funder_independent_of_outcome": { 219 "applies": true, 220 "answer": true, 221 "justification": "CAPES is a Brazilian government agency for academic research funding with no financial stake in the specific LLM or NER outcomes." 222 }, 223 "financial_interests_declared": { 224 "applies": true, 225 "answer": false, 226 "justification": "No competing interests or financial interests statement is included in the paper." 227 } 228 }, 229 "contamination": { 230 "training_cutoff_stated": { 231 "applies": true, 232 "answer": false, 233 "justification": "No training data cutoff dates are stated for any of the five LLMs used (LLaMA 3.1, Qwen2, Gemma 2, Phi3, Mistral v0.2). The models are evaluated on NER benchmarks they may have seen during training." 234 }, 235 "train_test_overlap_discussed": { 236 "applies": true, 237 "answer": false, 238 "justification": "No discussion of whether the NER datasets (some published as early as 2006 for HAREM) appeared in any of the models' training data." 239 }, 240 "benchmark_contamination_addressed": { 241 "applies": true, 242 "answer": false, 243 "justification": "HAREM (2006), LeNER-Br (2018), GeoCorpus-2 (2020), and UlyssesNER-Br (2022) all predate the training of the models used. No contamination analysis or discussion is provided." 
244 } 245 }, 246 "human_studies": { 247 "pre_registered": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants in this study." 251 }, 252 "irb_or_ethics_approval": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study." 256 }, 257 "demographics_reported": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "inclusion_exclusion_criteria": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "randomization_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "blinding_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 }, 277 "attrition_reported": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this study." 281 } 282 }, 283 "cost_and_practicality": { 284 "inference_cost_reported": { 285 "applies": true, 286 "answer": false, 287 "justification": "Section 7 mentions 'high GPU memory requirements' and 'long processing times' but does not quantify inference cost, latency, tokens consumed, or wall-clock time for any experiment." 288 }, 289 "compute_budget_stated": { 290 "applies": true, 291 "answer": false, 292 "justification": "No GPU hours, hardware specifications, or total compute budget are stated. Section 7 notes '7,000 combinations' must be processed but does not report how long this took or what hardware was used." 293 } 294 }, 295 "experimental_rigor": { 296 "seed_sensitivity_reported": { 297 "applies": true, 298 "answer": false, 299 "justification": "Results are averaged over 3 runs (Section 5.1) but no standard deviation, variance, or sensitivity analysis across seeds is reported." 
300 }, 301 "number_of_runs_stated": { 302 "applies": true, 303 "answer": true, 304 "justification": "Section 5.1 explicitly states: 'the reported results are the average of three runs of each experiment.'" 305 }, 306 "hyperparameter_search_budget": { 307 "applies": true, 308 "answer": false, 309 "justification": "The search is exhaustive over 7,000+ model combinations (Section 7), but no compute budget (GPU hours, wall-clock time) for the search is reported." 310 }, 311 "best_config_selection_justified": { 312 "applies": true, 313 "answer": true, 314 "justification": "Section 3.4 describes selection of the best configuration by micro-F1 on a 100-sentence validation sample from the train split. The criterion and procedure are clearly stated." 315 }, 316 "multiple_comparison_correction": { 317 "applies": true, 318 "answer": false, 319 "justification": "The paper evaluates many model combinations across 5 datasets but applies no statistical tests, let alone multiple comparison corrections." 320 }, 321 "self_comparison_bias_addressed": { 322 "applies": true, 323 "answer": false, 324 "justification": "The authors propose and evaluate their own ensemble method against individual model baselines without acknowledging the bias of evaluating their own system." 325 }, 326 "compute_budget_vs_performance": { 327 "applies": true, 328 "answer": false, 329 "justification": "The ensemble uses multiple LLMs (3-5 models per step) with significantly more compute than individual model baselines. This cost-performance tradeoff is not analyzed. The supervised RoBERTa baseline uses different compute (full training) with no comparison." 330 }, 331 "benchmark_construct_validity": { 332 "applies": true, 333 "answer": false, 334 "justification": "No discussion of whether micro-F1 on these NER benchmarks actually measures the intended NER capability, or whether the benchmarks have construct validity issues." 
335 }, 336 "scaffold_confound_addressed": { 337 "applies": false, 338 "answer": false, 339 "justification": "No agentic scaffolding is involved. All models are evaluated within the same three-step prompting pipeline framework." 340 } 341 }, 342 "data_leakage": { 343 "temporal_leakage_addressed": { 344 "applies": true, 345 "answer": false, 346 "justification": "No discussion of temporal leakage. Several benchmarks (HAREM 2006, LeNER-Br 2018, GeoCorpus-2 2020) significantly predate the LLMs' training periods, creating contamination risk." 347 }, 348 "feature_leakage_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "No discussion of whether the evaluation setup leaks information. The zero-shot prompts provide entity type definitions which could vary in how much information they convey." 352 }, 353 "non_independence_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "No discussion of whether train and test splits share structural similarities (e.g., same documents, same domains) that could inflate results." 357 }, 358 "leakage_detection_method": { 359 "applies": true, 360 "answer": false, 361 "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, or decontamination analysis." 362 } 363 } 364 }, 365 "claims": [ 366 { 367 "claim": "The ensemble pipeline outperforms individual LLMs in 4 out of 5 Portuguese NER datasets.", 368 "evidence": "Table 3 shows ensemble configurations achieving higher micro-F1 than any individual model on LeNER-BR (0.541 vs 0.528), GeoCorpus2 (0.313 vs 0.262), UlyssesNER (0.353 vs 0.334), and MariNER (0.706 vs 0.699). 
HAREM is the exception where Gemma2 individually (0.609) outperforms the ensemble (0.591).", 369 "supported": "moderate" 370 }, 371 { 372 "claim": "The voting step is critical to ensemble performance.", 373 "evidence": "Table 4 (Section 5.2) shows consistent performance drops when voting is removed: HAREM -0.181, LeNER-BR -0.063, GeoCorpus2 -0.135, UlyssesNER -0.104, MariNER -0.085 in micro-F1.", 374 "supported": "strong" 375 }, 376 { 377 "claim": "Cross-dataset ensemble configurations generally outperform individual LLMs, potentially eliminating the need for annotated data.", 378 "evidence": "Figure 4 shows that ensembles obtained on different source datasets outperform individual LLMs for 4/5 target datasets. For HAREM and UlyssesNER, cross-dataset configurations even outperformed the heuristic-selected configuration.", 379 "supported": "moderate" 380 }, 381 { 382 "claim": "The heuristic for selecting model combinations may not find the optimal ensemble configuration.", 383 "evidence": "Section 5.3 and Figure 3 show that for HAREM, the overall best ensemble (Gemma2 extraction, Phi3 voting, LLaMA3 disambiguation) outperforms the heuristic-selected ensemble, indicating the heuristic is sensitive to validation set size and representativeness.", 384 "supported": "strong" 385 }, 386 { 387 "claim": "Individual models complement each other's knowledge gaps for domain-specific entity types.", 388 "evidence": "Section 5.1 hypothesizes that domain-specific datasets (GeoCorpus2 with 13 types, UlyssesNER with 16 types, LeNER-BR with 6 types) benefit more from ensembles because individual models struggle with specialized entities but complement each other. However, this is not tested directly.", 389 "supported": "weak" 390 } 391 ], 392 "red_flags": [ 393 { 394 "flag": "No error bars or variance across runs", 395 "detail": "Results are averaged over 3 runs but no standard deviation or confidence intervals are reported. 
With only 3 runs and no spread measures, the reader cannot assess whether differences between models/ensembles are meaningful or within noise." 396 }, 397 { 398 "flag": "No statistical significance testing", 399 "detail": "All comparative claims (ensemble vs individual models) are based on comparing point estimates without any statistical tests. Some differences are very small (e.g., MariNER ensemble 0.706 vs Gemma2 0.699) and may not be statistically significant." 400 }, 401 { 402 "flag": "No contamination analysis", 403 "detail": "Pre-trained LLMs are evaluated on NER benchmarks dating back to 2006 (HAREM). The models may have seen these datasets during pre-training, which would inflate zero-shot performance and undermine the paper's core premise." 404 }, 405 { 406 "flag": "Small validation set for configuration selection", 407 "detail": "The heuristic selects from 7,000+ configurations using only 100 sentences. Section 5.3 demonstrates this is unreliable (heuristic-selected configuration is suboptimal for HAREM), but no analysis is provided for other datasets." 408 }, 409 { 410 "flag": "Single evaluation metric", 411 "detail": "Only micro-F1 is reported. Precision and recall are not shown, making it impossible to understand whether ensembles improve coverage (recall) or accuracy (precision) or both." 412 } 413 ], 414 "cited_papers": [ 415 { 416 "title": "GPT-NER: Named Entity Recognition via Large Language Models", 417 "authors": ["S. Wang", "X. Sun", "X. Li", "R. Ouyang", "F. Wu", "T. Zhang", "J. Li", "G. Wang"], 418 "year": 2023, 419 "arxiv_id": "2304.10428", 420 "doi": "10.48550/arXiv.2304.10428", 421 "relevance": "Directly evaluates LLM capabilities for NER, showing they underperform specialized models — core motivation for the ensemble approach." 422 }, 423 { 424 "title": "LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion", 425 "authors": ["D. Jiang", "X. Ren", "B. Y. 
Lin"], 426 "year": 2023, 427 "arxiv_id": "2306.02561", 428 "doi": "10.48550/arXiv.2306.02561", 429 "relevance": "Key prior work on LLM ensembling through fusion-based methods, demonstrating that combining multiple LLMs can outperform individuals." 430 }, 431 { 432 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 433 "authors": ["L. Chen", "M. Zaharia", "J. Zou"], 434 "year": 2023, 435 "doi": "10.48550/arXiv.2305.05176", 436 "arxiv_id": "2305.05176", 437 "relevance": "Proposes cost-effective LLM usage through routing and cascading, directly relevant to LLM efficiency and routing strategies." 438 }, 439 { 440 "title": "LLM Chain Ensembles for Scalable and Accurate Data Annotation", 441 "authors": ["D. Farr", "N. Manzonelli", "I. Cruickshank", "K. Starbird", "J. West"], 442 "year": 2024, 443 "doi": "10.1109/BigData62323.2024.10826109", 444 "relevance": "Proposes cascading LLM chains for classification tasks, demonstrating routing and ensemble approaches for practical LLM deployment." 445 }, 446 { 447 "title": "Mixtral of Experts", 448 "authors": ["A. Q. Jiang", "A. Sablayrolles", "A. Roux"], 449 "year": 2024, 450 "arxiv_id": "2401.04088", 451 "relevance": "Mixture-of-experts LLM architecture that uses an ensemble of trainable weight subsets, relevant to LLM architectures and efficiency." 452 }, 453 { 454 "title": "Improving entity recognition using ensembles of deep learning and fine-tuned large language models: A case study on adverse event extraction from VAERS and social media", 455 "authors": ["Y. Li", "D. Viswaroopan", "W. He", "J. Li", "X. Zuo", "H. Xu", "C. Tao"], 456 "year": 2025, 457 "doi": "10.1016/j.jbi.2025.104789", 458 "relevance": "Uses LLM ensembles including GPT-3.5 for NER in the medical domain, showing ensemble voting outperforms individual models." 459 }, 460 { 461 "title": "Empirical Study of Zero-Shot NER with ChatGPT", 462 "authors": ["T. Xie", "Q. 
Li", "J. Zhang", "Y. Zhang", "Z. Liu", "H. Wang"], 463 "year": 2023, 464 "arxiv_id": "2310.10035", 465 "doi": "10.48550/arXiv.2310.10035", 466 "relevance": "Empirical evaluation of ChatGPT's zero-shot NER capabilities, providing the prompt strategy adapted by this paper." 467 }, 468 { 469 "title": "A Survey of Large Language Models", 470 "authors": ["W. X. Zhao", "K. Zhou", "J. Li"], 471 "year": 2025, 472 "arxiv_id": "2303.18223", 473 "doi": "10.48550/arXiv.2303.18223", 474 "relevance": "Comprehensive LLM survey covering in-context learning capabilities, referenced for establishing LLM baseline capabilities." 475 }, 476 { 477 "title": "The Llama 3 Herd of Models", 478 "authors": ["A. Grattafiori", "A. Dubey", "A. Jauhri"], 479 "year": 2024, 480 "arxiv_id": "2407.21783", 481 "doi": "10.48550/arXiv.2407.21783", 482 "relevance": "Technical report for one of the five open-weight LLMs evaluated, relevant to open-weight model capabilities." 483 }, 484 { 485 "title": "Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone", 486 "authors": ["M. Abdin", "J. Aneja", "H. Awadalla"], 487 "year": 2024, 488 "arxiv_id": "2404.14219", 489 "relevance": "Technical report for the Phi-3 model used in the ensemble, relevant to small/local LLM capabilities." 490 } 491 ] 492 }