scan.json (26842B)
1 { 2 "paper": { 3 "title": "Adaptive Data Augmentation with Multi-armed Bandit: Sample-Efficient Embedding Calibration for Implicit Pattern Recognition", 4 "authors": [ 5 "Minxue Tang", 6 "Yangyang Yu", 7 "Aolin Ding", 8 "Maziyar Baran Pouyan", 9 "Taha Belkhouja", 10 "Yujia Bao" 11 ], 12 "year": 2026, 13 "venue": "arXiv preprint", 14 "arxiv_id": "2602.19385" 15 }, 16 "checklist": { 17 "artifacts": { 18 "code_released": { 19 "applies": true, 20 "answer": false, 21 "justification": "No GitHub link or code repository is provided anywhere in the paper or appendix. The paper describes a framework (ADAMAB) with non-trivial implementation details but releases no code." 22 }, 23 "data_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "All six datasets used (MultiWD, FQS, TREC, OxfordPets, Flowers102, CUB200) are publicly available benchmarks with cited sources and dataset links (e.g., CUB200 at data.caltech.edu). The paper uses these standard public benchmarks without modification." 27 }, 28 "environment_specified": { 29 "applies": true, 30 "answer": false, 31 "justification": "Appendix B states experiments ran on 'a single MacBook Pro with a single M4 Max chip and 36GB memory' and mentions Adam optimizer, but no requirements.txt, Dockerfile, or library version list is provided. The specific Python or library versions are not stated." 32 }, 33 "reproduction_instructions": { 34 "applies": true, 35 "answer": false, 36 "justification": "No step-by-step reproduction instructions or README are provided. The appendices describe training hyperparameters and prompt templates but do not give commands or scripts to reproduce the results." 37 } 38 }, 39 "statistical_methodology": { 40 "confidence_intervals_or_error_bars": { 41 "applies": true, 42 "answer": false, 43 "justification": "Tables 2 and 3 report only point estimates (accuracy percentages) with no confidence intervals, error bars, or uncertainty measures. All results are single point values." 
44 }, 45 "significance_tests": { 46 "applies": true, 47 "answer": false, 48 "justification": "The paper makes comparative claims (ADAMAB outperforms random augmentation and baselines) with no statistical significance tests (no p-values, t-tests, or bootstrap tests). Conclusions are drawn from direct numeric comparisons only." 49 }, 50 "effect_sizes_reported": { 51 "applies": true, 52 "answer": true, 53 "justification": "Tables 2 and 3 report percentage improvements with full baseline context (e.g., OpenAI-text-embedding-3-small achieves 39.21% → 58.15% with ADAMAB, stated as '+18.94%'). The schema description explicitly says: 'A paper that says 12% improvement over baseline (from 45% to 57%) provides enough context for YES.' The paper meets this standard by providing both the absolute baseline and final accuracy alongside the improvement delta." 54 }, 55 "sample_size_justified": { 56 "applies": true, 57 "answer": false, 58 "justification": "The paper uses 5 initial samples per class for text tasks and 2-3 per class for image tasks, but there is no power analysis or justification for why these specific numbers were chosen. The very small sample sizes for the claims being made are not discussed." 59 }, 60 "variance_reported": { 61 "applies": true, 62 "answer": false, 63 "justification": "No standard deviations, IQRs, or any spread measures are reported. All results are single-run point estimates. It is unclear whether experiments were repeated with multiple seeds." 64 } 65 }, 66 "evaluation_design": { 67 "baselines_included": { 68 "applies": true, 69 "answer": true, 70 "justification": "The paper compares ADAMAB against multiple baselines including GPT-4o-mini, Gemini2.0-Flash-Lite, Mistral-Small, Llama-3.2 (decoder LLMs), Cohere-v3.5, BGE-reranker-v2-m3, Jina-reranker-m0 (rerankers), and embedding models, plus calibration without augmentation and with random augmentation." 
71 }, 72 "baselines_contemporary": { 73 "applies": true, 74 "answer": true, 75 "justification": "Baselines include GPT-4o-mini, Gemini2.0-Flash-Lite, and QWen-3-emb-06b, which are contemporary models as of early 2026. The comparison set appears to represent current state of the art." 76 }, 77 "ablation_study": { 78 "applies": true, 79 "answer": true, 80 "justification": "Section 4.3 contains ablation studies examining how the number of training samples and the exploration hyperparameter α affect performance. These systematically vary individual components (augmentation rounds, α value) to measure their contribution." 81 }, 82 "multiple_metrics": { 83 "applies": true, 84 "answer": false, 85 "justification": "The paper reports only classification accuracy as the evaluation metric. No additional metrics (F1, precision, recall, AUC, etc.) are reported. The number of parameters is shown as a proxy for efficiency but is not a performance metric." 86 }, 87 "human_evaluation": { 88 "applies": false, 89 "answer": false, 90 "justification": "The paper evaluates classification accuracy on automated benchmarks; human evaluation of model outputs is not relevant to the claims being made about classification accuracy." 91 }, 92 "held_out_test_set": { 93 "applies": true, 94 "answer": true, 95 "justification": "Table 1 shows separate counts for initial (training) data and test data for each dataset. Results in Tables 2 and 3 are reported on these held-out test sets (e.g., 227 test examples for MultiWD, 5394 for CUB200)." 96 }, 97 "per_category_breakdown": { 98 "applies": true, 99 "answer": false, 100 "justification": "Results are reported as aggregate accuracy across each dataset with no per-class or per-category breakdown. Given datasets have 6 to 200 classes, per-category performance could vary significantly but is not shown." 
101 }, 102 "failure_cases_discussed": { 103 "applies": true, 104 "answer": false, 105 "justification": "The paper notes some cases where few-shot in-context learning underperforms zero-shot due to long context, and observes that homogeneous synthetic data hurts performance at high sample counts. However, no qualitative failure case analysis or error breakdown is provided." 106 }, 107 "negative_results_reported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Figure 3 shows that generating too many synthetic samples causes accuracy to decrease due to synthetic data homogeneity—a genuine negative result where more augmentation is worse. The paper also notes cases where few-shot in-context learning is worse than zero-shot." 111 } 112 }, 113 "claims_and_evidence": { 114 "abstract_claims_supported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The abstract claims 'up to 40% accuracy improvement when training with less than 5 initial data samples of each class,' which is supported by results in Tables 2 and 3 (e.g., Flowers102 with Voyage: +43.20%, CUB200 with CLIP: +35.42%)." 118 }, 119 "causal_claims_justified": { 120 "applies": true, 121 "answer": true, 122 "justification": "The paper's causal claims are primarily via ablation: comparing calibration with ADAMAB vs. random augmentation vs. no augmentation, and varying the exploration hyperparameter alpha. These are controlled single-variable manipulations. The schema states: 'ablation studies... ARE causal claims — check whether the ablation design is adequate (controlled single-variable manipulation counts as YES).' The ablation design holds all other variables constant (same embedding model, same dataset, same calibrator architecture, same total augmentation budget) while varying only the augmentation strategy, which is adequate." 
123 }, 124 "generalization_bounded": { 125 "applies": true, 126 "answer": false, 127 "justification": "The abstract claims ADAMAB achieves 'extraordinary calibration accuracy in pattern recognition tasks of different domains' and the conclusion calls it 'highly suitable for resource-constrained environments,' but experiments are limited to 6 datasets with specific embedding models. Claims of broad generalization are not adequately bounded." 128 }, 129 "alternative_explanations_discussed": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper does not discuss alternative explanations for results, such as whether ADAMAB's advantage stems from the specific datasets chosen, the particular generative models used for augmentation, or the architecture choices of the calibrator. No threats-to-validity section is present." 133 } 134 }, 135 "setup_transparency": { 136 "model_versions_specified": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper uses 'GPT-4o-mini,' 'Gemini2.0-Flash-Lite,' 'GPT-Image-1-mini,' and 'OpenAI-text-embedding-3-small' without snapshot dates or API version identifiers. Marketing names without version pinning do not satisfy this criterion." 140 }, 141 "prompts_provided": { 142 "applies": true, 143 "answer": false, 144 "justification": "Appendix B.2 provides prompt templates with placeholders (e.g., '{label1}: {description1}', '{existing_queries}') but does not provide the actual filled values used in experiments. Templates without the actual fill values do not allow reconstruction of the prompts sent to the model." 145 }, 146 "hyperparameters_reported": { 147 "applies": true, 148 "answer": true, 149 "justification": "Appendix B.1 and Table 4 report Adam optimizer with learning rate η₀=0.005, weight decay λ=0.0001, cosine annealing scheduler, batch sizes, number of training rounds, augmentation rounds (∆n), and exploration hyperparameter α per dataset." 
150 }, 151 "scaffolding_described": { 152 "applies": false, 153 "answer": false, 154 "justification": "The paper does not use agentic scaffolding. It trains lightweight calibrators on top of fixed embedding models via gradient descent, with no agent loops, tool use, or multi-step reasoning scaffold." 155 }, 156 "data_preprocessing_documented": { 157 "applies": true, 158 "answer": false, 159 "justification": "Table 1 states the number of initial training examples and test examples per dataset, but the paper does not describe how the initial subset was selected from the full datasets or what preprocessing was applied to the raw data." 160 } 161 }, 162 "limitations_and_scope": { 163 "limitations_section_present": { 164 "applies": true, 165 "answer": false, 166 "justification": "There is no dedicated limitations or threats-to-validity section in the paper. Section 4.3 briefly notes issues with synthetic data homogeneity but this is discussed as an ablation finding, not a systematic limitations discussion." 167 }, 168 "threats_to_validity_specific": { 169 "applies": true, 170 "answer": false, 171 "justification": "No threats-to-validity section exists. The paper does not discuss specific threats such as the small number of initial samples, potential data leakage from public benchmarks into generative models, or the choice of evaluation datasets." 172 }, 173 "scope_boundaries_stated": { 174 "applies": true, 175 "answer": false, 176 "justification": "The conclusion makes broad claims about ADAMAB being 'highly suitable for resource-constrained environments' without bounding results to the specific models, datasets, and task types tested. The paper does not explicitly state what it did NOT test." 
177 } 178 }, 179 "data_integrity": { 180 "raw_data_available": { 181 "applies": true, 182 "answer": true, 183 "justification": "All six datasets are publicly available standard benchmarks (MultiWD on Authorea, FQS published in ACM CCS 2024, TREC from ACL proceedings, OxfordPets/Flowers102/CUB200 from standard CV repositories). Raw data can be independently obtained." 184 }, 185 "data_collection_described": { 186 "applies": true, 187 "answer": true, 188 "justification": "Section 4.1 describes each dataset with its source, number of classes, and domain. The datasets are well-documented external benchmarks with cited original papers describing their collection procedures." 189 }, 190 "recruitment_methods_described": { 191 "applies": false, 192 "answer": false, 193 "justification": "The paper uses existing public benchmark datasets and no human participants were recruited. NA applies as there is no human participant data collection." 194 }, 195 "data_pipeline_documented": { 196 "applies": true, 197 "answer": false, 198 "justification": "The paper states how many initial samples per class were used (e.g., 5/class for text datasets) but does not describe how these specific samples were selected from the full datasets or whether selection was random or stratified." 199 } 200 }, 201 "conflicts_of_interest": { 202 "funding_disclosed": { 203 "applies": true, 204 "answer": false, 205 "justification": "No acknowledgments section or funding disclosure is present in the paper. The five co-authors from Accenture (a commercial consulting firm) have a potential financial interest in the evaluated approach but no funding source is disclosed." 
206 }, 207 "affiliations_disclosed": { 208 "applies": true, 209 "answer": true, 210 "justification": "Author affiliations are clearly stated: Minxue Tang (Duke University) and the remaining five authors (Yangyang Yu, Aolin Ding, Maziyar Baran Pouyan, Taha Belkhouja, Yujia Bao) from 'Center for Advanced AI, Accenture.'" 211 }, 212 "funder_independent_of_outcome": { 213 "applies": true, 214 "answer": false, 215 "justification": "Five of six authors are from Accenture, a commercial entity that stands to benefit from the positive evaluation of this AI calibration framework. The funder/employer is not independent of the research outcome." 216 }, 217 "financial_interests_declared": { 218 "applies": true, 219 "answer": false, 220 "justification": "No competing interests statement is present. The Accenture affiliation of five authors represents a potential commercial interest in the method being shown to work, but no declaration is made." 221 } 222 }, 223 "contamination": { 224 "training_cutoff_stated": { 225 "applies": true, 226 "answer": false, 227 "justification": "The paper uses GPT-4o-mini and Gemini2.0-Flash-Lite as baselines evaluated on classification benchmarks, but does not state the training data cutoff dates for any of the models used. Some benchmark datasets (e.g., TREC from 2001-2002, Flowers102 from 2008) predate any of these models." 228 }, 229 "train_test_overlap_discussed": { 230 "applies": true, 231 "answer": false, 232 "justification": "The paper does not discuss whether any of the benchmark datasets (TREC, OxfordPets, Flowers102, CUB200) may appear in the training data of the evaluated models (GPT-4o-mini, Gemini). These are long-established public datasets that are likely in LLM training corpora." 
233 }, 234 "benchmark_contamination_addressed": { 235 "applies": true, 236 "answer": false, 237 "justification": "Datasets like TREC (2001), OxfordPets (2012), Flowers102 (2008), and CUB200 (2011) were published long before the training cutoffs of models like GPT-4o-mini. The paper does not acknowledge or discuss the risk that baseline LLMs may have been trained on these benchmarks." 238 } 239 }, 240 "human_studies": { 241 "pre_registered": { 242 "applies": false, 243 "answer": false, 244 "justification": "No human participants are involved; experiments are computational benchmark evaluations." 245 }, 246 "irb_or_ethics_approval": { 247 "applies": false, 248 "answer": false, 249 "justification": "No human participants are involved; no IRB approval is needed." 250 }, 251 "demographics_reported": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants are involved." 255 }, 256 "inclusion_exclusion_criteria": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants are involved." 260 }, 261 "randomization_described": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants are involved; this is a computational study." 265 }, 266 "blinding_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants are involved; no blinding is applicable." 270 }, 271 "attrition_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants are involved." 275 } 276 }, 277 "cost_and_practicality": { 278 "inference_cost_reported": { 279 "applies": true, 280 "answer": false, 281 "justification": "The paper uses GPT-4o-mini for text generation and GPT-Image-1-mini for image generation across multiple training epochs without reporting API costs or tokens consumed. Compute costs for the calibrators themselves are not quantified beyond noting the number of trainable parameters." 
282 }, 283 "compute_budget_stated": { 284 "applies": true, 285 "answer": false, 286 "justification": "Appendix B states experiments ran on 'a single MacBook Pro with a single M4 Max chip and 36GB memory' but no wall-clock training times or total compute budget (GPU/CPU hours, API call counts) are reported." 287 } 288 } 289 }, 290 "claims": [ 291 { 292 "claim": "ADAMAB achieves up to 40% accuracy improvement over the base embedding model when training with fewer than 5 initial data samples per class.", 293 "evidence": "Tables 2 and 3 show improvements such as +43.20% for Flowers102 with Voyage-multimodal-3 and +35.42% for CUB200 with CLIP-VIT-Large. Abstract and Section 4.2 make this claim.", 294 "supported": "moderate" 295 }, 296 { 297 "claim": "ADAMAB consistently outperforms random data augmentation across all tested datasets and embedding models.", 298 "evidence": "Tables 2 and 3 show ADAMAB accuracy exceeds random augmentation in all 18 reported conditions (6 datasets × ~3 embedding models each). Section 4.2 discusses this.", 299 "supported": "moderate" 300 }, 301 { 302 "claim": "The MAB-based adaptive augmentation strategy provably converges to a stationary point with the rate O(1/T) + O(sqrt(log(T)/T)).", 303 "evidence": "Theorem 2 and its proof in Appendix A.2 establish this convergence under Assumptions 1 (β-smoothness) and 2 (bounded gradients).", 304 "supported": "strong" 305 }, 306 { 307 "claim": "ADAMAB achieves higher accuracy than GPT-4o-mini and other large decoder LLMs on implicit pattern recognition tasks despite being much smaller.", 308 "evidence": "Tables 2 and 3 show ADAMAB's calibration (e.g., 61.63% on TREC vs. GPT-4o-mini's 60.03%, 89.54% on FQS vs. 80.31%) using only ~2.6M additional parameters. 
Section 4.2 discusses this.", 309 "supported": "moderate" 310 }, 311 { 312 "claim": "Generating too many synthetic samples decreases accuracy due to homogeneity in the synthetic data.", 313 "evidence": "Figure 3 shows accuracy declining when average samples per class exceed the sweet spot for each dataset. Section 4.3 discusses this as a limitation of small generative models.", 314 "supported": "moderate" 315 } 316 ], 317 "methodology_tags": [ 318 "benchmark-eval" 319 ], 320 "key_findings": "ADAMAB is a lightweight embedding calibration framework that trains small neural networks on top of fixed embedding models (CLIP, OpenAI text embeddings) using adaptive data augmentation guided by a Multi-Armed Bandit UCB algorithm. On six classification datasets spanning text and image modalities with 2-5 initial samples per class, ADAMAB achieves 10-40% accuracy improvements over base embedding models and outperforms random augmentation. The paper provides a theoretical convergence guarantee showing ADAMAB achieves O(1/T) + O(sqrt(log(T)/T)) convergence. A key negative finding is that generating too many synthetic samples decreases accuracy due to homogeneity in outputs from smaller generative models.", 321 "red_flags": [ 322 { 323 "flag": "No statistical uncertainty quantification", 324 "detail": "All results in Tables 2 and 3 are point estimates with no confidence intervals, standard deviations, or indication of whether experiments were run multiple times. With test sets as small as 227 examples (MultiWD), the uncertainty in accuracy estimates is non-trivial." 325 }, 326 { 327 "flag": "Benchmark contamination unaddressed", 328 "detail": "Baseline LLMs (GPT-4o-mini, Gemini2.0-Flash-Lite) are evaluated on datasets like TREC (2001), OxfordPets (2012), and Flowers102 (2008) that almost certainly appear in their training data. 
The paper does not acknowledge this confound, which could artificially inflate baseline performance if test examples were memorized or if the models have otherwise learned the label space, distorting the comparison with ADAMAB." 329 }, 330 { 331 "flag": "No model version pinning", 332 "detail": "The paper uses 'GPT-4o-mini' and 'Gemini2.0-Flash-Lite' without snapshot dates or API version identifiers. These models are updated frequently, so the reported results cannot be exactly reproduced once the underlying model changes." 333 }, 334 { 335 "flag": "Commercial affiliation bias", 336 "detail": "Five of six authors are employed at Accenture's Center for Advanced AI, a commercial consulting firm that would benefit from a successful AI calibration framework. No competing interests statement is present, and no funding disclosure is made." 337 }, 338 { 339 "flag": "Single accuracy metric", 340 "detail": "The paper reports only top-1 classification accuracy. For datasets like FQS (safety policy classification), recall or precision per category would be more meaningful for the stated application domain." 341 }, 342 { 343 "flag": "Possible hyperparameter selection on test data", 344 "detail": "Appendix B states 'we conduct experiments on a wide range of hyperparameters and select the best hyperparameters with the highest accuracy.' It is unclear whether hyperparameter selection used a validation set separate from the test set, raising the risk of overfitting reported results to the test set." 345 } 346 ], 347 "cited_papers": [ 348 { 349 "title": "Language models are few-shot learners", 350 "authors": [ 351 "Tom Brown", 352 "Benjamin Mann", 353 "Nick Ryder" 354 ], 355 "year": 2020, 356 "relevance": "Foundational paper on LLM few-shot learning, motivating the few-shot setting evaluated in this paper."
357 }, 358 { 359 "title": "Learning transferable visual models from natural language supervision", 360 "authors": [ 361 "Alec Radford", 362 "Jong Wook Kim", 363 "Chris Hallacy" 364 ], 365 "year": 2021, 366 "relevance": "Introduces CLIP, the primary vision embedding model evaluated and calibrated in this paper." 367 }, 368 { 369 "title": "LoRA: Low-rank adaptation of large language models", 370 "authors": [ 371 "Edward J Hu", 372 "Yelong Shen", 373 "Phillip Wallis" 374 ], 375 "year": 2022, 376 "relevance": "Baseline efficient fine-tuning method that ADAMAB is positioned against as not requiring model access." 377 }, 378 { 379 "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks", 380 "authors": [ 381 "Patrick Lewis", 382 "Ethan Perez", 383 "Aleksandra Piktus" 384 ], 385 "year": 2020, 386 "relevance": "RAG framework that is part of the embedding retrieval context motivating this work." 387 }, 388 { 389 "title": "Active learning principles for in-context learning with large language models", 390 "authors": [ 391 "Katerina Margatina", 392 "Timo Schick", 393 "Nikolaos Aletras", 394 "Jane Dwivedi-Yu" 395 ], 396 "year": 2023, 397 "relevance": "Directly related work on adaptive learning for LLM in-context learning, cited as motivation for ADAMAB." 398 }, 399 { 400 "title": "Data augmentation using large language models: Data perspectives, learning paradigms and challenges", 401 "authors": [ 402 "Bosheng Ding", 403 "Chengwei Qin", 404 "Ruochen Zhao" 405 ], 406 "year": 2024, 407 "relevance": "Survey of LLM-based data augmentation, directly in scope for the survey as related work on the same topic." 408 }, 409 { 410 "title": "A survey on data synthesis and augmentation for large language models", 411 "authors": [ 412 "Ke Wang", 413 "Jiahui Zhu", 414 "Minjie Ren" 415 ], 416 "year": 2024, 417 "relevance": "Survey on synthetic data augmentation for LLMs, directly relevant as background for this paper's contribution." 
418 }, 419 { 420 "title": "\"Do Anything Now\": Characterizing and Evaluating In-The-Wild Jailbreak Prompts on Large Language Models", 421 "authors": [ 422 "Xinyue Shen", 423 "Zeyuan Chen", 424 "Michael Backes", 425 "Yun Shen", 426 "Yang Zhang" 427 ], 428 "year": 2024, 429 "relevance": "Source paper for the Forbidden Question Set benchmark used in evaluation; relevant to LLM safety evaluation research." 430 }, 431 { 432 "title": "From selection to generation: A survey of LLM-based active learning", 433 "authors": [ 434 "Yu Xia" 435 ], 436 "year": 2025, 437 "relevance": "Recent survey of LLM-based active learning, directly in the topic area of this paper and the broader survey scope." 438 }, 439 { 440 "title": "SimCSE: Simple contrastive learning of sentence embeddings", 441 "authors": [ 442 "Tianyu Gao", 443 "Xingcheng Yao", 444 "Danqi Chen" 445 ], 446 "year": 2021, 447 "relevance": "Foundational embedding model work cited in related work on embedding-based similarity search." 448 }, 449 { 450 "title": "LLM-DA: Data augmentation via large language models for few-shot named entity recognition", 451 "authors": [ 452 "Junjie Ye", 453 "Nuo Xu", 454 "Yikun Wang" 455 ], 456 "year": 2024, 457 "relevance": "Related work on LLM-based data augmentation for few-shot NLP tasks, directly comparable to ADAMAB's text augmentation component." 458 }, 459 { 460 "title": "FrugalGPT: How to use large language models while reducing cost and improving performance", 461 "authors": [ 462 "Lingjiao Chen", 463 "Matei Zaharia", 464 "James Zou" 465 ], 466 "year": 2023, 467 "relevance": "Related work on cost-efficient use of LLMs, relevant to the cost reduction motivation of ADAMAB." 468 } 469 ] 470 }