calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (17310B)
      1 {
      2   "paper_slug": "adaptive-data-augmentation-2026",
      3   "total_questions": 50,
      4   "agreement_count": 48,
      5   "disagreement_count": 2,
      6   "agreement_rate": 0.96,
      7   "disagreements": [
      8     {
      9       "category": "statistical_methodology",
     10       "question": "effect_sizes_reported",
     11       "sonnet_applies": true,
     12       "sonnet_answer": false,
     13       "opus_applies": true,
     14       "opus_answer": true,
     15       "opus_justification": "Tables 2 and 3 report percentage improvements with full baseline context (e.g., OpenAI-text-embedding-3-small achieves 39.21% → 58.15% with ADAMAB, stated as '+18.94%'). The schema description explicitly says: 'A paper that says 12% improvement over baseline (from 45% to 57%) provides enough context for YES.' The paper meets this standard by providing both the absolute baseline and final accuracy alongside the improvement delta.",
     16       "sonnet_justification": "While absolute accuracy improvements are shown (e.g., '+18.94%' over base embedding), no standardized effect sizes (Cohen's d, odds ratios) are reported. The improvement deltas are shown but without denominator context needed for a proper effect size measure.",
     17       "direction": "sonnet_strict"
     18     },
     19     {
     20       "category": "claims_and_evidence",
     21       "question": "causal_claims_justified",
     22       "sonnet_applies": true,
     23       "sonnet_answer": false,
     24       "opus_applies": true,
     25       "opus_answer": true,
     26       "opus_justification": "The paper's causal claims are primarily via ablation: comparing calibration with ADAMAB vs. random augmentation vs. no augmentation, and varying the exploration hyperparameter alpha. These are controlled single-variable manipulations. The schema states: 'ablation studies... ARE causal claims — check whether the ablation design is adequate (controlled single-variable manipulation counts as YES).' The ablation design holds all other variables constant (same embedding model, same dataset, same calibrator architecture, same total augmentation budget) while varying only the augmentation strategy, which is adequate.",
     27       "sonnet_justification": "The paper makes causal claims ('ADAMAB diminishes the gradient shifting,' 'adaptive augmentation leads to faster convergence') and ablation comparisons ('calibration with ADAMAB outperforms random augmentation'). Ablation studies manipulate only one variable at a time, but there is no discussion of confounds such as the effect of the specific generative models used or the sensitivity to hyperparameter choices made by grid search.",
     28       "direction": "sonnet_strict"
     29     }
     30   ],
     31   "opus_checklist": {
     32     "artifacts": {
     33       "code_released": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No GitHub link, code repository URL, or archive is provided anywhere in the paper or appendices. The ADAMAB framework is described algorithmically but no source code is released."
     37       },
     38       "data_released": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "All six datasets (MultiWD, FQS, TREC, OxfordPets, Flowers102, CUB200) are publicly available standard benchmarks with cited original sources. The paper uses these without modification."
     42       },
     43       "environment_specified": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Appendix B states experiments ran on 'a single MacBook Pro with a single M4 Max chip and 36GB memory' and mentions Adam optimizer, but no requirements.txt, Dockerfile, or library version list is provided."
     47       },
     48       "reproduction_instructions": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided. Appendices describe hyperparameters and prompt templates but lack executable reproduction steps."
     52       }
     53     },
     54     "statistical_methodology": {
     55       "confidence_intervals_or_error_bars": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "Tables 2 and 3 report only point estimates (accuracy percentages) with no confidence intervals, error bars, or ± notation."
     59       },
     60       "significance_tests": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The paper claims ADAMAB outperforms baselines and random augmentation based solely on comparing numeric accuracy values with no statistical significance tests (no p-values, t-tests, bootstrap tests, etc.)."
     64       },
     65       "effect_sizes_reported": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Tables 2 and 3 report percentage improvements with baseline context (e.g., OpenAI-text-embedding-3-small at 39.21% → ADAMAB at 58.15%, '+18.94%'). The schema states 'A paper that says 12% improvement over baseline (from 45% to 57%) provides enough context for YES.' This standard is met."
     69       },
     70       "sample_size_justified": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The paper uses 2-5 initial samples per class without any power analysis or justification for why these specific numbers were chosen."
     74       },
     75       "variance_reported": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No standard deviations, IQR, or any spread measures are reported. All results appear to be single-run point estimates with no indication of multiple runs or seed variation."
     79       }
     80     },
     81     "evaluation_design": {
     82       "baselines_included": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Comprehensive baselines include decoder LLMs (GPT-4o-mini, Gemini2.0-Flash-Lite, Mistral-Small, Llama-3.2), rerankers (Cohere-v3.5, BGE-reranker-v2-m3, Jina-reranker-m0), embedding models, and calibration with/without random augmentation."
     86       },
     87       "baselines_contemporary": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Baselines include GPT-4o-mini, Gemini2.0-Flash-Lite, QWen-3-emb-06b, and Voyage-multimodal-3, which are contemporary models as of early 2026."
     91       },
     92       "ablation_study": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Section 4.3 provides ablation studies varying the number of augmentation rounds (Fig. 3) and the exploration hyperparameter alpha (Fig. 4), systematically measuring each component's contribution."
     96       },
     97       "multiple_metrics": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "Only classification accuracy is reported as the evaluation metric. No F1, precision, recall, AUC, or other metrics are used."
    101       },
    102       "human_evaluation": {
    103         "applies": false,
    104         "answer": false,
    105         "justification": "The paper evaluates classification accuracy on automated benchmarks. Human evaluation of system outputs is not relevant to the claims about classification accuracy."
    106       },
    107       "held_out_test_set": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Table 1 shows separate init (training) and test data counts for each dataset. Results in Tables 2 and 3 are on these held-out test sets."
    111       },
    112       "per_category_breakdown": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "Results are aggregate accuracy per dataset with no per-class or per-category breakdown, despite datasets having 6 to 200 classes."
    116       },
    117       "failure_cases_discussed": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "No qualitative failure case analysis or error breakdown is provided. Observations about few-shot underperforming zero-shot and synthetic data homogeneity are ablation findings, not systematic failure analysis of the classification outputs."
    121       },
    122       "negative_results_reported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Figure 3 shows accuracy declining when too many synthetic samples are generated due to data homogeneity. The paper also notes few-shot in-context learning can be worse than zero-shot in some cases."
    126       }
    127     },
    128     "claims_and_evidence": {
    129       "abstract_claims_supported": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The abstract claims 'up to 40% accuracy improvement when training with less than 5 initial data samples of each class,' which is supported by Tables 2 and 3 (e.g., Flowers102 with Voyage: +43.20%)."
    133       },
    134       "causal_claims_justified": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The paper's causal claims are made through controlled ablation studies: ADAMAB vs. random augmentation vs. no augmentation with all other variables held constant (same embedding model, dataset, calibrator architecture, augmentation budget). The schema states 'controlled single-variable manipulation counts as YES' for ablation-based causal claims."
    138       },
    139       "generalization_bounded": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The conclusion claims ADAMAB is 'highly suitable for resource-constrained environments' and establishes 'a new paradigm' without bounding results to the specific 6 datasets, 4 embedding models, and 2 generative models tested."
    143       },
    144       "alternative_explanations_discussed": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "No discussion of alternative explanations for the results, confounding factors, or threats to validity. The paper does not consider whether improvements stem from the specific datasets, generative models, or other factors."
    148       }
    149     },
    150     "setup_transparency": {
    151       "model_versions_specified": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "The paper uses 'GPT-4o-mini,' 'Gemini2.0-Flash-Lite,' 'GPT-Image-1-mini,' and 'OpenAI-text-embedding-3-small' without snapshot dates or API version identifiers. Per the schema, marketing names without a snapshot date or API version do not count."
    155       },
    156       "prompts_provided": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "Appendix B.2 provides prompt templates with placeholders (e.g., '{label1}: {description1}', '{existing_queries}', '{num_generate}'). The schema explicitly states prompt templates with placeholders do NOT count unless the actual fill values are also provided."
    160       },
    161       "hyperparameters_reported": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Appendix B.1 and Table 4 report Adam optimizer, learning rate (0.005), weight decay (0.0001), cosine annealing scheduler, batch sizes, training rounds, augmentation rounds, delta_n, and alpha per dataset."
    165       },
    166       "scaffolding_described": {
    167         "applies": false,
    168         "answer": false,
    169         "justification": "No agentic scaffolding is used. The method trains lightweight calibrators via gradient descent without agent loops, tool use, or multi-step reasoning."
    170       },
    171       "data_preprocessing_documented": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "Table 1 states initial training and test counts, but does not describe how the initial subset was selected from full datasets (random? stratified? fixed seed?). For TREC, they mention filtering to 30 classes with >5 samples but the broader preprocessing is undocumented."
    175       }
    176     },
    177     "limitations_and_scope": {
    178       "limitations_section_present": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "No dedicated limitations or threats-to-validity section exists. Section 4.3 notes synthetic data homogeneity as an ablation finding but this is not a systematic limitations discussion."
    182       },
    183       "threats_to_validity_specific": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No threats-to-validity discussion exists. Specific threats such as benchmark contamination, hyperparameter tuning on test data, or dependence on specific generative model quality are not discussed."
    187       },
    188       "scope_boundaries_stated": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The conclusion makes broad claims about ADAMAB being 'highly suitable for resource-constrained environments' and establishing 'a new paradigm' without explicitly stating what was NOT tested or claimed."
    192       }
    193     },
    194     "data_integrity": {
    195       "raw_data_available": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "All six datasets are publicly available standard benchmarks (MultiWD on Authorea, FQS from ACM CCS 2024, TREC from ACL, OxfordPets/Flowers102/CUB200 from standard CV repositories). Raw data can be independently obtained."
    199       },
    200       "data_collection_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Section 4.1 describes each dataset with its source, number of classes, and domain. These are well-documented external benchmarks."
    204       },
    205       "recruitment_methods_described": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No human participants were recruited. The paper uses existing public benchmark datasets."
    209       },
    210       "data_pipeline_documented": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "The paper states how many initial samples per class were used but does not describe how these specific samples were selected from the full datasets, nor the random seed or selection procedure."
    214       }
    215     },
    216     "conflicts_of_interest": {
    217       "funding_disclosed": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No acknowledgments section or funding disclosure is present. Five of six authors are from Accenture but no funding information is provided."
    221       },
    222       "affiliations_disclosed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Author affiliations are clearly stated: Minxue Tang (Duke University) and five co-authors from 'Center for Advanced AI, Accenture.'"
    226       },
    227       "funder_independent_of_outcome": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "Five of six authors are from Accenture, a commercial consulting firm that would benefit from demonstrating effective AI calibration methods. The employer/funder is not independent of the research outcome."
    231       },
    232       "financial_interests_declared": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No competing interests statement is present. The Accenture affiliation of five authors represents a potential commercial interest that is not declared."
    236       }
    237     },
    238     "contamination": {
    239       "training_cutoff_stated": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "The paper uses GPT-4o-mini, Gemini2.0-Flash-Lite, and other models as baselines evaluated on benchmarks, but does not state training data cutoff dates for any model."
    243       },
    244       "train_test_overlap_discussed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No discussion of whether benchmark datasets (some dating to 2001-2012) appear in the training data of the evaluated LLMs."
    248       },
    249       "benchmark_contamination_addressed": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "Datasets like TREC (2001), OxfordPets (2012), Flowers102 (2008), CUB200 (2011) were published well before training cutoffs of GPT-4o-mini and Gemini. The contamination risk is not acknowledged."
    253       }
    254     },
    255     "human_studies": {
    256       "pre_registered": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants; computational benchmark evaluations only."
    260       },
    261       "irb_or_ethics_approval": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants; no IRB approval needed."
    265       },
    266       "demographics_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants."
    270       },
    271       "inclusion_exclusion_criteria": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants."
    275       },
    276       "randomization_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants; computational study."
    280       },
    281       "blinding_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants; no blinding applicable."
    285       },
    286       "attrition_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants."
    290       }
    291     },
    292     "cost_and_practicality": {
    293       "inference_cost_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "The paper uses GPT-4o-mini for text generation and GPT-Image-1-mini for image generation across multiple training epochs without reporting API costs, tokens consumed, or cost per example."
    297       },
    298       "compute_budget_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "Appendix B mentions the hardware ('single MacBook Pro with M4 Max chip and 36GB memory') but no wall-clock training times, total API costs, or total compute budget are reported."
    302       }
    303     }
    304   }
    305 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs