calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (18324B)
      1 {
      2   "paper_slug": "acar-adaptive-complexity-2026",
      3   "total_questions": 50,
      4   "agreement_count": 48,
      5   "disagreement_count": 2,
      6   "agreement_rate": 0.96,
      7   "disagreements": [
      8     {
      9       "category": "statistical_methodology",
     10       "question": "effect_sizes_reported",
     11       "sonnet_applies": true,
     12       "sonnet_answer": false,
     13       "opus_applies": true,
     14       "opus_answer": true,
     15       "opus_justification": "The paper reports percentage-point differences with full baseline context (e.g., ACAR-U 55.6% vs Arena-2 54.4%, retrieval degradation from 55.6% to 52.4% = -3.4pp). The schema explicitly states: 'A paper that says 12% improvement over baseline (from 45% to 57%) provides enough context for YES.' Table 1 provides all absolute numbers alongside the differences, meeting this threshold.",
     16       "sonnet_justification": "Effect sizes are reported only as raw percentage-point differences (e.g., '-3.4pp' for retrieval augmentation), but without baseline context or standardized effect size measures (Cohen's d, etc.) and without any statistical tests making these differences interpretable.",
     17       "direction": "opus_generous"
     18     },
     19     {
     20       "category": "data_integrity",
     21       "question": "data_pipeline_documented",
     22       "sonnet_applies": true,
     23       "sonnet_answer": true,
     24       "opus_applies": true,
     25       "opus_answer": false,
     26       "opus_justification": "The main execution pipeline (task -> ACAR routing -> results) is documented via Algorithm 1 and the TEAMLLM substrate. However, the experience store for ACAR-UJ (837 entries) appears with no explanation of how it was constructed. The benchmark task selection process (whether 60 MathArena tasks, 250 Reasoning Gym tasks, etc. are the full benchmarks or subsamples) is also not explicitly stated. These are unexplained elements in the data pipeline.",
     27       "sonnet_justification": "The full pipeline is documented: Algorithm 1 specifies the routing procedure, Section 3.1 describes TEAMLLM's data capture (seed, prompt hash, environment fingerprint per run), and Appendix A describes how figures can be regenerated from the runs.jsonl files. The pipeline from task to final answer is fully specified.",
     28       "direction": "sonnet_generous"
     29     }
     30   ],
     31   "opus_checklist": {
     32     "artifacts": {
     33       "code_released": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The paper provides a GitHub URL (https://github.com/mechramc/ACAR-TeamLLM) in a footnote in Section 1.2 and again in Appendix A, described as publicly available with all code and figure regeneration scripts."
     37       },
     38       "data_released": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "The paper uses four publicly available benchmarks (MathArena, Reasoning Gym, LiveCodeBench, SuperGPQA) and additionally releases execution artifacts (runs.jsonl with 7,550+ runs) via the GitHub repository as described in Appendix B."
     42       },
     43       "environment_specified": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper mentions 'environment fingerprint' as part of run logging (Section 3.1, Appendix A) but does not provide a requirements.txt, Dockerfile, conda environment file, or library version listing in the paper. No dependency specifications are given."
     47       },
     48       "reproduction_instructions": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Appendix A states 'All figures in this paper can be regenerated from released artifacts using provided scripts.' Appendix B provides a detailed artifact manifest showing the file structure. The GitHub repository is referenced as containing all regeneration scripts."
     52       }
     53     },
     54     "statistical_methodology": {
     55       "confidence_intervals_or_error_bars": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The paper reports only point estimates (e.g., '55.6% accuracy', '54.4%') with no confidence intervals, error bars, or uncertainty notation in any table or figure."
     59       },
     60       "significance_tests": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The paper claims 'ACAR-U exceeds Arena-2 by 1.2 percentage points' without any statistical significance test. No p-values, t-tests, bootstrap tests, or other significance measures appear anywhere in the paper."
     64       },
     65       "effect_sizes_reported": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper reports percentage-point differences with full baseline context: ACAR-U 55.6% vs Arena-2 54.4% (1.2pp improvement), ACAR-UJ 52.4% vs ACAR-U 55.6% (-3.4pp). Table 1 provides absolute numbers for all configurations. Per the schema, 'percentage improvement with baseline context' satisfies this criterion."
     69       },
     70       "sample_size_justified": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The paper uses 1,510 tasks across four benchmarks with no justification for why this sample size is sufficient. No power analysis or discussion of statistical adequacy is provided."
     74       },
     75       "variance_reported": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "The paper uses temperature=0 for determinism but reports single-run results with no variance, standard deviation, or spread measures. No analysis of variance from any source (model non-determinism, benchmark sampling) is provided."
     79       }
     80     },
     81     "evaluation_design": {
     82       "baselines_included": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Three baselines are compared against in Table 1: Single-Model (best single model, Claude Sonnet 4), Arena-2 (two-model ensemble), and Arena-3 (three-model ensemble on all tasks)."
     86       },
     87       "baselines_contemporary": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "The baselines use current frontier models (Claude Sonnet 4, GPT-4o, Gemini 2.0 Flash) from major providers as of 2026. The multi-model ensemble baselines represent current practice."
     91       },
     92       "ablation_study": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "ACAR-U vs ACAR-UJ ablates the Jungler retrieval component. Table 2 provides per-benchmark results for both configurations, isolating the retrieval component's effect."
     96       },
     97       "multiple_metrics": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The paper reports accuracy (Table 1), total API cost in USD (Table 1, Figure 4, Figure 6), escalation rate by mode (Section 5.3, Figure 5), and latency in milliseconds (Figure 7)."
    101       },
    102       "human_evaluation": {
    103         "applies": false,
    104         "answer": false,
    105         "justification": "Human evaluation is not relevant to this paper's claims. All benchmarks use automated correctness verification (test case execution for LiveCodeBench, answer matching for others)."
    106       },
    107       "held_out_test_set": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "All four benchmarks are used as evaluation sets. ACAR has no learned components and performs no tuning on these benchmarks, so all reported results are effectively on held-out data."
    111       },
    112       "per_category_breakdown": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Results are broken down per benchmark in Figure 3 (pass rates), Table 2 (retrieval impact), and Figure 5 (escalation distribution). Individual benchmark performance varies substantially (SuperGPQA 60.5% vs MathArena 26.7%)."
    116       },
    117       "failure_cases_discussed": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section 6 is dedicated to negative results and failure modes: retrieval augmentation hurting performance (6.1), agreement-but-wrong being unrecoverable (6.2), and attribution proxies failing (6.3)."
    121       },
    122       "negative_results_reported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 6 reports three quantified negative findings: retrieval augmentation decreases accuracy by 3.4pp (Table 2), agreement-but-wrong creates an 8pp ceiling (Section 6.2), and attribution proxies show weak correlation (Section 6.3)."
    126       }
    127     },
    128     "claims_and_evidence": {
    129       "abstract_claims_supported": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "All abstract claims are supported: 55.6% accuracy (Table 1), 54.2% avoidance of full ensembling (Section 5.3), -3.4pp retrieval degradation (Table 2), 8pp gap to Arena-3 (Table 1), attribution proxy failure (Section 6.3)."
    133       },
    134       "causal_claims_justified": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The causal claim that retrieval hurts performance is supported by controlled ablation: ACAR-U and ACAR-UJ differ only in the retrieval component, tested on the same 1,510 tasks. This single-variable manipulation is adequate for the causal claim."
    138       },
    139       "generalization_bounded": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 8 explicitly bounds generalization: 'Three models from major providers; may not generalize to open-source models' and 'Benchmark bias: SuperGPQA dominates (66% of tasks).' The paper does not overclaim beyond its tested setting."
    143       },
    144       "alternative_explanations_discussed": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The paper discusses alternative explanations for key findings: low semantic similarity (median 0.167) as the cause of retrieval failure rather than retrieval being inherently harmful (Section 6.1), syntactic code equivalence inflating LiveCodeBench escalation (Section 8), and the structural explanation for the 8pp accuracy ceiling (Section 6.2)."
    148       }
    149     },
    150     "setup_transparency": {
    151       "model_versions_specified": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "The paper names 'Claude Sonnet 4', 'GPT-4o', and 'Gemini 2.0 Flash' without API version identifiers or snapshot dates. Per the schema, marketing names without snapshot dates or API versions do not count."
    155       },
    156       "prompts_provided": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "Algorithm 1 describes the routing procedure and the paper mentions 'prompt template hash' in run logging, but the actual prompt text sent to models is not provided in the paper or appendix."
    160       },
    161       "hyperparameters_reported": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Temperature=0 is stated (Section 4.2), N=3 probe samples specified (Section 3.2.1), and the retrieval similarity threshold of 0.0 for ACAR-UJ is stated (Section 3.2.4)."
    165       },
    166       "scaffolding_described": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "TEAMLLM substrate is described in detail (Section 3.1) with three enforced invariants. Algorithm 1 fully specifies the ACAR routing procedure including probing, routing, and logging phases. The Jungler retrieval component is described in Section 3.2.4."
    170       },
    171       "data_preprocessing_documented": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "The paper states benchmark task counts (60+250+200+1000=1510) but does not document whether these are full benchmarks or subsamples, how tasks were selected from each benchmark, or how the experience store (837 entries) for ACAR-UJ was constructed."
    175       }
    176     },
    177     "limitations_and_scope": {
    178       "limitations_section_present": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 8 'Limitations' is a dedicated section listing four specific limitations: model set coverage, benchmark bias, absence of learned routing comparison, and code equivalence issues."
    182       },
    183       "threats_to_validity_specific": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 8 identifies specific threats: 'SuperGPQA dominates (66% of tasks)' affecting aggregate result generalizability, and 'LiveCodeBench escalation is inflated by syntactically different but semantically equivalent outputs' affecting routing efficiency measurements."
    187       },
    188       "scope_boundaries_stated": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The paper explicitly states scope boundaries: results may not generalize to open-source models, learned routers may outperform on specific distributions, and the 'What holds / What does not hold' abstract structure clearly demarcates what is and isn't claimed."
    192       }
    193     },
    194     "data_integrity": {
    195       "raw_data_available": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Complete runs.jsonl files (7,550+ runs with per-task decision traces) are released via the GitHub repository. Appendix B provides the artifact manifest. Independent verification of all reported results is possible."
    199       },
    200       "data_collection_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Section 4.1 describes the four benchmarks with task counts and characteristics. Section 3.1 describes TEAMLLM's data capture mechanism (seed, prompt hash, environment fingerprint per run). The execution procedure is documented in Algorithm 1."
    204       },
    205       "recruitment_methods_described": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No human participants involved. The paper uses standard public benchmarks for automated evaluation. Recruitment methods are not applicable."
    209       },
    210       "data_pipeline_documented": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "The execution pipeline (task -> ACAR routing -> results) is documented via Algorithm 1 and TEAMLLM. However, the experience store for ACAR-UJ (837 entries) appears with no construction explanation, and whether the benchmark task counts represent full benchmarks or subsamples is not stated. These are undocumented elements in the data pipeline."
    214       }
    215     },
    216     "conflicts_of_interest": {
    217       "funding_disclosed": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No funding source is mentioned anywhere in the paper. There is no acknowledgments section listing grants, corporate sponsors, or funding agencies."
    221       },
    222       "affiliations_disclosed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "The paper lists only the author name 'Ramchand Kumaresan' with no institutional affiliation. The author evaluates products from Anthropic, OpenAI, and Google without disclosing any relationship to these companies."
    226       },
    227       "funder_independent_of_outcome": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No funding source is disclosed. The paper appears to be solo independent research with no identified funder, making funder independence not applicable."
    231       },
    232       "financial_interests_declared": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "There is no competing interests statement, patent disclosure, or financial interests declaration anywhere in the paper. Per the schema, absence of disclosure is not the same as absence of conflict."
    236       }
    237     },
    238     "contamination": {
    239       "training_cutoff_stated": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "The paper does not state training data cutoff dates for any of the three models (Claude Sonnet 4, GPT-4o, Gemini 2.0 Flash). This information is needed to assess whether benchmark tasks could have appeared in training data."
    243       },
    244       "train_test_overlap_discussed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No discussion of potential train/test overlap for any of the four benchmarks. The paper does not analyze whether benchmark tasks appeared in model training data."
    248       },
    249       "benchmark_contamination_addressed": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "The paper uses four benchmarks without discussing whether they were available before the training cutoffs of the models tested. No contamination analysis or discussion is performed."
    253       }
    254     },
    255     "human_studies": {
    256       "pre_registered": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants. This is a benchmark evaluation study with automated metrics."
    260       },
    261       "irb_or_ethics_approval": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants. IRB approval is not applicable."
    265       },
    266       "demographics_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants. Demographics are not applicable."
    270       },
    271       "inclusion_exclusion_criteria": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants. Inclusion/exclusion criteria are not applicable."
    275       },
    276       "randomization_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants. Randomization of participant assignment is not applicable."
    280       },
    281       "blinding_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants. Blinding is not applicable."
    285       },
    286       "attrition_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants. Attrition is not applicable."
    290       }
    291     },
    292     "cost_and_practicality": {
    293       "inference_cost_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Table 1 reports total API costs in USD for each configuration (Single-Model: $17.04, Arena-2: $20.64, ACAR-U: $20.34, Arena-3: $20.64). Figure 4 shows the cost-accuracy Pareto frontier. Per-run cost is logged in the decision trace."
    297       },
    298       "compute_budget_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Total API costs for all configurations are reported in Table 1. Figure 6 shows cumulative cost over tasks ($20.34 for ACAR-U). Latency distributions are also reported in Figure 7."
    302       }
    303     }
    304   }
    305 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs