calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (13786B)
      1 {
      2   "paper_slug": "agentic-ai-architectures-2026",
      3   "calibration_date": "2026-02-28",
      4   "calibration_model": "claude-opus-4-6",
      5   "scan_model": "claude-sonnet",
      6   "total_questions": 50,
      7   "agreement_count": 50,
      8   "disagreement_count": 0,
      9   "agreement_rate": 1.0,
     10   "disagreements": [],
     11   "opus_checklist": {
     12     "artifacts": {
     13       "code_released": {
     14         "applies": true,
     15         "answer": false,
     16         "justification": "No repository URL or code archive is provided anywhere in the paper. A survey can release analysis scripts or data extraction tools, but none are offered."
     17       },
     18       "data_released": {
     19         "applies": true,
     20         "answer": false,
     21         "justification": "No dataset or structured corpus of reviewed papers is released. The paper does not provide a downloadable list of surveyed works or extracted metadata."
     22       },
     23       "environment_specified": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "No environment specification or dependency list is provided. The paper is a survey with no computational experiments, but could have released tooling; it did not."
     27       },
     28       "reproduction_instructions": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No reproduction instructions are provided. There is no description of how to replicate the literature review process (no search queries, databases, or inclusion criteria)."
     32       }
     33     },
     34     "statistical_methodology": {
     35       "confidence_intervals_or_error_bars": {
     36         "applies": false,
     37         "answer": false,
     38         "justification": "This is a survey/review paper that conducts no original statistical experiments. Confidence intervals are not applicable."
     39       },
     40       "significance_tests": {
     41         "applies": false,
     42         "answer": false,
     43         "justification": "No original comparative experiments are conducted. Significance tests do not apply to this survey paper."
     44       },
     45       "effect_sizes_reported": {
     46         "applies": false,
     47         "answer": false,
     48         "justification": "No original experiments are run. Effect size reporting is not applicable to this survey."
     49       },
     50       "sample_size_justified": {
     51         "applies": false,
     52         "answer": false,
     53         "justification": "No empirical experiment with a sample is conducted. Sample size justification does not apply."
     54       },
     55       "variance_reported": {
     56         "applies": false,
     57         "answer": false,
     58         "justification": "No original experiments are run. Variance reporting does not apply to this survey."
     59       }
     60     },
     61     "evaluation_design": {
     62       "baselines_included": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The survey mentions prior surveys (Wang et al., Xi et al., Luo et al.) in Section 1 but does not provide a structured comparison of coverage, quality, or methodology against them."
     66       },
     67       "baselines_contemporary": {
     68         "applies": false,
     69         "answer": false,
     70         "justification": "No experimental baselines are used. For a survey paper, this criterion about experimental baseline recency does not apply."
     71       },
     72       "ablation_study": {
     73         "applies": false,
     74         "answer": false,
     75         "justification": "No system is built or evaluated. There are no components to ablate in a survey paper."
     76       },
     77       "multiple_metrics": {
     78         "applies": false,
     79         "answer": false,
     80         "justification": "No original experiments are conducted. Multiple evaluation metrics do not apply."
     81       },
     82       "human_evaluation": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No system outputs are produced that would require human evaluation. This is a survey paper."
     86       },
     87       "held_out_test_set": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "No train/test data split is relevant for a survey paper."
     91       },
     92       "per_category_breakdown": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "The survey organizes reviewed systems into six taxonomy dimensions with per-category breakdowns. Tables 1-5 provide detailed comparisons of perception modules, memory architectures, action spaces, multi-agent frameworks, and cognitive architectures."
     96       },
     97       "failure_cases_discussed": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Section 8 ('Challenges and Future Directions') discusses failure modes including hallucination in action, infinite loops, and latency bottlenecks. Tables 1-5 include 'Critical limitation' or 'Primary limitation' columns for each system."
    101       },
    102       "negative_results_reported": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The survey reports negative results from cited works: WebArena shows <15% success on long-horizon tasks (Section 8.2); synchronous agents achieve 47% but drop to 11% in asynchronous settings (Section 7.2); prompt-only defenses are described as brittle (Section 7.4)."
    106       }
    107     },
    108     "claims_and_evidence": {
    109       "abstract_claims_supported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The abstract claims to propose a unified taxonomy covering Perception, Brain, Planning, Action, Tool Use, and Collaboration, and to review evaluation practices and open challenges. Sections 3-8 deliver on all these claims."
    113       },
    114       "causal_claims_justified": {
    115         "applies": false,
    116         "answer": false,
    117         "justification": "The paper is a survey that describes and synthesizes existing work. It does not make original causal claims requiring causal inference methodology."
    118       },
    119       "generalization_bounded": {
    120         "applies": true,
    121         "answer": false,
    122         "justification": "The paper makes broad claims about 'Agentic AI' while repeating specific quantitative results from narrow studies as general findings. Examples: '30% reduction in bugs' from ChatDev (Section 5.3.1), '100% hallucination reduction' in 'controlled environments' (Section 5.3.2), '50% navigation improvement' from VLM-GroNav (Section 6.2.3) — all presented without bounding to their specific experimental settings."
    123       },
    124       "alternative_explanations_discussed": {
    125         "applies": false,
    126         "answer": false,
    127         "justification": "This is a pure survey/taxonomy paper presenting no original empirical results. Alternative explanations for original findings are not applicable since it has no original empirical findings."
    128       }
    129     },
    130     "setup_transparency": {
    131       "model_versions_specified": {
    132         "applies": false,
    133         "answer": false,
    134         "justification": "No experiments are run by the authors. Model version specification does not apply to this survey paper."
    135       },
    136       "prompts_provided": {
    137         "applies": false,
    138         "answer": false,
    139         "justification": "The paper does not use prompting in its own methodology. It is a survey that reviews others' use of prompting."
    140       },
    141       "hyperparameters_reported": {
    142         "applies": false,
    143         "answer": false,
    144         "justification": "No experiments are run. Hyperparameter reporting does not apply to this survey."
    145       },
    146       "scaffolding_described": {
    147         "applies": false,
    148         "answer": false,
    149         "justification": "The paper does not build or deploy an agentic scaffold. It surveys existing scaffolds built by others."
    150       },
    151       "data_preprocessing_documented": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "No systematic literature search methodology is described. No search queries, databases searched, date ranges, inclusion/exclusion criteria, or paper counts at screening stages are provided. The review corpus selection is entirely opaque."
    155       }
    156     },
    157     "limitations_and_scope": {
    158       "limitations_section_present": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 8 ('Challenges and Future Directions') provides substantive discussion of open challenges and limitations including hallucination in action (8.1), infinite loops (8.2), latency and cost (8.3), alignment (8.4), open-ended learning (8.5), and theoretical limits (8.6)."
    162       },
    163       "threats_to_validity_specific": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "Section 8 discusses challenges in the field broadly but does not address threats to validity of the survey itself — no discussion of selection bias in reviewed literature, recency bias, coverage gaps, or limitations of the non-systematic review methodology."
    167       },
    168       "scope_boundaries_stated": {
    169         "applies": true,
    170         "answer": false,
    171         "justification": "The paper does not explicitly state what it did NOT cover or what claims it is not making. No explicit exclusion criteria or out-of-scope domains are defined. The title claims coverage of 'Agentic AI' broadly without bounding what is excluded."
    172       }
    173     },
    174     "data_integrity": {
    175       "raw_data_available": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No raw data (list of surveyed papers, extraction tables, coding sheets, or systematic review protocols) is released. The survey's evidence base cannot be independently verified."
    179       },
    180       "data_collection_described": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No systematic literature search process is described. The paper does not explain how papers were identified, what databases were searched, what search terms were used, or what time period was covered."
    184       },
    185       "recruitment_methods_described": {
    186         "applies": false,
    187         "answer": false,
    188         "justification": "No human participants are involved. Recruitment methods for participants are not applicable to this survey paper."
    189       },
    190       "data_pipeline_documented": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "There is no documented pipeline from paper discovery to inclusion in the survey. No filtering stages, criteria, or counts of papers at each stage are described."
    194       }
    195     },
    196     "conflicts_of_interest": {
    197       "funding_disclosed": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "No acknowledgments or funding section is present anywhere in the paper. There is no mention of funding sources, grants, or sponsors."
    201       },
    202       "affiliations_disclosed": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Author affiliations are clearly listed on the title page: University College of Engineering, Anna University; National Institute of Technology Tiruchirappalli; and University of Melbourne."
    206       },
    207       "funder_independent_of_outcome": {
    208         "applies": false,
    209         "answer": false,
    210         "justification": "No funding is disclosed. The paper appears to be unfunded university research. Funder independence cannot be assessed and is not applicable given the apparent lack of external funding."
    211       },
    212       "financial_interests_declared": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No competing interests statement is present in the paper. Absence of disclosure is not the same as absence of conflict."
    216       }
    217     },
    218     "contamination": {
    219       "training_cutoff_stated": {
    220         "applies": false,
    221         "answer": false,
    222         "justification": "The paper is a survey and does not evaluate any pre-trained model on any benchmark. Contamination questions do not apply."
    223       },
    224       "train_test_overlap_discussed": {
    225         "applies": false,
    226         "answer": false,
    227         "justification": "The paper does not evaluate a pre-trained model on a benchmark. Train/test overlap is not applicable."
    228       },
    229       "benchmark_contamination_addressed": {
    230         "applies": false,
    231         "answer": false,
    232         "justification": "The paper does not run any benchmarks itself. Contamination is not applicable."
    233       }
    234     },
    235     "human_studies": {
    236       "pre_registered": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "No human participants are involved in this survey paper."
    240       },
    241       "irb_or_ethics_approval": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No human participants. IRB approval is not applicable."
    245       },
    246       "demographics_reported": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants. Demographics are not applicable."
    250       },
    251       "inclusion_exclusion_criteria": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants. Participant inclusion/exclusion criteria are not applicable."
    255       },
    256       "randomization_described": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants. Randomization is not applicable."
    260       },
    261       "blinding_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants. Blinding is not applicable."
    265       },
    266       "attrition_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants. Attrition is not applicable."
    270       }
    271     },
    272     "cost_and_practicality": {
    273       "inference_cost_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "This is a survey paper with no original system or method. Inference cost of the survey's own method does not apply."
    277       },
    278       "compute_budget_stated": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "This is a survey paper. Compute budget for the survey itself is not applicable."
    282       }
    283     }
    284   }
    285 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs