calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (15657B)
      1 {
      2   "paper_slug": "advancements-generative-ai-2023",
      3   "calibration_date": "2026-02-28",
      4   "total_questions": 50,
      5   "agreement_count": 49,
      6   "disagreement_count": 1,
      7   "agreement_rate": 0.98,
      8   "disagreements": [
      9     {
     10       "category": "evaluation_design",
     11       "question": "baselines_contemporary",
     12       "sonnet": {
     13         "applies": false,
     14         "answer": false,
     15         "justification": "Since no structured baselines are included, this criterion does not apply."
     16       },
     17       "opus": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "The survey could and should have compared itself against prior generative AI surveys (e.g., the Zhang et al. 2023 survey it cites). Since no baselines are included at all, they cannot be contemporary. The criterion is applicable because a survey paper can include baseline comparisons against prior surveys, but the paper simply did not do so."
     21       },
     22       "direction": "applies_boundary",
     23       "explanation": "Sonnet treats baselines_contemporary as NA because no baselines exist, reasoning that you cannot assess contemporaneity of nonexistent baselines. Opus treats it as applicable because the scan-agent instructions for survey papers indicate baselines_included applies=true, and if a survey could have included baselines, it could also have included contemporary ones. The failure to include any baselines does not make the criterion structurally inapplicable — it makes the answer false."
     24     }
     25   ],
     26   "opus_checklist": {
     27     "artifacts": {
     28       "code_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No code repository, analysis scripts, or archive URLs are provided anywhere in the paper."
     32       },
     33       "data_released": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No dataset is released. The survey does not release its corpus of reviewed papers, and the informal comparisons (ChatGPT vs. Bard screenshots) involve no released data."
     37       },
     38       "environment_specified": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No environment specifications, library versions, or setup instructions are provided. A survey can release analysis code with environment specs, but this paper does not."
     42       },
     43       "reproduction_instructions": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No reproduction instructions are provided. There are no scripts, README, or step-by-step directions to replicate any portion of this work."
     47       }
     48     },
     49     "statistical_methodology": {
     50       "confidence_intervals_or_error_bars": {
     51         "applies": false,
     52         "answer": false,
     53         "justification": "This is a descriptive survey paper with no experiments producing quantitative results that would require confidence intervals or error bars."
     54       },
     55       "significance_tests": {
     56         "applies": false,
     57         "answer": false,
     58         "justification": "No formal experiments are conducted and no comparative statistical claims are made. Significance tests are not applicable to a descriptive survey."
     59       },
     60       "effect_sizes_reported": {
     61         "applies": false,
     62         "answer": false,
     63         "justification": "No formal experiments are run. Effect sizes are not applicable to a descriptive survey paper."
     64       },
     65       "sample_size_justified": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "No experimental sample is used. The paper is a descriptive survey without controlled experiments."
     69       },
     70       "variance_reported": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "No repeated experiments are conducted. Variance reporting is not applicable to this descriptive survey."
     74       }
     75     },
     76     "evaluation_design": {
     77       "baselines_included": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "The paper does not compare against any prior survey of generative AI. The informal experiments (ChatGPT vs. Bard on 3 questions, Firefly vs. Stable Diffusion on 1 prompt) lack controlled baselines and proper methodology."
     81       },
     82       "baselines_contemporary": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "No baselines are included at all, so they cannot be contemporary. The survey could have compared against prior surveys such as the Zhang et al. 2023 survey it cites, but it does not. The criterion applies because a survey paper can and should include contemporary baselines."
     86       },
     87       "ablation_study": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "No system is proposed or evaluated. This is a descriptive survey paper with no novel system to ablate."
     91       },
     92       "multiple_metrics": {
     93         "applies": false,
     94         "answer": false,
     95         "justification": "No formal evaluation with metrics is conducted. The paper contains only informal visual/anecdotal comparisons without any quantitative metrics."
     96       },
     97       "human_evaluation": {
     98         "applies": false,
     99         "answer": false,
    100         "justification": "No formal human evaluation of system outputs is performed. The informal comparisons involve the authors' own subjective impressions without structured evaluation methodology."
    101       },
    102       "held_out_test_set": {
    103         "applies": false,
    104         "answer": false,
    105         "justification": "No formal train/test split or held-out test set is relevant to this descriptive survey."
    106       },
    107       "per_category_breakdown": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The paper is organized by model type (GANs, GPT, Autoencoders, Diffusion Models, Transformers) and by task type (text, image, video, code, music, speech, scientific content generation), providing systematic breakdowns across these categories."
    111       },
    112       "failure_cases_discussed": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section II.C.2 discusses GAN failure modes (mode collapse, non-convergence, gradient vanishing, instability). Section V discusses broader challenges of generative AI including privacy, security, and misinformation."
    116       },
    117       "negative_results_reported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper reports known technical limitations (GAN mode collapse, gradient vanishing) and broader negative consequences (sophisticated cyberwarfare, impersonation, misinformation, job displacement) from the literature."
    121       }
    122     },
    123     "claims_and_evidence": {
    124       "abstract_claims_supported": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "The abstract claims 'a comprehensive review' and the conclusion states the paper 'benchmarked state-of-the-art tools against Generative AI.' The actual benchmarking consists of informal screenshots comparing ChatGPT vs. Bard on 3 unstructured questions and Firefly vs. Stable Diffusion on 1 image prompt, which does not support the claim of systematic benchmarking or comprehensive review."
    128       },
    129       "causal_claims_justified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "The paper makes no causal claims. 'ChatGPT outperformed Bard' is a comparative performance claim, not a causal claim. The paper is a descriptive survey without causal inference language directed at its own findings."
    133       },
    134       "generalization_bounded": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The title claims 'Comprehensive Review' and the paper makes broad claims about Generative AI across industries. The coverage is selective and lacks a systematic methodology, and the scope is not bounded to specific models, time periods, or domains."
    138       },
    139       "alternative_explanations_discussed": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No alternative explanations for the informal comparison results are discussed. The claim that 'ChatGPT outperformed Bard' is presented without considering prompt sensitivity, version differences, evaluation criteria, or other confounds."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "The paper refers to 'ChatGPT', 'Bard', 'Stable Diffusion', and 'Firefly' in its informal experiments without specifying exact model versions, API versions, or snapshot dates."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "For the text generation comparison (Section III.A), the three prompts are explicitly stated: 'Provide a brief description of what Bard is in one paragraph', 'Provide a brief description of what ChatGPT is in one paragraph', and 'Habari za saa hizi'. For image generation (Section III.B), the prompt 'College Student Programming' is provided."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No hyperparameters (temperature, top-p, max tokens, etc.) are reported for any of the model interactions. No API settings are described."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "No agentic scaffolding is used. The paper directly uses commercial tools (ChatGPT, Bard, Firefly, Stable Diffusion) as black boxes."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No systematic literature search methodology is described. There are no search queries, databases, filtering criteria, inclusion/exclusion criteria, or PRISMA-style documentation of the paper selection process."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "There is no dedicated limitations or threats-to-validity section. Section V discusses challenges of generative AI in general but not the methodological limitations of this paper itself."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "No specific threats to validity of the paper's own claims or methodology are discussed. There is no acknowledgment that the informal comparisons are insufficient to support performance claims."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No explicit scope boundaries are stated. The paper presents itself as a 'comprehensive review' without bounding its claims to specific models, time periods, or domains, and without stating what it does NOT cover."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "No raw data (model outputs, screenshots in raw form, survey paper corpus) is made available for independent verification."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No systematic data collection procedure is described. The paper does not explain how papers or tools were identified and selected for inclusion in the review."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants are involved. This is a survey paper that collects no human subjects data, and the data source is not a standard benchmark."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No pipeline from paper/tool collection to final analysis is documented. No search queries, databases, or filtering steps are described."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section mentioning grants or sponsors."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly listed: Bowie State University, Morgan State University, and University of the District of Columbia. None are affiliated with the companies whose tools are informally evaluated."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "No funding is disclosed. Per schema guidance, this is treated as unfunded academic work (NA). The authors are at universities with no disclosed external funding."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "There is no competing interests statement or financial disclosure anywhere in the paper. Absence of disclosure is not the same as absence of conflict."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "This is a descriptive survey paper that does not evaluate a pre-trained model's capability on any benchmark. The informal comparisons are not benchmark evaluations."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No benchmark evaluation of a pre-trained model is conducted. Contamination analysis is not applicable to this survey paper."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No benchmark is used to evaluate model capability. Contamination is not applicable to this descriptive survey."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants are involved in this survey paper."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants are involved in this survey paper."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants are involved in this survey paper."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants are involved in this survey paper."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants are involved in this survey paper."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants are involved in this survey paper."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants are involved in this survey paper."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": false,
    290         "answer": false,
    291         "justification": "This is a survey paper that does not propose a method. Cost reporting is not applicable per the schema guidance for survey papers."
    292       },
    293       "compute_budget_stated": {
    294         "applies": false,
    295         "answer": false,
    296         "justification": "This is a survey paper that does not propose a method requiring computational budget reporting."
    297       }
    298     }
    299   }
    300 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs