scan.schema.json (44490B)
1 { 2 "$schema": "https://json-schema.org/draft/2020-12/schema", 3 "$id": "scan.schema.json", 4 "title": "Paper Scan Result", 5 "description": "Structured output from the scan agent for a single research paper. Quality assessment uses boolean checklist questions (verifiable, auditable) rather than subjective scores.", 6 "type": "object", 7 "required": [ 8 "paper", 9 "checklist", 10 "claims", 11 "methodology_tags", 12 "key_findings", 13 "red_flags", 14 "cited_papers" 15 ], 16 "properties": { 17 "paper": { 18 "type": "object", 19 "description": "Paper metadata.", 20 "required": ["title", "authors", "year"], 21 "properties": { 22 "title": { "type": "string" }, 23 "authors": { 24 "type": "array", 25 "items": { "type": "string" } 26 }, 27 "year": { "type": "integer" }, 28 "venue": { "type": "string" }, 29 "arxiv_id": { "type": "string", "pattern": "^\\d{4}\\.\\d{4,5}$" }, 30 "doi": { "type": "string" } 31 } 32 }, 33 "checklist": { 34 "type": "object", 35 "description": "Boolean quality checklist. Each question has two boolean fields — 'applies' (is this relevant to this paper type?) and 'answer' (does the paper satisfy the criterion?) — plus justification text. This two-field design separates applicability from compliance, eliminating ambiguity in NA boundary decisions.", 36 "required": [ 37 "artifacts", 38 "statistical_methodology", 39 "evaluation_design", 40 "claims_and_evidence", 41 "setup_transparency", 42 "limitations_and_scope", 43 "data_integrity", 44 "conflicts_of_interest", 45 "contamination", 46 "human_studies", 47 "cost_and_practicality" 48 ], 49 "properties": { 50 "artifacts": { 51 "type": "object", 52 "description": "Can someone reproduce this work from what was released?", 53 "required": [ 54 "code_released", 55 "data_released", 56 "environment_specified", 57 "reproduction_instructions" 58 ], 59 "properties": { 60 "code_released": { 61 "$ref": "#/$defs/checklist_item", 62 "description": "Is source code released (e.g., GitHub link, Zenodo archive)? 
Look for: repository URLs in the paper, footnotes, or abstract. A promise of future release counts as NO. Code 'available upon request' counts as NO. Only YES if a working URL or archive is provided." 63 }, 64 "data_released": { 65 "$ref": "#/$defs/checklist_item", 66 "description": "Is the dataset released or publicly available? Look for: dataset download links, references to public datasets used (e.g., 'we use the publicly available SWE-bench dataset' = YES). If they collected proprietary data and did not release it, NO. If the data is a standard public benchmark they didn't modify, YES." 67 }, 68 "environment_specified": { 69 "$ref": "#/$defs/checklist_item", 70 "description": "Are environment or dependency specifications provided? Look for: requirements.txt, Dockerfile, conda environment file, or a detailed 'Environment Setup' section listing library versions. Mentioning 'Python 3.x' alone is NOT enough — there must be enough detail to recreate the environment." 71 }, 72 "reproduction_instructions": { 73 "$ref": "#/$defs/checklist_item", 74 "description": "Are step-by-step reproduction instructions included? Look for: a README with commands to run, a 'Reproducing Results' section, or scripts that replicate the main experiments. The instructions must be specific enough that a competent researcher could follow them without guessing." 75 } 76 } 77 }, 78 "statistical_methodology": { 79 "type": "object", 80 "description": "Are the numbers treated with appropriate rigor?", 81 "required": [ 82 "confidence_intervals_or_error_bars", 83 "significance_tests", 84 "effect_sizes_reported", 85 "sample_size_justified", 86 "variance_reported" 87 ], 88 "properties": { 89 "confidence_intervals_or_error_bars": { 90 "$ref": "#/$defs/checklist_item", 91 "description": "Are confidence intervals or error bars reported for main results? Look for: CI notation (e.g., '95% CI [x, y]'), error bars on figures, ± notation in tables. 
If the paper reports only point estimates (e.g., '43.2% accuracy') with no uncertainty, NO." 92 }, 93 "significance_tests": { 94 "$ref": "#/$defs/checklist_item", 95 "description": "Are statistical significance tests used where claims of difference are made? Look for: p-values, t-tests, Mann-Whitney U, chi-squared, ANOVA, bootstrap tests, permutation tests. If the paper claims 'X outperforms Y' based solely on comparing two numbers without any test, NO. NA if the paper makes no comparative claims." 96 }, 97 "effect_sizes_reported": { 98 "$ref": "#/$defs/checklist_item", 99 "description": "Are effect sizes reported (not just p-values or raw differences)? Look for: Cohen's d, odds ratios, relative risk, percentage improvement with baseline context. A paper that says 'p < 0.05' without indicating the magnitude of the effect is NO. A paper that says '12% improvement over baseline (from 45% to 57%)' provides enough context for YES." 100 }, 101 "sample_size_justified": { 102 "$ref": "#/$defs/checklist_item", 103 "description": "Is the sample size justified or is a power analysis discussed? Look for: explicit justification for why N participants/examples were chosen, power analysis, or acknowledgment that the sample may be too small for certain claims. If N is small and no justification is given, NO. NA for theoretical papers." 104 }, 105 "variance_reported": { 106 "$ref": "#/$defs/checklist_item", 107 "description": "Is variance or standard deviation reported across experimental runs? Look for: std dev in tables, variance across seeds, interquartile range, multiple-run results with spread measures. If the paper reports single-run numbers only, NO. If it explicitly states 'averaged over K runs with std dev' YES. Reporting medians across runs WITHOUT any spread measure (std dev, IQR, min/max range) is NO — the reader cannot assess result stability." 
108 } 109 } 110 }, 111 "evaluation_design": { 112 "type": "object", 113 "description": "Is the evaluation designed to actually test the claims?", 114 "required": [ 115 "baselines_included", 116 "baselines_contemporary", 117 "ablation_study", 118 "multiple_metrics", 119 "human_evaluation", 120 "held_out_test_set", 121 "per_category_breakdown", 122 "failure_cases_discussed", 123 "negative_results_reported" 124 ], 125 "properties": { 126 "baselines_included": { 127 "$ref": "#/$defs/checklist_item", 128 "description": "Are baseline comparisons included? Look for: comparison against prior work, naive baselines, or ablated versions. A paper that only reports its own system's numbers with no comparison is NO. NA for papers that define a new task with no prior work." 129 }, 130 "baselines_contemporary": { 131 "$ref": "#/$defs/checklist_item", 132 "description": "Are the baselines contemporary and competitive? Look for: whether the baselines are recent and represent the state of the art, or whether they are suspiciously old/weak. If the newest baseline is 3+ years old when newer alternatives exist, NO. If the paper justifies why older baselines are appropriate, YES." 133 }, 134 "ablation_study": { 135 "$ref": "#/$defs/checklist_item", 136 "description": "Is there an ablation study showing which components matter? Look for: experiments that remove or modify individual components to measure their contribution. NA if the system has only one component." 137 }, 138 "multiple_metrics": { 139 "$ref": "#/$defs/checklist_item", 140 "description": "Are multiple evaluation metrics used? Look for: at least two different metrics (e.g., accuracy AND F1, or Pass@1 AND Pass@10). If the paper reports only a single metric, NO." 141 }, 142 "human_evaluation": { 143 "$ref": "#/$defs/checklist_item", 144 "description": "Is human evaluation included (not just automated metrics)? Look for: human ratings, manual inspection, user studies, expert review of the system's OUTPUTS. 
The humans must be evaluating what the system produced — manual classification of the benchmark or dataset itself does not count. If evaluation of the system is entirely automated (e.g., pass/fail on test suites), NO. NA if human evaluation is clearly irrelevant to the claims." 145 }, 146 "held_out_test_set": { 147 "$ref": "#/$defs/checklist_item", 148 "description": "Are results reported on a held-out test set (not the dev/validation set used for tuning)? Look for: explicit separation of dev and test splits. If unclear whether the reported numbers are on data used for any selection decisions, NO." 149 }, 150 "per_category_breakdown": { 151 "$ref": "#/$defs/checklist_item", 152 "description": "Are per-category or per-task breakdowns provided (not just overall averages)? Look for: tables showing performance on individual tasks, categories, or splits. A single aggregate number hides important variation — if a system scores 80% overall but 20% on hard cases, the average is misleading." 153 }, 154 "failure_cases_discussed": { 155 "$ref": "#/$defs/checklist_item", 156 "description": "Are failure cases shown or discussed? Look for: error analysis, qualitative examples of failures, discussion of where the approach breaks down. If the paper only shows successes, NO." 157 }, 158 "negative_results_reported": { 159 "$ref": "#/$defs/checklist_item", 160 "description": "Are negative results reported (things that didn't work)? Look for: ablations that hurt performance, approaches that were tried and abandoned, configurations that failed. If every experiment shows improvement, be skeptical — NO unless the paper explicitly addresses this." 
161 } 162 } 163 }, 164 "claims_and_evidence": { 165 "type": "object", 166 "description": "Do the claims stay within what the evidence supports?", 167 "required": [ 168 "abstract_claims_supported", 169 "causal_claims_justified", 170 "generalization_bounded", 171 "alternative_explanations_discussed" 172 ], 173 "properties": { 174 "abstract_claims_supported": { 175 "$ref": "#/$defs/checklist_item", 176 "description": "Are all claims in the abstract supported by results in the paper? Read the abstract and check each empirical claim against the results section. If the abstract says 'our method achieves state-of-the-art' but the results show it's second-best, NO. If the abstract hedges appropriately, YES." 177 }, 178 "causal_claims_justified": { 179 "$ref": "#/$defs/checklist_item", 180 "description": "If the paper makes causal claims, is the study design adequate for causal inference? Look for: RCT, natural experiment, instrumental variables, difference-in-differences, or other causal identification strategies. If the paper says 'X improves Y' from observational data without addressing confounds, NO. NA if no causal claims are made. Note: ablation studies ('removing component X reduces performance by Y%') ARE causal claims — check whether the ablation design is adequate (controlled single-variable manipulation counts as YES). Language like 'improves', 'causes', 'leads to', 'enables' signals causal claims." 181 }, 182 "generalization_bounded": { 183 "$ref": "#/$defs/checklist_item", 184 "description": "Are generalizations bounded to the tested setting? Look for: claims that extend beyond the tested models, languages, tasks, or populations. If the paper tests on Python and claims results for 'code generation' generally, NO. If it says 'on Python tasks with GPT-4', YES. Check the title and abstract — a broad title like 'LLM-based Software Engineering' when results are on a single benchmark in a single language is NO." 
185 }, 186 "alternative_explanations_discussed": { 187 "$ref": "#/$defs/checklist_item", 188 "description": "Are alternative explanations for the results discussed? Look for: consideration of confounds, other factors that could explain the results, robustness checks. If the paper presents one interpretation without considering alternatives, NO. A threats-to-validity section counts only if it discusses specific alternative explanations for the observed results, not just generic methodological limitations. NA only for papers that present no empirical results (e.g., pure surveys or taxonomies)." 189 }, 190 "proxy_outcome_distinction": { 191 "$ref": "#/$defs/checklist_item", 192 "description": "Does the paper distinguish between the proxy it measures and the outcome it claims? Three things matter: (1) what was actually measured (e.g., task completion time, lines of code, pass@1), (2) what the paper frames this as (e.g., 'productivity', 'code quality', 'developer effectiveness'), and (3) whether the gap between measurement and framing is acknowledged. A paper that measures lines added per hour and calls it 'productivity' without discussing what productivity actually entails is NO. A paper that says 'we use task completion time as a proxy for productivity, noting this does not capture code quality, maintenance burden, or team dynamics' is YES. If the paper's claims match the granularity of its measurements (e.g., 'pass@1 on HumanEval' without broader framing), YES — no proxy gap exists. NA only for theoretical papers with no measurements." 
193 } 194 } 195 }, 196 "setup_transparency": { 197 "type": "object", 198 "description": "Is the experimental setup described well enough to understand what was actually tested?", 199 "required": [ 200 "model_versions_specified", 201 "prompts_provided", 202 "hyperparameters_reported", 203 "scaffolding_described", 204 "data_preprocessing_documented" 205 ], 206 "properties": { 207 "model_versions_specified": { 208 "$ref": "#/$defs/checklist_item", 209 "description": "Are exact model versions or sizes specified? Look for: specific model names with version (e.g., 'gpt-4-0613', 'claude-3-5-sonnet-20240620', 'Llama-2-70b-chat'). If the paper says just 'GPT-4' or 'Claude' without a version or snapshot date, NO — model behavior changes across versions. Marketing names like 'Gemini-2.5' or 'GPT-4o' without a snapshot date or API version do NOT count as specified versions." 210 }, 211 "prompts_provided": { 212 "$ref": "#/$defs/checklist_item", 213 "description": "Are the prompts or system instructions used in experiments provided? Look for: full prompt text in the paper or appendix, or a link to a repository containing prompts. If prompts are described only in natural language ('we asked the model to...') without the actual text, NO. A prompt TEMPLATE with placeholders (e.g., '[Task Description]') does NOT count unless the actual fill values are also provided — the reader must be able to reconstruct every prompt sent to the model. NA if the paper does not use prompting." 214 }, 215 "hyperparameters_reported": { 216 "$ref": "#/$defs/checklist_item", 217 "description": "Are hyperparameters reported (temperature, top-p, max tokens, learning rate, etc.)? Look for: a hyperparameters table or section. If the paper uses an LLM API without stating temperature/sampling settings, NO — these significantly affect output." 218 }, 219 "scaffolding_described": { 220 "$ref": "#/$defs/checklist_item", 221 "description": "If the approach uses agentic scaffolding, is it described in detail? 
Look for: tool descriptions, workflow diagrams, retry logic, feedback mechanisms, memory/context management. If the paper says 'we used an agent' without describing the scaffold, NO. NA if no scaffolding is used. Also NA if the paper evaluates third-party tools (e.g., Claude Code, Copilot) as black boxes — the authors cannot be expected to describe internal scaffolding they have no access to." 222 }, 223 "data_preprocessing_documented": { 224 "$ref": "#/$defs/checklist_item", 225 "description": "Are data preprocessing and filtering steps documented? Look for: how raw data was cleaned, filtered, or transformed before use. If the paper goes from 'we collected data' to 'here are the results' without describing intermediate processing, NO. For survey papers: describing the filtering pipeline stages with counts (e.g., '500 initial results → 200 after title screening → 80 after full-text review') is YES only if the actual filtering CRITERIA at each stage are also stated. Listing stages without criteria is NO." 226 } 227 } 228 }, 229 "limitations_and_scope": { 230 "type": "object", 231 "description": "Does the paper honestly discuss what it does not show?", 232 "required": [ 233 "limitations_section_present", 234 "threats_to_validity_specific", 235 "scope_boundaries_stated" 236 ], 237 "properties": { 238 "limitations_section_present": { 239 "$ref": "#/$defs/checklist_item", 240 "description": "Is there a limitations or threats-to-validity section? Look for: a dedicated section or subsection titled 'Limitations', 'Threats to Validity', or similar. A single sentence buried in the conclusion does not count — there must be substantive discussion." 241 }, 242 "threats_to_validity_specific": { 243 "$ref": "#/$defs/checklist_item", 244 "description": "Are specific threats to validity discussed (not just boilerplate)? Look for: threats that are specific to THIS study, not generic disclaimers like 'our results may not generalize.' 
Good: 'Our sample of 16 developers is too small for subgroup analysis.' Bad: 'More research is needed.' If the limitations are all generic, NO." 245 }, 246 "scope_boundaries_stated": { 247 "$ref": "#/$defs/checklist_item", 248 "description": "Are scope boundaries explicitly stated (what the results do NOT show)? Look for: explicit statements about what was not tested, what populations/settings are excluded, what claims the authors are NOT making. The METR paper's Table 2 ('What the evidence does not show') is the gold standard. Generic limitations like 'our results may not generalize' do NOT count — the paper must state specific things it did NOT test or claim." 249 } 250 } 251 }, 252 "data_integrity": { 253 "type": "object", 254 "description": "Can the underlying data be verified? Inspired by cases like the Wakefield MMR paper where fabricated data went undetected for 12 years because no one could check it.", 255 "required": [ 256 "raw_data_available", 257 "data_collection_described", 258 "recruitment_methods_described", 259 "data_pipeline_documented" 260 ], 261 "properties": { 262 "raw_data_available": { 263 "$ref": "#/$defs/checklist_item", 264 "description": "Is raw data available for independent verification? Look for: data downloads, supplementary data files, database access. If only processed/aggregated results are shown with no way to verify the underlying data, NO. This is the check that would have caught Wakefield — if the raw medical records had been available, fabrication would have been detected immediately." 265 }, 266 "data_collection_described": { 267 "$ref": "#/$defs/checklist_item", 268 "description": "Is the data collection procedure described in detail? Look for: how data was gathered, what instruments were used, what time period, what inclusion/exclusion criteria. If the paper says 'we collected N examples' without explaining how, NO." 
269 }, 270 "recruitment_methods_described": { 271 "$ref": "#/$defs/checklist_item", 272 "description": "Are participant or sample recruitment methods described? Look for: how participants were found, what channels were used, whether recruitment could introduce bias. Wakefield recruited through anti-vaccine activists, biasing the sample. If participants/samples were selected without description of the selection process, NO. For crowd-sourced events (competitions, red-teaming), simply stating 'we ran a competition' is not enough — describe how participants were recruited and whether this introduces selection bias. NA if no human participants and data source is a standard benchmark." 273 }, 274 "data_pipeline_documented": { 275 "$ref": "#/$defs/checklist_item", 276 "description": "Is the full data pipeline from collection to final analysis documented? Look for: each transformation step, filtering criteria and how many examples were removed at each stage, any manual annotation steps. If there are unexplained jumps (e.g., 'we started with 1000 examples' then results show 500 with no explanation), NO." 277 } 278 } 279 }, 280 "conflicts_of_interest": { 281 "type": "object", 282 "description": "Are potential biases from funding, affiliation, or financial interest disclosed?", 283 "required": [ 284 "funding_disclosed", 285 "affiliations_disclosed", 286 "funder_independent_of_outcome", 287 "financial_interests_declared" 288 ], 289 "properties": { 290 "funding_disclosed": { 291 "$ref": "#/$defs/checklist_item", 292 "description": "Is the funding source disclosed? Look for: an acknowledgments section listing grants, corporate sponsors, or funding agencies. If there is no mention of funding at all, NO. NA only if it's clearly unfunded work (e.g., a solo independent researcher)." 293 }, 294 "affiliations_disclosed": { 295 "$ref": "#/$defs/checklist_item", 296 "description": "Are author affiliations with the evaluated product or company disclosed? 
Look for: authors who work at the company whose product is being tested. If Google employees evaluate Gemini, or OpenAI employees evaluate GPT, this must be prominent. If affiliations are listed but the conflict is not explicitly acknowledged, still YES for this question (the conflict-of-interest flag is separate)." 297 }, 298 "funder_independent_of_outcome": { 299 "$ref": "#/$defs/checklist_item", 300 "description": "Is the funder independent of the outcome? Look for: whether the entity paying for the research has a financial interest in a particular result. Wakefield was secretly paid by lawyers suing vaccine makers. A paper funded by OpenAI evaluating GPT-4 has a non-independent funder. YES if the funder has no stake in the results, NO if they do, NA if unfunded." 301 }, 302 "financial_interests_declared": { 303 "$ref": "#/$defs/checklist_item", 304 "description": "Do the authors declare whether they hold patents, equity, or other financial interests related to the findings? Look for: competing interests statements, patent disclosures, author-affiliated startups. If there is no competing interests statement at all, NO — absence of disclosure is not the same as absence of conflict." 305 } 306 } 307 }, 308 "contamination": { 309 "type": "object", 310 "description": "Could the model have seen the test data during training? This is the 'you measured it wrong' problem — if the benchmark is in the training data, the results are meaningless.", 311 "required": [ 312 "training_cutoff_stated", 313 "train_test_overlap_discussed", 314 "benchmark_contamination_addressed" 315 ], 316 "properties": { 317 "training_cutoff_stated": { 318 "$ref": "#/$defs/checklist_item", 319 "description": "Is the model's training data cutoff date stated? Look for: explicit mention of when the training data ends. This is necessary to assess whether test examples could have been in the training set. If the paper uses a model without stating when its training data was collected, NO. 
NA if the paper does not evaluate a pre-trained model's capability on any benchmark (e.g., mining studies, interview studies, surveys, or studies that test defenses/tools rather than model knowledge)." 320 }, 321 "train_test_overlap_discussed": { 322 "$ref": "#/$defs/checklist_item", 323 "description": "Is potential train/test overlap discussed? Look for: any analysis of whether test examples appeared in the training data. Canary strings, membership inference, or temporal splits all count. If the paper uses a public benchmark with a model that could have trained on it and doesn't address this, NO. NA if the paper does not evaluate a pre-trained model on any benchmark (same NA rule as training_cutoff_stated)." 324 }, 325 "benchmark_contamination_addressed": { 326 "$ref": "#/$defs/checklist_item", 327 "description": "Does the paper address the risk that benchmark examples were available online before the model's training cutoff? Look for: whether the benchmark was published before the model's training data was collected. HumanEval was published in 2021; any model trained after 2021 may have seen it. If the paper uses such a benchmark without discussing contamination risk, NO. NA if using a benchmark created after the model's training cutoff, OR if the paper does not evaluate a pre-trained model on any benchmark (same NA rule as training_cutoff_stated)." 328 } 329 } 330 }, 331 "human_studies": { 332 "type": "object", 333 "description": "For papers involving human participants. All items NA if the paper has no human subjects.", 334 "required": [ 335 "pre_registered", 336 "irb_or_ethics_approval", 337 "demographics_reported", 338 "inclusion_exclusion_criteria", 339 "randomization_described", 340 "blinding_described", 341 "attrition_reported" 342 ], 343 "properties": { 344 "pre_registered": { 345 "$ref": "#/$defs/checklist_item", 346 "description": "Is the study pre-registered? Look for: a link to a pre-registration (OSF, AsPredicted, ClinicalTrials.gov, AEA registry). 
Pre-registration commits the researchers to their analysis plan before seeing the data, preventing p-hacking and outcome switching. Very rare in CS but standard in medicine. NA if no human participants. Mining public repositories or analyzing public data does NOT count as involving human participants — use NA." 347 }, 348 "irb_or_ethics_approval": { 349 "$ref": "#/$defs/checklist_item", 350 "description": "Is IRB or ethics board approval mentioned? Look for: 'This study was approved by [institution] IRB' or equivalent. If the study collects data from human participants without mentioning ethics review, NO. NA if no human participants." 351 }, 352 "demographics_reported": { 353 "$ref": "#/$defs/checklist_item", 354 "description": "Are participant demographics reported? Look for: experience level, years of experience, gender, geographic distribution, programming languages known, company size. If the paper says 'N developers' without characterizing them, NO. NA if no human participants." 355 }, 356 "inclusion_exclusion_criteria": { 357 "$ref": "#/$defs/checklist_item", 358 "description": "Are inclusion and exclusion criteria for participants stated? Look for: who was eligible, who was excluded and why, any screening process. If participants just 'were recruited' with no selection criteria, NO. NA if no human participants." 359 }, 360 "randomization_described": { 361 "$ref": "#/$defs/checklist_item", 362 "description": "Is the randomization procedure described (if applicable)? Look for: how participants were assigned to conditions, whether randomization was stratified, what tool was used. If the paper compares treatment vs. control without explaining how assignment worked, NO. NA if not an experimental study (e.g., cross-sectional surveys, observational studies, repository mining) or no human participants." 363 }, 364 "blinding_described": { 365 "$ref": "#/$defs/checklist_item", 366 "description": "Is blinding described (if applicable)? 
Look for: whether participants knew which condition they were in, whether evaluators knew which outputs came from which system. If applicable and not mentioned, NO. NA if blinding is not feasible, no human participants, or not an experimental study (e.g., cross-sectional surveys, observational studies)." 367 }, 368 "attrition_reported": { 369 "$ref": "#/$defs/checklist_item", 370 "description": "Is participant attrition or dropout reported? Look for: how many participants started vs. finished, reasons for dropout, whether intention-to-treat analysis was used. If participants are mentioned at the start but the final N is smaller with no explanation, NO. NA if no human participants." 371 } 372 } 373 }, 374 "cost_and_practicality": { 375 "type": "object", 376 "description": "Is the practical cost of the approach reported? Important for assessing real-world applicability.", 377 "required": [ 378 "inference_cost_reported", 379 "compute_budget_stated" 380 ], 381 "properties": { 382 "inference_cost_reported": { 383 "$ref": "#/$defs/checklist_item", 384 "description": "Is inference cost or latency reported? Look for: API costs, tokens consumed, wall-clock time, cost per example. If the paper proposes a method that calls GPT-4 100 times per example without mentioning cost, NO. NA if cost is clearly irrelevant (e.g., theoretical paper, survey paper). If a survey reports costs of systems it reviews, that does NOT count — this question asks about the cost of the paper's own method." 385 }, 386 "compute_budget_stated": { 387 "$ref": "#/$defs/checklist_item", 388 "description": "Is the total computational budget stated? Look for: GPU hours, total API spend, hardware used, training time. If the approach required significant compute and this is not quantified, NO." 389 } 390 } 391 }, 392 "experimental_rigor": { 393 "type": "object", 394 "description": "Conditional module: activated when methodology_tags includes 'benchmark-eval'. 
Addresses systematic issues identified by Henderson et al. (2018), Dodge et al. (2019), and Lucic et al. (2018) in ML experimental methodology. NOT activated for 'rct' — human-subjects RCTs are covered by the human_studies category.", 395 "properties": { 396 "seed_sensitivity_reported": { 397 "$ref": "#/$defs/checklist_item", 398 "description": "Are results reported across multiple random seeds? Look for: results tables showing mean/std across seeds, or explicit statement of seed sensitivity analysis. Henderson et al. (2018) showed RL results can vary by 2x across seeds. If the paper reports single-seed results, NO." 399 }, 400 "number_of_runs_stated": { 401 "$ref": "#/$defs/checklist_item", 402 "description": "Is the exact number of experimental runs explicitly stated? Look for: 'averaged over K runs', 'N trials', or equivalent. If results are presented without stating how many runs produced them, NO." 403 }, 404 "hyperparameter_search_budget": { 405 "$ref": "#/$defs/checklist_item", 406 "description": "Is the hyperparameter search budget reported? Look for: number of configurations tried, search method (grid, random, Bayesian), total compute spent on search. Dodge et al. (2019) showed that search budget dramatically affects reported results. If hyperparameters appear tuned but no search budget is stated, NO." 407 }, 408 "best_config_selection_justified": { 409 "$ref": "#/$defs/checklist_item", 410 "description": "Is the selection of the best configuration justified and not cherry-picked? Look for: selection on validation set (not test), clear description of selection criterion, or reporting all configurations tried. If only the best result is shown with no explanation of how it was selected, NO." 411 }, 412 "multiple_comparison_correction": { 413 "$ref": "#/$defs/checklist_item", 414 "description": "When multiple statistical tests are performed, is correction for multiple comparisons applied? 
Look for: Bonferroni, Holm, Benjamini-Hochberg, or other family-wise error rate or false discovery rate corrections. If the paper runs many comparisons and reports p-values without correction, NO. NA if only one or two comparisons are made." 415 }, 416 "self_comparison_bias_addressed": { 417 "$ref": "#/$defs/checklist_item", 418 "description": "Do the authors acknowledge the bias of evaluating their own system? Look for: explicit discussion of author-evaluation bias, independent evaluation, or mitigation strategies. Lucic et al. (2018) showed that authors' implementations of baselines systematically underperform. If authors compare their system against their own re-implementation of baselines without acknowledging this bias, NO." 419 }, 420 "compute_budget_vs_performance": { 421 "$ref": "#/$defs/checklist_item", 422 "description": "Is performance reported as a function of compute budget? Look for: performance curves across compute levels, or explicit comparison at matched compute budgets. If the proposed method uses 10x more compute than baselines and this is not discussed, NO. NA if compute differences are negligible." 423 }, 424 "benchmark_construct_validity": { 425 "$ref": "#/$defs/checklist_item", 426 "description": "Does the paper discuss whether the benchmark actually measures what is claimed? Look for: analysis of what the benchmark tests vs. what the paper claims to evaluate, discussion of construct validity, or comparison with alternative benchmarks. Kapoor & Narayanan (2024) documented widespread validity gaps. If the paper uses a benchmark without questioning whether it measures the claimed capability, NO." 427 }, 428 "scaffold_confound_addressed": { 429 "$ref": "#/$defs/checklist_item", 430 "description": "When comparing models or reporting benchmark results, is the scaffolding/tooling confound addressed? SWE-bench scores vary from 2.7% to 28.3% for the same model depending on scaffold — the scaffold effect often exceeds the model effect. 
Look for: same scaffold used across model comparisons, scaffold described as a variable, or results reported for multiple scaffolds. If the paper compares Model A in Scaffold X vs Model B in Scaffold Y and attributes the difference to the model, NO. If the paper evaluates a tool (e.g., Cursor, Copilot) as a bundled product without claiming to isolate the model, NA — the scaffold IS the thing being tested. NA also if no scaffolding is involved." 431 } 432 } 433 }, 434 "data_leakage": { 435 "type": "object", 436 "description": "Conditional module: activated when methodology_tags includes 'benchmark-eval'. Addresses the taxonomy of leakage types from Kapoor & Narayanan (2024).", 437 "properties": { 438 "temporal_leakage_addressed": { 439 "$ref": "#/$defs/checklist_item", 440 "description": "Is temporal leakage addressed? Look for: discussion of whether training data includes information from after the prediction target's time period, or whether benchmark problems existed before model training. If a model trained on 2024 data is tested on tasks created in 2022, the model may have seen solutions. NO if this is not discussed." 441 }, 442 "feature_leakage_addressed": { 443 "$ref": "#/$defs/checklist_item", 444 "description": "Is feature leakage addressed? Look for: discussion of whether input features contain information that would not be available at prediction time, or whether the evaluation setup leaks answer information through context. If the test harness provides hints not available in real usage, NO." 445 }, 446 "non_independence_addressed": { 447 "$ref": "#/$defs/checklist_item", 448 "description": "Is non-independence of train and test data addressed? Look for: discussion of whether train and test examples are drawn from the same underlying sources or share structural similarities (e.g., same repositories, same authors, duplicate or near-duplicate problems). If the paper does not verify independence, NO." 
449 }, 450 "leakage_detection_method": { 451 "$ref": "#/$defs/checklist_item", 452 "description": "Is a concrete leakage detection or prevention method used? Look for: canary strings, membership inference tests, n-gram overlap analysis, temporal splits, decontamination pipelines. If the paper only discusses leakage conceptually without applying a detection method, NO." 453 } 454 } 455 }, 456 "survey_methodology": { 457 "type": "object", 458 "description": "Conditional module: activated when methodology_tags includes 'meta-analysis'. Assesses whether surveys and systematic reviews follow structured review protocols.", 459 "properties": { 460 "prisma_or_structured_protocol": { 461 "$ref": "#/$defs/checklist_item", 462 "description": "Does the survey follow PRISMA or another structured review protocol? Look for: PRISMA flow diagram, explicit protocol registration, structured search strategy with reproducible queries, or reference to an established review methodology. Ad-hoc paper collection without a systematic protocol is NO." 463 }, 464 "quality_assessment_of_sources": { 465 "$ref": "#/$defs/checklist_item", 466 "description": "Does the survey assess the quality of its source papers? Look for: quality scoring rubric, risk-of-bias assessment, or structured evaluation of included studies. If the survey treats all papers equally regardless of methodological quality, NO. Leech et al. and the Trust AI Benchmarks paper both note that surveys without quality assessment launder weak results." 467 }, 468 "publication_bias_discussed": { 469 "$ref": "#/$defs/checklist_item", 470 "description": "Does the survey discuss publication bias? Look for: funnel plots, discussion of negative-result underrepresentation, acknowledgment that published papers skew positive, or tests for publication bias (Egger's test, trim-and-fill). If the survey does not consider whether its sources are biased toward positive results, NO." 
471 } 472 } 473 } 474 } 475 }, 476 "scan_version": { 477 "type": "integer", 478 "description": "Schema version. 1 = base 50 questions only. 2 = base + conditional modules. Omitted = 1.", 479 "default": 1 480 }, 481 "active_modules": { 482 "type": "array", 483 "description": "Which conditional checklist modules were activated for this paper, based on methodology_tags. Empty or omitted for v1 scans.", 484 "items": { 485 "type": "string", 486 "enum": ["experimental_rigor", "data_leakage", "survey_methodology"] 487 } 488 }, 489 "claims": { 490 "type": "array", 491 "description": "Key empirical claims extracted from the paper with supporting evidence.", 492 "items": { 493 "type": "object", 494 "required": ["claim", "evidence", "supported"], 495 "properties": { 496 "claim": { 497 "type": "string", 498 "description": "The claim as stated or paraphrased from the paper." 499 }, 500 "evidence": { 501 "type": "string", 502 "description": "The evidence cited in support, with page/section references." 503 }, 504 "supported": { 505 "type": "string", 506 "enum": ["strong", "moderate", "weak", "unsupported"], 507 "description": "How well the evidence supports the claim." 508 } 509 } 510 } 511 }, 512 "methodology_tags": { 513 "type": "array", 514 "description": "Methodology type tags assigned by the scan agent.", 515 "items": { 516 "type": "string", 517 "enum": [ 518 "rct", 519 "observational", 520 "benchmark-eval", 521 "case-study", 522 "meta-analysis", 523 "theoretical", 524 "qualitative" 525 ] 526 } 527 }, 528 "key_findings": { 529 "type": "string", 530 "description": "Brief summary of the paper's key findings (2-4 sentences)." 531 }, 532 "red_flags": { 533 "type": "array", 534 "description": "Methodological red flags identified during the scan.", 535 "items": { 536 "type": "object", 537 "required": ["flag", "detail"], 538 "properties": { 539 "flag": { 540 "type": "string", 541 "description": "Short label for the red flag." 
542 }, 543 "detail": { 544 "type": "string", 545 "description": "Explanation of why this is a concern." 546 } 547 } 548 } 549 }, 550 "cited_papers": { 551 "type": "array", 552 "description": "Papers cited in this paper that are relevant to the survey scope. Used for citation-chasing: these become candidates for the registry.", 553 "items": { 554 "type": "object", 555 "required": ["title", "relevance"], 556 "properties": { 557 "title": { 558 "type": "string", 559 "description": "Title of the cited paper as it appears in the references." 560 }, 561 "authors": { 562 "type": "array", 563 "items": { "type": "string" }, 564 "description": "Author names if available from the reference." 565 }, 566 "year": { 567 "type": "integer", 568 "description": "Publication year if available." 569 }, 570 "arxiv_id": { 571 "type": "string", 572 "pattern": "^\\d{4}\\.\\d{4,5}$", 573 "description": "arXiv ID if available." 574 }, 575 "doi": { 576 "type": "string", 577 "description": "DOI if available." 578 }, 579 "relevance": { 580 "type": "string", 581 "description": "Why this cited paper is relevant to the survey (1 sentence)." 582 } 583 } 584 } 585 } 586 }, 587 "$defs": { 588 "checklist_item": { 589 "type": "object", 590 "required": ["applies", "answer", "justification"], 591 "properties": { 592 "applies": { 593 "type": "boolean", 594 "description": "Does this criterion apply to this paper type? false = structurally inapplicable (e.g., human_studies questions for a benchmark paper). true = the criterion is applicable, even if the paper does not satisfy it." 595 }, 596 "answer": { 597 "type": "boolean", 598 "description": "Does the paper satisfy this criterion? Only meaningful when applies=true. Set to false when applies=false." 599 }, 600 "justification": { 601 "type": "string", 602 "description": "1-3 sentences explaining the answer. When applies=true: cite specific sections for answer=true, or state what is missing for answer=false. 
When applies=false: state why the criterion does not apply to this paper type." 603 } 604 } 605 } 606 } 607 }