ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.schema.json (42226B)


      1 {
      2   "$schema": "https://json-schema.org/draft/2020-12/schema",
      3   "$id": "scan-v4.schema.json",
      4   "title": "V4 Paper Scan Result",
      5   "description": "Type-routed scan instrument. Shared core (15q) + type-specific module. Two-field boolean design: applies + answer.",
      6   "type": "object",
      7   "required": [
      8     "scan_version",
      9     "paper_type",
     10     "paper",
     11     "checklist",
     12     "type_checklist",
     13     "claims",
     14     "methodology_tags",
     15     "key_findings",
     16     "red_flags",
     17     "cited_papers",
     18     "engagement_factors",
     19     "hn_data"
     20   ],
     21   "properties": {
     22     "scan_version": {
     23       "type": "integer",
     24       "const": 4
     25     },
     26     "paper_type": {
     27       "type": "string",
     28       "enum": [
     29         "empirical",
     30         "benchmark-creation",
     31         "survey",
     32         "position",
     33         "theoretical"
     34       ]
     35     },
     36     "paper": {
     37       "type": "object",
     38       "required": [
     39         "title",
     40         "authors",
     41         "year"
     42       ],
     43       "properties": {
     44         "title": {
     45           "type": "string"
     46         },
     47         "authors": {
     48           "type": "array",
     49           "items": {
     50             "type": "string"
     51           }
     52         },
     53         "year": {
     54           "type": "integer"
     55         },
     56         "venue": {
     57           "type": "string"
     58         },
     59         "arxiv_id": {
     60           "type": "string"
     61         },
     62         "doi": {
     63           "type": "string"
     64         }
     65       }
     66     },
     67     "checklist": {
     68       "type": "object",
     69       "description": "Shared core checklist — applies to ALL paper types.",
     70       "required": [
     71         "claims_and_evidence",
     72         "limitations_and_scope",
     73         "conflicts_of_interest",
     74         "scope_and_framing"
     75       ],
     76       "properties": {
     77         "claims_and_evidence": {
     78           "type": "object",
     79           "description": "Do the claims stay within what the evidence supports?",
     80           "required": [
     81             "abstract_claims_supported",
     82             "causal_claims_justified",
     83             "generalization_bounded",
     84             "alternative_explanations_discussed",
     85             "proxy_outcome_distinction"
     86           ],
     87           "properties": {
     88             "abstract_claims_supported": {
     89               "$ref": "#/$defs/checklist_item",
     90               "description": "Are all claims in the abstract supported by the paper's content? Check each claim against the evidence presented."
     91             },
     92             "causal_claims_justified": {
     93               "$ref": "#/$defs/checklist_item",
     94               "description": "If the paper makes causal claims ('X improves Y', 'X causes Y'), is the study design adequate for causal inference? Ablation studies count as causal claims. NA if no causal claims."
     95             },
     96             "generalization_bounded": {
     97               "$ref": "#/$defs/checklist_item",
     98               "description": "Are generalizations bounded to the tested/argued setting? Broad titles or conclusions beyond the scope of the evidence = NO."
     99             },
    100             "alternative_explanations_discussed": {
    101               "$ref": "#/$defs/checklist_item",
    102               "description": "Are alternative explanations or interpretations discussed? If only one interpretation is presented without considering alternatives = NO. NA for pure theoretical papers with no empirical claims."
    103             },
    104             "proxy_outcome_distinction": {
    105               "$ref": "#/$defs/checklist_item",
    106               "description": "Does the paper distinguish between what was measured and what is claimed? If claims match measurement granularity, YES. If 'lines of code' is called 'productivity' without discussion, NO."
    107             }
    108           }
    109         },
    110         "limitations_and_scope": {
    111           "type": "object",
    112           "description": "Does the paper honestly discuss what it does not show?",
    113           "required": [
    114             "limitations_section_present",
    115             "threats_to_validity_specific",
    116             "scope_boundaries_stated"
    117           ],
    118           "properties": {
    119             "limitations_section_present": {
    120               "$ref": "#/$defs/checklist_item",
    121               "description": "Is there a dedicated limitations or threats-to-validity section? A single sentence in the conclusion does not count."
    122             },
    123             "threats_to_validity_specific": {
    124               "$ref": "#/$defs/checklist_item",
    125               "description": "Are specific threats discussed (not just boilerplate)? 'Our results may not generalize' is NO. 'Our sample of 16 developers is too small for subgroup analysis' is YES."
    126             },
    127             "scope_boundaries_stated": {
    128               "$ref": "#/$defs/checklist_item",
    129               "description": "Are explicit scope boundaries stated — what the results/arguments do NOT show? Generic disclaimers don't count."
    130             }
    131           }
    132         },
    133         "conflicts_of_interest": {
    134           "type": "object",
    135           "description": "Are potential biases from funding, affiliation, or financial interest disclosed?",
    136           "required": [
    137             "funding_disclosed",
    138             "affiliations_disclosed",
    139             "funder_independent_of_outcome",
    140             "financial_interests_declared"
    141           ],
    142           "properties": {
    143             "funding_disclosed": {
    144               "$ref": "#/$defs/checklist_item",
    145               "description": "Is the funding source disclosed? No mention of funding = NO. NA only if clearly unfunded independent work."
    146             },
    147             "affiliations_disclosed": {
    148               "$ref": "#/$defs/checklist_item",
    149               "description": "Are author affiliations with the evaluated product or company disclosed?"
    150             },
    151             "funder_independent_of_outcome": {
    152               "$ref": "#/$defs/checklist_item",
    153               "description": "Is the funder independent of the outcome? Company employees evaluating their own product = NO. NA if unfunded."
    154             },
    155             "financial_interests_declared": {
    156               "$ref": "#/$defs/checklist_item",
    157               "description": "Are financial interests (patents, equity, consulting) declared? No competing interests statement = NO."
    158             }
    159           }
    160         },
    161         "scope_and_framing": {
    162           "type": "object",
    163           "description": "Is the paper's contribution clearly framed and situated?",
    164           "required": [
    165             "key_terms_defined",
    166             "intended_contribution_clear",
    167             "engagement_with_prior_work"
    168           ],
    169           "properties": {
    170             "key_terms_defined": {
    171               "$ref": "#/$defs/checklist_item",
    172               "description": "Are key terms defined precisely? If the paper uses terms like 'agent', 'productivity', 'alignment' without defining what they mean in context, NO."
    173             },
    174             "intended_contribution_clear": {
    175               "$ref": "#/$defs/checklist_item",
    176               "description": "Is the intended contribution explicitly stated? The reader should know what the paper claims to add — a tool, a finding, a framework, a dataset, an argument."
    177             },
    178             "engagement_with_prior_work": {
    179               "$ref": "#/$defs/checklist_item",
    180               "description": "Does the paper engage with prior work in its area? Not just a related work section listing papers, but showing how this work relates to, builds on, or differs from existing contributions."
    181             }
    182           }
    183         }
    184       }
    185     },
    186     "type_checklist": {
    187       "type": "object",
    188       "description": "Type-specific checklist module. Exactly one key matching paper_type.",
    189       "properties": {
    190         "empirical": {
    191           "type": "object",
    192           "properties": {
    193             "artifacts": {
    194               "type": "object",
    195               "description": "Can someone reproduce this work from what was released?",
    196               "required": [
    197                 "code_released",
    198                 "data_released",
    199                 "environment_specified",
    200                 "reproduction_instructions"
    201               ],
    202               "properties": {
    203                 "code_released": {
    204                   "$ref": "#/$defs/checklist_item",
    205                   "description": "Is source code released? A promise of future release or 'available upon request' = NO."
    206                 },
    207                 "data_released": {
    208                   "$ref": "#/$defs/checklist_item",
    209                   "description": "Is the dataset released or publicly available? Standard public benchmarks used unmodified = YES."
    210                 },
    211                 "environment_specified": {
    212                   "$ref": "#/$defs/checklist_item",
    213                   "description": "Are environment/dependency specs provided (requirements.txt, Dockerfile, etc.)? 'Python 3.x' alone is NOT enough."
    214                 },
    215                 "reproduction_instructions": {
    216                   "$ref": "#/$defs/checklist_item",
    217                   "description": "Are step-by-step reproduction instructions included? Must be specific enough to follow without guessing."
    218                 }
    219               }
    220             },
    221             "statistical_methodology": {
    222               "type": "object",
    223               "description": "Are the numbers treated with appropriate rigor?",
    224               "required": [
    225                 "confidence_intervals_or_error_bars",
    226                 "significance_tests",
    227                 "effect_sizes_reported",
    228                 "sample_size_justified",
    229                 "variance_reported"
    230               ],
    231               "properties": {
    232                 "confidence_intervals_or_error_bars": {
    233                   "$ref": "#/$defs/checklist_item",
    234                   "description": "Are CIs or error bars reported for main results?"
    235                 },
    236                 "significance_tests": {
    237                   "$ref": "#/$defs/checklist_item",
    238                   "description": "Are statistical significance tests used where comparative claims are made? NA if no comparative claims."
    239                 },
    240                 "effect_sizes_reported": {
    241                   "$ref": "#/$defs/checklist_item",
    242                   "description": "Are effect sizes reported, not just p-values? Percentage improvement with baseline context counts."
    243                 },
    244                 "sample_size_justified": {
    245                   "$ref": "#/$defs/checklist_item",
    246                   "description": "Is sample size justified or power analysis discussed?"
    247                 },
    248                 "variance_reported": {
    249                   "$ref": "#/$defs/checklist_item",
    250                   "description": "Is variance/std dev reported across runs? Medians without spread = NO."
    251                 }
    252               }
    253             },
    254             "evaluation_design": {
    255               "type": "object",
    256               "description": "Is the evaluation designed to actually test the claims?",
    257               "required": [
    258                 "baselines_included",
    259                 "baselines_contemporary",
    260                 "ablation_study",
    261                 "multiple_metrics",
    262                 "human_evaluation",
    263                 "held_out_test_set",
    264                 "per_category_breakdown",
    265                 "failure_cases_discussed",
    266                 "negative_results_reported"
    267               ],
    268               "properties": {
    269                 "baselines_included": {
    270                   "$ref": "#/$defs/checklist_item",
    271                   "description": "Are baseline comparisons included?"
    272                 },
    273                 "baselines_contemporary": {
    274                   "$ref": "#/$defs/checklist_item",
    275                   "description": "Are baselines contemporary and competitive? Suspiciously old/weak baselines = NO."
    276                 },
    277                 "ablation_study": {
    278                   "$ref": "#/$defs/checklist_item",
    279                   "description": "Is there an ablation study? NA if only one component."
    280                 },
    281                 "multiple_metrics": {
    282                   "$ref": "#/$defs/checklist_item",
    283                   "description": "Are multiple evaluation metrics used?"
    284                 },
    285                 "human_evaluation": {
    286                   "$ref": "#/$defs/checklist_item",
    287                   "description": "Is human evaluation included? Must evaluate system outputs, not just dataset construction. NA if clearly irrelevant."
    288                 },
    289                 "held_out_test_set": {
    290                   "$ref": "#/$defs/checklist_item",
    291                   "description": "Are results reported on a held-out test set? NA if not a prediction task."
    292                 },
    293                 "per_category_breakdown": {
    294                   "$ref": "#/$defs/checklist_item",
    295                   "description": "Are per-category or per-task breakdowns provided?"
    296                 },
    297                 "failure_cases_discussed": {
    298                   "$ref": "#/$defs/checklist_item",
    299                   "description": "Are failure cases shown or discussed?"
    300                 },
    301                 "negative_results_reported": {
    302                   "$ref": "#/$defs/checklist_item",
    303                   "description": "Are negative results reported?"
    304                 }
    305               }
    306             },
    307             "setup_transparency": {
    308               "type": "object",
    309               "description": "Is the setup described well enough to understand what was tested?",
    310               "required": [
    311                 "model_versions_specified",
    312                 "prompts_provided",
    313                 "hyperparameters_reported",
    314                 "scaffolding_described",
    315                 "data_preprocessing_documented"
    316               ],
    317               "properties": {
    318                 "model_versions_specified": {
    319                   "$ref": "#/$defs/checklist_item",
    320                   "description": "Are exact model versions specified? Marketing names without snapshot dates = NO."
    321                 },
    322                 "prompts_provided": {
    323                   "$ref": "#/$defs/checklist_item",
    324                   "description": "Are actual prompts/system instructions provided? Templates with placeholders = NO unless fill values also given."
    325                 },
    326                 "hyperparameters_reported": {
    327                   "$ref": "#/$defs/checklist_item",
    328                   "description": "Are hyperparameters reported (temperature, top-p, etc.)?"
    329                 },
    330                 "scaffolding_described": {
    331                   "$ref": "#/$defs/checklist_item",
    332                   "description": "Is agentic scaffolding described in detail? NA if no scaffolding or evaluating black-box tools."
    333                 },
    334                 "data_preprocessing_documented": {
    335                   "$ref": "#/$defs/checklist_item",
    336                   "description": "Are data preprocessing and filtering steps documented?"
    337                 }
    338               }
    339             },
    340             "data_integrity": {
    341               "type": "object",
    342               "description": "Can the underlying data be verified?",
    343               "required": [
    344                 "raw_data_available",
    345                 "data_collection_described",
    346                 "recruitment_methods_described",
    347                 "data_pipeline_documented"
    348               ],
    349               "properties": {
    350                 "raw_data_available": {
    351                   "$ref": "#/$defs/checklist_item",
    352                   "description": "Is raw data available for independent verification?"
    353                 },
    354                 "data_collection_described": {
    355                   "$ref": "#/$defs/checklist_item",
    356                   "description": "Is the data collection procedure described in detail?"
    357                 },
    358                 "recruitment_methods_described": {
    359                   "$ref": "#/$defs/checklist_item",
    360                   "description": "Are participant/sample recruitment methods described? NA if standard benchmark with no recruitment."
    361                 },
    362                 "data_pipeline_documented": {
    363                   "$ref": "#/$defs/checklist_item",
    364                   "description": "Is the full data pipeline from collection to analysis documented?"
    365                 }
    366               }
    367             },
    368             "contamination": {
    369               "type": "object",
    370               "description": "Could the model have seen the test data during training?",
    371               "required": [
    372                 "training_cutoff_stated",
    373                 "train_test_overlap_discussed",
    374                 "benchmark_contamination_addressed"
    375               ],
    376               "properties": {
    377                 "training_cutoff_stated": {
    378                   "$ref": "#/$defs/checklist_item",
    379                   "description": "Is the model's training data cutoff stated? NA if not evaluating model capabilities on benchmarks."
    380                 },
    381                 "train_test_overlap_discussed": {
    382                   "$ref": "#/$defs/checklist_item",
    383                   "description": "Is potential train/test overlap discussed? NA if not evaluating model capabilities on benchmarks."
    384                 },
    385                 "benchmark_contamination_addressed": {
    386                   "$ref": "#/$defs/checklist_item",
    387                   "description": "Does the paper address whether benchmark examples were publicly available before the model's training cutoff? NA if benchmark created after cutoff or not evaluating model on benchmarks."
    388                 }
    389               }
    390             },
    391             "human_studies": {
    392               "type": "object",
    393               "description": "For papers involving human participants. All NA if no human subjects.",
    394               "required": [
    395                 "pre_registered",
    396                 "irb_or_ethics_approval",
    397                 "demographics_reported",
    398                 "inclusion_exclusion_criteria",
    399                 "randomization_described",
    400                 "blinding_described",
    401                 "attrition_reported"
    402               ],
    403               "properties": {
    404                 "pre_registered": {
    405                   "$ref": "#/$defs/checklist_item",
    406                   "description": "Is the study pre-registered? NA if no human participants."
    407                 },
    408                 "irb_or_ethics_approval": {
    409                   "$ref": "#/$defs/checklist_item",
    410                   "description": "Is IRB/ethics approval mentioned? NA if no human participants."
    411                 },
    412                 "demographics_reported": {
    413                   "$ref": "#/$defs/checklist_item",
    414                   "description": "Are participant demographics reported? NA if no human participants."
    415                 },
    416                 "inclusion_exclusion_criteria": {
    417                   "$ref": "#/$defs/checklist_item",
    418                   "description": "Are inclusion/exclusion criteria stated? NA if no human participants."
    419                 },
    420                 "randomization_described": {
    421                   "$ref": "#/$defs/checklist_item",
    422                   "description": "Is randomization described? NA if not experimental or no human participants."
    423                 },
    424                 "blinding_described": {
    425                   "$ref": "#/$defs/checklist_item",
    426                   "description": "Is blinding described? NA if not feasible or no human participants."
    427                 },
    428                 "attrition_reported": {
    429                   "$ref": "#/$defs/checklist_item",
    430                   "description": "Is attrition/dropout reported? NA if no human participants."
    431                 }
    432               }
    433             },
    434             "cost_and_practicality": {
    435               "type": "object",
    436               "description": "Is the practical cost reported?",
    437               "required": [
    438                 "inference_cost_reported",
    439                 "compute_budget_stated"
    440               ],
    441               "properties": {
    442                 "inference_cost_reported": {
    443                   "$ref": "#/$defs/checklist_item",
    444                   "description": "Is inference cost or latency reported? NA if clearly irrelevant."
    445                 },
    446                 "compute_budget_stated": {
    447                   "$ref": "#/$defs/checklist_item",
    448                   "description": "Is the total computational budget stated?"
    449                 }
    450               }
    451             }
    452           }
    453         },
    454         "benchmark-creation": {
    455           "type": "object",
    456           "properties": {
    457             "construct_design": {
    458               "type": "object",
    459               "description": "Is the benchmark designed to measure what it claims?",
    460               "required": [
    461                 "construct_validity_argued",
    462                 "difficulty_distribution_characterized",
    463                 "ceiling_floor_effects_checked",
    464                 "human_baseline_included",
    465                 "scoring_rubric_justified"
    466               ],
    467               "properties": {
    468                 "construct_validity_argued": {
    469                   "$ref": "#/$defs/checklist_item",
    470                   "description": "Does the paper argue why this benchmark measures the claimed capability? Not just 'we test X' but 'X measures Y because Z'."
    471                 },
    472                 "difficulty_distribution_characterized": {
    473                   "$ref": "#/$defs/checklist_item",
    474                   "description": "Is the difficulty distribution of benchmark items characterized (e.g., easy, medium, hard tiers)? Difficulty that is only assumed rather than measured = NO."
    475                 },
    476                 "ceiling_floor_effects_checked": {
    477                   "$ref": "#/$defs/checklist_item",
    478                   "description": "Are ceiling/floor effects checked? If all models score >90% or <10%, the benchmark isn't discriminating."
    479                 },
    480                 "human_baseline_included": {
    481                   "$ref": "#/$defs/checklist_item",
    482                   "description": "Is there a human baseline? How do humans perform on this benchmark?"
    483                 },
    484                 "scoring_rubric_justified": {
    485                   "$ref": "#/$defs/checklist_item",
    486                   "description": "Is the scoring rubric justified? Why this metric and not another? Are edge cases in scoring addressed?"
    487                 }
    488               }
    489             },
    490             "robustness": {
    491               "type": "object",
    492               "description": "Will this benchmark remain useful over time?",
    493               "required": [
    494                 "contamination_resistance_designed",
    495                 "temporal_robustness_discussed",
    496                 "failure_modes_discussed",
    497                 "baseline_implementations_provided"
    498               ],
    499               "properties": {
    500                 "contamination_resistance_designed": {
    501                   "$ref": "#/$defs/checklist_item",
    502                   "description": "Is contamination resistance designed in? Temporal splits, canary strings, dynamic generation, or other anti-gaming measures?"
    503                 },
    504                 "temporal_robustness_discussed": {
    505                   "$ref": "#/$defs/checklist_item",
    506                   "description": "Is temporal robustness discussed? Will this benchmark be gamed or obsoleted in 6 months? Is there a plan for updates?"
    507                 },
    508                 "failure_modes_discussed": {
    509                   "$ref": "#/$defs/checklist_item",
    510                   "description": "Are failure modes of the benchmark itself discussed? What doesn't it measure? What could game it?"
    511                 },
    512                 "baseline_implementations_provided": {
    513                   "$ref": "#/$defs/checklist_item",
    514                   "description": "Are baseline implementations provided so others can reproduce the reported numbers?"
    515                 }
    516               }
    517             },
    518             "documentation": {
    519               "type": "object",
    520               "description": "Is the benchmark documented for reuse?",
    521               "required": [
    522                 "dataset_documentation_complete",
    523                 "licensing_and_access_clear",
    524                 "intended_use_specified"
    525               ],
    526               "properties": {
    527                 "dataset_documentation_complete": {
    528                   "$ref": "#/$defs/checklist_item",
    529                   "description": "Is dataset documentation complete? Data card, source description, collection methodology, preprocessing steps."
    530                 },
    531                 "licensing_and_access_clear": {
    532                   "$ref": "#/$defs/checklist_item",
    533                   "description": "Is licensing and access clear? Can others actually use this benchmark? Under what terms?"
    534                 },
    535                 "intended_use_specified": {
    536                   "$ref": "#/$defs/checklist_item",
    537                   "description": "Is the intended use specified? What should and should NOT be concluded from benchmark results?"
    538                 }
    539               }
    540             }
    541           }
    542         },
    543         "survey": {
    544           "type": "object",
    545           "properties": {
    546             "search_and_selection": {
    547               "type": "object",
    548               "description": "Is the review process systematic and reproducible?",
    549               "required": [
    550                 "search_strategy_reproducible",
    551                 "inclusion_exclusion_explicit",
    552                 "prisma_or_structured_protocol",
    553                 "search_terms_provided",
    554                 "databases_listed",
    555                 "screening_process_documented",
    556                 "review_scope_justified"
    557               ],
    558               "properties": {
    559                 "search_strategy_reproducible": {
    560                   "$ref": "#/$defs/checklist_item",
    561                   "description": "Is the search strategy reproducible? Could someone re-run the same searches and get the same initial result set?"
    562                 },
    563                 "inclusion_exclusion_explicit": {
    564                   "$ref": "#/$defs/checklist_item",
    565                   "description": "Are inclusion/exclusion criteria explicit and applied consistently?"
    566                 },
    567                 "prisma_or_structured_protocol": {
    568                   "$ref": "#/$defs/checklist_item",
    569                   "description": "Does the survey follow PRISMA or another structured review protocol?"
    570                 },
    571                 "search_terms_provided": {
    572                   "$ref": "#/$defs/checklist_item",
    573                   "description": "Are the actual search terms/queries provided?"
    574                 },
    575                 "databases_listed": {
    576                   "$ref": "#/$defs/checklist_item",
    577                   "description": "Are the databases/sources searched listed explicitly?"
    578                 },
    579                 "screening_process_documented": {
    580                   "$ref": "#/$defs/checklist_item",
    581                   "description": "Is the screening process documented with counts at each stage?"
    582                 },
    583                 "review_scope_justified": {
    584                   "$ref": "#/$defs/checklist_item",
    585                   "description": "Is the review scope justified? Why these years, venues, topics?"
    586                 }
    587               }
    588             },
    589             "synthesis_quality": {
    590               "type": "object",
    591               "description": "Does the synthesis add value beyond listing papers?",
    592               "required": [
    593                 "conflicting_findings_acknowledged",
    594                 "quality_assessment_of_sources",
    595                 "publication_bias_discussed",
    596                 "quantitative_synthesis_present",
    597                 "recommendations_supported_by_evidence"
    598               ],
    599               "properties": {
    600                 "conflicting_findings_acknowledged": {
    601                   "$ref": "#/$defs/checklist_item",
    602                   "description": "Are conflicting findings across reviewed papers acknowledged and discussed?"
    603                 },
    604                 "quality_assessment_of_sources": {
    605                   "$ref": "#/$defs/checklist_item",
    606                   "description": "Does the survey assess the quality of its source papers? A quality rubric, risk-of-bias assessment, or structured evaluation?"
    607                 },
    608                 "publication_bias_discussed": {
    609                   "$ref": "#/$defs/checklist_item",
    610                   "description": "Is publication bias discussed? Does the survey acknowledge that published papers skew positive?"
    611                 },
    612                 "quantitative_synthesis_present": {
    613                   "$ref": "#/$defs/checklist_item",
    614                   "description": "Is there a quantitative synthesis (meta-analysis, vote counting, effect size aggregation) rather than a purely narrative summary? Narrative-only synthesis = NO."
    615                 },
    616                 "recommendations_supported_by_evidence": {
    617                   "$ref": "#/$defs/checklist_item",
    618                   "description": "Are recommendations supported by the reviewed evidence, not just author opinion?"
    619                 }
    620               }
    621             }
    622           }
    623         },
    624         "position": {
    625           "type": "object",
    626           "properties": {
    627             "argument_quality": {
    628               "type": "object",
    629               "description": "Is the argument well-constructed?",
    630               "required": [
    631                 "argument_internally_consistent",
    632                 "counterarguments_addressed",
    633                 "analogies_appropriate",
    634                 "prescriptions_proportional",
    635                 "evidence_for_claims_cited",
    636                 "alternatives_discussed",
    637                 "historical_context_accurate"
    638               ],
    639               "properties": {
    640                 "argument_internally_consistent": {
    641                   "$ref": "#/$defs/checklist_item",
    642                   "description": "Is the argument internally consistent? Do the conclusions follow from the premises? Are there contradictions?"
    643                 },
    644                 "counterarguments_addressed": {
    645                   "$ref": "#/$defs/checklist_item",
    646                   "description": "Are counterarguments addressed — the strongest version, not a strawman? Does the paper engage with the best opposing view?"
    647                 },
    648                 "analogies_appropriate": {
    649                   "$ref": "#/$defs/checklist_item",
    650                   "description": "Are analogies appropriate? If the paper draws parallels (e.g., 'AI is like electricity'), are the parallels valid or are they false equivalences?"
    651                 },
    652                 "prescriptions_proportional": {
    653                   "$ref": "#/$defs/checklist_item",
    654                   "description": "Are prescriptive claims proportional to the argument? Sweeping policy recommendations require stronger support than narrow suggestions."
    655                 },
    656                 "evidence_for_claims_cited": {
    657                   "$ref": "#/$defs/checklist_item",
    658                   "description": "Is evidence cited for factual claims? Assertions presented as fact should reference sources."
    659                 },
    660                 "alternatives_discussed": {
    661                   "$ref": "#/$defs/checklist_item",
    662                   "description": "Are alternatives to the proposed viewpoint/framework discussed? Not just 'our view is right' but 'here are other views and why we prefer ours'."
    663                 },
    664                 "historical_context_accurate": {
    665                   "$ref": "#/$defs/checklist_item",
    666                   "description": "Is historical context accurate? If the paper references historical events, technologies, or intellectual traditions, are those references correct?"
    667                 }
    668               }
    669             },
    670             "clarity_and_scope": {
    671               "type": "object",
    672               "description": "Are the paper's scope and audience clear?",
    673               "required": [
    674                 "key_terms_defined_precisely",
    675                 "engages_with_existing_literature",
    676                 "intended_audience_clear",
    677                 "assumptions_stated",
    678                 "scope_of_applicability_discussed"
    679               ],
    680               "properties": {
    681                 "key_terms_defined_precisely": {
    682                   "$ref": "#/$defs/checklist_item",
    683                   "description": "Are key terms defined precisely in context? Not just used but defined — what does 'agent' or 'alignment' mean in this paper specifically?"
    684                 },
    685                 "engages_with_existing_literature": {
    686                   "$ref": "#/$defs/checklist_item",
    687                   "description": "Does it engage substantively with existing literature on this position? Not just citing but discussing, comparing, building on."
    688                 },
    689                 "intended_audience_clear": {
    690                   "$ref": "#/$defs/checklist_item",
    691                   "description": "Is the intended audience clear? Is this for policymakers, researchers, practitioners, the public?"
    692                 },
    693                 "assumptions_stated": {
    694                   "$ref": "#/$defs/checklist_item",
    695                   "description": "Are the paper's assumptions stated explicitly? What must the reader accept for the argument to work?"
    696                 },
    697                 "scope_of_applicability_discussed": {
    698                   "$ref": "#/$defs/checklist_item",
    699                   "description": "Is the scope of applicability discussed? Where does this argument apply and where doesn't it?"
    700                 }
    701               }
    702             }
    703           }
    704         },
    705         "theoretical": {
    706           "type": "object",
    707           "properties": {
    708             "formal_quality": {
    709               "type": "object",
    710               "description": "Is the formal work rigorous?",
    711               "required": [
    712                 "assumptions_stated_explicitly",
    713                 "proofs_complete_or_sketched",
    714                 "bounds_tight_or_discussed",
    715                 "counterexamples_explored",
    716                 "notation_consistent",
    717                 "constructive_vs_existence_noted"
    718               ],
    719               "properties": {
    720                 "assumptions_stated_explicitly": {
    721                   "$ref": "#/$defs/checklist_item",
    722                   "description": "Are all assumptions stated explicitly? Hidden assumptions in proofs or models = NO."
    723                 },
    724                 "proofs_complete_or_sketched": {
    725                   "$ref": "#/$defs/checklist_item",
    726                   "description": "Are proofs complete, or clearly sketched with references to full versions? Proof 'left to reader' without sketch = NO."
    727                 },
    728                 "bounds_tight_or_discussed": {
    729                   "$ref": "#/$defs/checklist_item",
    730                   "description": "Are bounds tight, or is tightness discussed? If a bound is loose, is that acknowledged?"
    731                 },
    732                 "counterexamples_explored": {
    733                   "$ref": "#/$defs/checklist_item",
    734                   "description": "Are counterexamples or edge cases explored? Does the paper test the limits of its own results?"
    735                 },
    736                 "notation_consistent": {
    737                   "$ref": "#/$defs/checklist_item",
    738                   "description": "Is notation consistent throughout? Overloaded symbols or inconsistent conventions = NO."
    739                 },
    740                 "constructive_vs_existence_noted": {
    741                   "$ref": "#/$defs/checklist_item",
    742                   "description": "Is it noted whether results are constructive or existence-only? Can you compute the thing proved to exist?"
    743                 }
    744               }
    745             },
    746             "connections": {
    747               "type": "object",
    748               "description": "Is the theoretical work connected to practice and prior work?",
    749               "required": [
    750                 "connection_to_practice_discussed",
    751                 "relationship_to_prior_work_clear",
    752                 "computational_complexity_discussed",
    753                 "limitations_of_formal_model_stated"
    754               ],
    755               "properties": {
    756                 "connection_to_practice_discussed": {
    757                   "$ref": "#/$defs/checklist_item",
    758                   "description": "Is the connection to practice discussed? What does this theorem mean for practitioners?"
    759                 },
    760                 "relationship_to_prior_work_clear": {
    761                   "$ref": "#/$defs/checklist_item",
    762                   "description": "Is the relationship to prior theoretical work clear? What does this extend, generalize, or contradict?"
    763                 },
    764                 "computational_complexity_discussed": {
    765                   "$ref": "#/$defs/checklist_item",
    766                   "description": "Is computational complexity discussed where relevant? Is the algorithm tractable?"
    767                 },
    768                 "limitations_of_formal_model_stated": {
    769                   "$ref": "#/$defs/checklist_item",
    770                   "description": "Are limitations of the formal model stated? What does the model NOT capture about reality?"
    771                 }
    772               }
    773             }
    774           }
    775         }
    776       }
    777     },
    778     "claims": {
    779       "type": "array",
    780       "items": {
    781         "type": "object",
    782         "required": [
    783           "claim",
    784           "evidence",
    785           "supported"
    786         ],
    787         "properties": {
    788           "claim": {
    789             "type": "string"
    790           },
    791           "evidence": {
    792             "type": "string"
    793           },
    794           "supported": {
    795             "type": "string",
    796             "enum": [
    797               "strong",
    798               "moderate",
    799               "weak",
    800               "unsupported"
    801             ]
    802           }
    803         }
    804       }
    805     },
    806     "methodology_tags": {
    807       "type": "array",
    808       "items": {
    809         "type": "string",
    810         "enum": [
    811           "rct",
    812           "observational",
    813           "benchmark-eval",
    814           "case-study",
    815           "meta-analysis",
    816           "theoretical",
    817           "qualitative"
    818         ]
    819       }
    820     },
    821     "key_findings": {
    822       "type": "string"
    823     },
    824     "red_flags": {
    825       "type": "array",
    826       "items": {
    827         "type": "object",
    828         "required": [
    829           "flag",
    830           "detail"
    831         ],
    832         "properties": {
    833           "flag": {
    834             "type": "string"
    835           },
    836           "detail": {
    837             "type": "string"
    838           }
    839         }
    840       }
    841     },
    842     "cited_papers": {
    843       "type": "array",
    844       "items": {
    845         "type": "object",
    846         "required": [
    847           "title",
    848           "relevance"
    849         ],
    850         "properties": {
    851           "title": {
    852             "type": "string"
    853           },
    854           "authors": {
    855             "type": "array",
    856             "items": {
    857               "type": "string"
    858             }
    859           },
    860           "year": {
    861             "type": "integer"
    862           },
    863           "arxiv_id": {
    864             "type": "string"
    865           },
    866           "doi": {
    867             "type": "string"
    868           },
    869           "relevance": {
    870             "type": "string"
    871           }
    872         }
    873       }
    874     },
    875     "engagement_factors": {
    876       "type": "object",
    877       "required": [
    878         "practical_relevance",
    879         "surprise_contrarian",
    880         "fear_safety",
    881         "drama_conflict",
    882         "demo_ability",
    883         "brand_recognition"
    884       ],
    885       "properties": {
    886         "practical_relevance": {
    887           "type": "object",
    888           "required": [
    889             "score",
    890             "justification"
    891           ],
    892           "properties": {
    893             "score": {
    894               "type": "integer",
    895               "minimum": 0,
    896               "maximum": 3
    897             },
    898             "justification": {
    899               "type": "string"
    900             }
    901           }
    902         },
    903         "surprise_contrarian": {
    904           "type": "object",
    905           "required": [
    906             "score",
    907             "justification"
    908           ],
    909           "properties": {
    910             "score": {
    911               "type": "integer",
    912               "minimum": 0,
    913               "maximum": 3
    914             },
    915             "justification": {
    916               "type": "string"
    917             }
    918           }
    919         },
    920         "fear_safety": {
    921           "type": "object",
    922           "required": [
    923             "score",
    924             "justification"
    925           ],
    926           "properties": {
    927             "score": {
    928               "type": "integer",
    929               "minimum": 0,
    930               "maximum": 3
    931             },
    932             "justification": {
    933               "type": "string"
    934             }
    935           }
    936         },
    937         "drama_conflict": {
    938           "type": "object",
    939           "required": [
    940             "score",
    941             "justification"
    942           ],
    943           "properties": {
    944             "score": {
    945               "type": "integer",
    946               "minimum": 0,
    947               "maximum": 3
    948             },
    949             "justification": {
    950               "type": "string"
    951             }
    952           }
    953         },
    954         "demo_ability": {
    955           "type": "object",
    956           "required": [
    957             "score",
    958             "justification"
    959           ],
    960           "properties": {
    961             "score": {
    962               "type": "integer",
    963               "minimum": 0,
    964               "maximum": 3
    965             },
    966             "justification": {
    967               "type": "string"
    968             }
    969           }
    970         },
    971         "brand_recognition": {
    972           "type": "object",
    973           "required": [
    974             "score",
    975             "justification"
    976           ],
    977           "properties": {
    978             "score": {
    979               "type": "integer",
    980               "minimum": 0,
    981               "maximum": 3
    982             },
    983             "justification": {
    984               "type": "string"
    985             }
    986           }
    987         }
    988       }
    989     },
    990     "hn_data": {
    991       "type": "object",
    992       "properties": {
    993         "threads": {
    994           "type": "array"
    995         },
    996         "top_points": {
    997           "type": "integer"
    998         },
    999         "total_points": {
   1000           "type": "integer"
   1001         },
   1002         "total_comments": {
   1003           "type": "integer"
   1004         }
   1005       }
   1006     }
   1007   },
   1008   "$defs": {
   1009     "checklist_item": {
   1010       "type": "object",
   1011       "required": [
   1012         "applies",
   1013         "answer",
   1014         "justification"
   1015       ],
   1016       "properties": {
   1017         "applies": {
   1018           "type": "boolean",
   1019           "description": "Does this criterion apply to this paper type?"
   1020         },
   1021         "answer": {
   1022           "type": "boolean",
   1023           "description": "Does the paper satisfy this criterion? Must be false when applies is false."
   1024         },
   1025         "justification": {
   1026           "type": "string",
   1027           "description": "1-3 sentences explaining the answer."
   1028         },
   1029         "source": {
   1030           "type": "string",
   1031           "enum": [
   1032             "opus",
   1033             "haiku"
   1034           ],
   1035           "description": "Which model produced this answer."
   1036         }
   1037       }
   1038     }
   1039   }
   1040 }

Impressum · Datenschutz