commit c9f58bde8535e444a35685593af2b9b4b2f9d55b
parent 736a50a032a47708cf7293b93076df2b494eb27b
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Tue, 31 Mar 2026 08:40:03 +0200
V4 Haiku scan pipeline: type-routed instrument with Opus overlay
New schema (scan-v4.schema.json): shared core (15q) + 5 type-specific
modules (empirical 39q, benchmark 12q, survey 12q, position 12q,
theoretical 10q). Two-field boolean design preserved.
New script (run-scan-v4-haiku.py):
- Haiku for papers <50K chars, auto-fallback to Sonnet for larger papers
- Reads paper_type.json for routing
- Merges existing Opus v2/v3 answers (Opus always overrides)
- Tracks source per question (opus/haiku/sonnet)
- Free calibration: reports Haiku-Opus agreement rate
- Fetches HN data inline
- Writes scan-v4.json (separate from scan.json)
Tested:
- Tao (position): 12 opus + 15 haiku. 75% agreement. Position module
shows 7/7 argument quality, 1/5 clarity. Much better than v2's 10%.
- METR (empirical): 51 opus + 3 sonnet. 86.3% agreement.
Run: python3 scripts/run-scan-v4-haiku.py --parallel 8
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
5 files changed, 2895 insertions(+), 0 deletions(-)
diff --git a/papers/mathematical-methods-human-2026/scan-v4.json b/papers/mathematical-methods-human-2026/scan-v4.json
@@ -0,0 +1,337 @@
+{
+ "scan_version": 4,
+ "paper_type": "position",
+ "paper": {
+ "title": "Mathematical Methods and Human Thought in the Age of AI",
+ "authors": [
+ "Tanya Klowden",
+ "Terence Tao"
+ ],
+ "year": 2026,
+ "venue": "arXiv (math.HO)",
+ "arxiv_id": "2603.26524",
+ "doi": ""
+ },
+ "checklist": {
+ "claims_and_evidence": {
+ "abstract_claims_supported": {
+ "applies": true,
+ "answer": true,
+ "justification": "The abstract claims that AI is a natural evolution of human tools, that development should be human-centered, and proposes a pathway for integration. Sections 2 (historical parallels), 5 (costs and benefits), and 6 (human/AI interface) develop each of these arguments at length. The abstract's claims are philosophical positions that are elaborated and argued throughout the paper.",
+ "source": "opus"
+ },
+ "causal_claims_justified": {
+ "applies": true,
+ "answer": false,
+ "justification": "The paper makes numerous causal assertions: 'strict regulation imposed at this point would disproportionately shut down the more positive use cases of AI' (Section 1.3), 'AI technologies... have dramatically shifted social, intellectual, and economic spheres' (Section 7), and AI could 'crowd out the more traditional paradigms' (Section 4.6). These are stated as arguments from analogy and assertion, without empirical evidence or causal identification strategies.",
+ "source": "opus"
+ },
+ "generalization_bounded": {
+ "applies": true,
+ "answer": false,
+ "justification": "While the paper acknowledges mathematics as a 'sandbox' (Section 3), it regularly generalizes to 'all humankind' and 'society as a whole' (abstract, Section 5.1, conclusion). The title itself — 'human thought in the age of AI' — claims scope far beyond the mathematical case study. The paper does not bound its philosophical conclusions to the mathematical domain from which most of its arguments are drawn.",
+ "source": "opus"
+ },
+ "alternative_explanations_discussed": {
+ "applies": false,
+ "answer": false,
+ "justification": "The paper presents no empirical results. It is a philosophical position paper, so alternative explanations for observed data are not applicable.",
+ "source": "opus"
+ },
+ "proxy_outcome_distinction": {
+ "applies": false,
+ "answer": false,
+ "justification": "No measurements or proxies are used. This is a philosophical position paper, not an empirical study.",
+ "source": "opus"
+ }
+ },
+ "limitations_and_scope": {
+ "limitations_section_present": {
+ "applies": true,
+ "answer": false,
+ "justification": "There is no dedicated limitations section. The paper contains scattered hedges such as 'we of course do not pretend to have definitive resolutions to any of them; and the speed of change in this space is such that any proclamations we make are at risk of being overtaken by striking new technological advances' (Section 3), but these are not collected in a substantive limitations discussion.",
+ "source": "opus"
+ },
+ "threats_to_validity_specific": {
+ "applies": true,
+ "answer": false,
+ "justification": "No specific threats to the validity of the paper's own arguments are discussed. The paper acknowledges that AI is changing rapidly but does not identify specific ways in which its philosophical positions or analogies might be wrong or misleading.",
+ "source": "opus"
+ },
+ "scope_boundaries_stated": {
+ "applies": true,
+ "answer": false,
+ "justification": "The paper does not explicitly state what it does NOT claim. While it notes that mathematics is used as a 'sandbox' (Section 3) and hedges with 'we of course do not pretend to have definitive resolutions,' it does not list specific exclusions, untested scenarios, or things the reader should not conclude from the paper.",
+ "source": "opus"
+ }
+ },
+ "conflicts_of_interest": {
+ "funding_disclosed": {
+ "applies": true,
+ "answer": false,
+ "justification": "The Acknowledgments section (Section 7.1) thanks Silvia de Toffoli for comments but does not mention any funding sources. No funding disclosure is present.",
+ "source": "opus"
+ },
+ "affiliations_disclosed": {
+ "applies": true,
+ "answer": false,
+ "justification": "The paper lists author names but no institutional affiliations are visible in the text. The authors' academic positions and departments are not stated.",
+ "source": "opus"
+ },
+ "funder_independent_of_outcome": {
+ "applies": true,
+ "answer": false,
+ "justification": "No funding is disclosed, so independence cannot be assessed. The paper discusses AI companies and their practices but does not clarify whether the authors have any financial relationships with such entities.",
+ "source": "opus"
+ },
+ "financial_interests_declared": {
+ "applies": true,
+ "answer": false,
+ "justification": "No competing interests statement or financial disclosure is present in the paper.",
+ "source": "opus"
+ }
+ },
+ "scope_and_framing": {
+ "key_terms_defined": {
+ "applies": true,
+ "answer": true,
+ "justification": "Section 1.1 explicitly defines 'AI' as 'the broad spectrum of computer tools designed to perform increasingly complex cognitive tasks', distinguishing ML/LLMs, diffusion models, and GOFAI (automated theorem provers, chess engines).",
+ "source": "haiku"
+ },
+ "intended_contribution_clear": {
+ "applies": true,
+ "answer": true,
+ "justification": "Section 1.2 explicitly states the paper will use mathematics as a model to consider benefits, risks, ethics, and outcomes of AI, and 'propose a pathway' to integrating AI in human-centered ways.",
+ "source": "haiku"
+ },
+ "engagement_with_prior_work": {
+ "applies": true,
+ "answer": true,
+ "justification": "The paper substantively engages with Searle's Chinese Room, Thurston on proof and progress, the Jaffe-Quinn 'theoretical mathematics' debate, formal verification literature (Lean, Rocq), and multiple AMS special issues on AI and mathematics.",
+ "source": "haiku"
+ }
+ }
+ },
+ "type_checklist": {
+ "position": {
+ "argument_quality": {
+ "argument_internally_consistent": {
+ "applies": true,
+ "answer": true,
+ "justification": "The paper's core thesis — that AI should be human-centered, that mathematics is a good sandbox, and that a Copernican coexistence model is preferable to the three extremes — is internally consistent throughout; the vanilla extract and red-team/blue-team framings reinforce rather than contradict the overall argument.",
+ "source": "haiku"
+ },
+ "counterarguments_addressed": {
+ "applies": true,
+ "answer": true,
+ "justification": "Section 6.3 presents three explicit opposing positions (formalist/technicist retreat, human-chauvinist exceptionalism, full AI supersession) in reasonable detail before arguing for the Copernican middle ground, engaging the best version of each rather than strawmen.",
+ "source": "haiku"
+ },
+ "analogies_appropriate": {
+ "applies": true,
+ "answer": true,
+ "justification": "The Copernican revolution analogy (Section 6.4) is apt and carefully qualified; the chess analogy is concrete and accurate; the vanilla extract analogy is idiosyncratic but clearly bounded ('some upper limit') and serves the limited-use argument well.",
+ "source": "haiku"
+ },
+ "prescriptions_proportional": {
+ "applies": true,
+ "answer": true,
+ "justification": "Prescriptions remain at a general normative level ('AI should benefit humanity', 'equitable access is paramount') rather than making sweeping specific policy demands; the authors explicitly hedge ('we are cautiously optimistic', 'we of course do not pretend to have definitive resolutions').",
+ "source": "haiku"
+ },
+ "evidence_for_claims_cited": {
+ "applies": true,
+ "answer": true,
+ "justification": "Factual claims are consistently sourced: AlphaFold Nobel [1], Turing test results [9], AI model collapse [43], autoformalization [20], four color theorem [22], AlphaProof IMO results [34]; only anecdotal claims (the 'three AI insertions' footnote) are uncited.",
+ "source": "haiku"
+ },
+ "alternatives_discussed": {
+ "applies": true,
+ "answer": true,
+ "justification": "The paper discusses three alternative philosophical frameworks in Section 6.3 and explicitly engages the Jaffe-Quinn proposal for 'theoretical mathematics' and its negative reception as a historical alternative (Section 4.6 footnote 14).",
+ "source": "haiku"
+ },
+ "historical_context_accurate": {
+ "applies": true,
+ "answer": true,
+ "justification": "Historical references are accurate: the Luddite account correctly situates them as skilled textile workers opposing automation in a harsh economic climate; the Copernican, Darwinian, and non-Euclidean geometry revolutions are correctly characterized; Bourbaki's role is accurately described.",
+ "source": "haiku"
+ }
+ },
+ "clarity_and_scope": {
+ "key_terms_defined_precisely": {
+ "applies": true,
+ "answer": false,
+ "justification": "'AI' is defined in Section 1.1, but 'human-centered' (the paper's central normative concept), 'intelligence', 'understanding', and 'creativity' — which bear the weight of the philosophical argument — are discussed but never precisely defined in context.",
+ "source": "haiku"
+ },
+ "engages_with_existing_literature": {
+ "applies": true,
+ "answer": true,
+ "justification": "The paper goes beyond citation-listing to substantively compare with Thurston's view of proof, engage the Jaffe-Quinn 'theoretical mathematics' debate, and build on the AMS special issues on AI and mathematics; it discusses how this work relates to and extends prior philosophical positions.",
+ "source": "haiku"
+ },
+ "intended_audience_clear": {
+ "applies": true,
+ "answer": false,
+ "justification": "The intended audience is never explicitly stated; the paper oscillates between technical mathematics (discussing Lean, Mathlib, reverse mathematics) and general-public policy arguments, making it unclear whether it targets mathematicians, policymakers, or general readers.",
+ "source": "haiku"
+ },
+ "assumptions_stated": {
+ "applies": true,
+ "answer": false,
+ "justification": "Key assumptions — that human-centeredness is the correct normative frame, that mathematics is representative of other domains, that the 'Faustian bargain' framing is apt — are asserted rather than identified as assumptions the reader must accept for the argument to work.",
+ "source": "haiku"
+ },
+ "scope_of_applicability_discussed": {
+ "applies": true,
+ "answer": false,
+ "justification": "Although Section 3 frames mathematics as a 'sandbox', the paper does not explicitly discuss where the mathematical case study fails to generalize — e.g., formal verification has no analogue in most humanities or social science domains.",
+ "source": "haiku"
+ }
+ }
+ }
+ },
+ "claims": [
+ {
+ "claim": "AI is a natural evolution of human cognitive tools throughout history, continuous with the printing press and LaTeX.",
+ "evidence": "Historical analogy in Section 2; authors distinguish AI from prior tools primarily by degree (automation of creative process itself) not kind.",
+ "supported": "moderate"
+ },
+ {
+ "claim": "Current AI exhibits inconsistent performance — superhuman in some mathematical tasks, elementary errors in others (e.g., asserting all odd numbers are prime).",
+ "evidence": "Section 3 cites AI producing superficially flawless but fundamentally flawed proofs, contrasted with correct solutions to complex problems; AlphaProof IMO results cited [34].",
+ "supported": "strong"
+ },
+ {
+ "claim": "Formal verification of AI-generated proofs is necessary but insufficient — errors in translation between formal and intended statements remain possible.",
+ "evidence": "Section 4.4 provides a concrete Fermat's Last Theorem example where an AI could produce a formally correct but semantically wrong proof by misinterpreting the domain of natural numbers.",
+ "supported": "strong"
+ },
+ {
+ "claim": "AI collapse becomes a serious risk if models are trained recursively on AI-generated content without sufficient genuine human-generated data.",
+ "evidence": "Cited to Shumailov et al. [43] in Nature; authors also provide a concrete anecdote about citogenesis on Erdős Problems site via deep research tools.",
+ "supported": "moderate"
+ },
+ {
+ "claim": "The 'Faustian bargain' has already been adopted de facto — AI has been deployed before adequate philosophical and ethical evaluation occurred.",
+ "evidence": "Section 1.3 argues market competition created a prisoner's dilemma preventing deliberate evaluation; contrasted with slower technologies like stem cell research that allowed philosophical debate first.",
+ "supported": "moderate"
+ },
+ {
+ "claim": "A 'digital divide' in AI access will exacerbate existing research inequalities, with frontier models accessible only to well-financed groups.",
+ "evidence": "Section 5.3 argues capitalized models compete for finite resources; smaller community models proposed as partial remedy; PCAST report [44] and Jones [45] cited.",
+ "supported": "moderate"
+ },
+ {
+ "claim": "A 'Copernican' philosophical framework — treating human and artificial intelligence as ontologically equivalent but complementary — is preferable to formalist retreat, human-chauvinist exceptionalism, or AI supersession.",
+ "evidence": "Section 6.4 develops the analogy but the preference is argued normatively rather than demonstrated empirically.",
+ "supported": "weak"
+ }
+ ],
+ "methodology_tags": [
+ "theoretical",
+ "qualitative"
+ ],
+ "key_findings": "This position paper by a mathematician (Tao) and art scholar (Klowden) argues that AI represents a natural but qualitatively distinct evolution of cognitive tools, requiring human-centered governance rather than either wholesale rejection or uncritical adoption. Using mathematics as a case study, the authors examine how AI changes standards of proof, intellectual property attribution, and the nature of mathematical understanding, noting that formal verification is necessary but insufficient. They propose a 'Copernican' philosophical framework — treating human and artificial intelligence as ontologically equivalent but complementary — as preferable to formalist, exceptionalist, or supersessionist extremes, while emphasizing equitable access and harm reduction as urgent near-term imperatives.",
+ "red_flags": [
+ {
+ "flag": "No funding disclosure",
+ "detail": "No funding source is disclosed anywhere in the paper, including the acknowledgments section."
+ },
+ {
+ "flag": "No affiliations stated",
+ "detail": "Author institutional affiliations (e.g., UCLA for Tao) are not stated in the paper text, making conflict-of-interest assessment impossible from the document itself."
+ },
+ {
+ "flag": "Scope overreach",
+ "detail": "The paper uses mathematics as a 'sandbox' in Section 3 but draws unrestricted prescriptive conclusions for 'sciences and society in general' without arguing why the mathematical case generalizes."
+ },
+ {
+ "flag": "Key normative terms undefined",
+ "detail": "'Human-centered' — the paper's central prescriptive concept — is never defined precisely, weakening the actionability of the policy recommendations."
+ },
+ {
+ "flag": "No limitations section",
+ "detail": "There is no dedicated limitations section; apart from scattered hedges, the paper does not acknowledge specific weaknesses of its argument, e.g., that mathematics is atypical due to formal verification, or that the authors' expertise is asymmetric (one mathematician, one art scholar)."
+ }
+ ],
+ "cited_papers": [
+ {
+ "title": "AI models collapse when trained on recursively generated data",
+ "relevance": "Empirical foundation for AI collapse risk discussed in Section 5.2; directly relevant to contamination concerns in AI-assisted research."
+ },
+ {
+ "title": "Autoformalization with Large Language Models",
+ "relevance": "Core technical reference for the autoformalization capabilities discussed in Section 4.3; Wu et al. 2022 NeurIPS."
+ },
+ {
+ "title": "A Turing test of whether AI chatbots are behaviorally similar to humans",
+ "relevance": "Cited as evidence that modern LLMs have effectively passed the Turing test, a key empirical claim in Section 2.2."
+ },
+ {
+ "title": "On Proof and Progress in Mathematics (Thurston)",
+ "relevance": "Central philosophical reference for the 'smell' of mathematical proofs and the role of understanding vs. formal correctness; foundational to the paper's argument."
+ },
+ {
+ "title": "AlephZero and mathematical experience (DeDeo)",
+ "relevance": "Recent philosophical treatment of AI in mathematics, directly relevant to Section 4 discussion of AI-generated proofs and mathematical insight."
+ },
+ {
+ "title": "The technological turn in mathematics (de Toffoli & Tanswell)",
+ "relevance": "2025 Blackwell Companion chapter on formal proof assistants and their philosophical implications; directly supports Section 4.3."
+ },
+ {
+ "title": "Early science acceleration experiments with GPT-5 (Bubeck et al.)",
+ "relevance": "Cited for AI deep research tools discovering solutions to open problems; illustrates citogenesis risk discussed in Section 4.8."
+ },
+ {
+ "title": "Minds, brains, and programs (Searle)",
+ "relevance": "Chinese Room thought experiment foundational to the paper's discussion of AI understanding vs. symbol manipulation in Section 2.2."
+ },
+ {
+ "title": "Is mathematics obsolete? (Avigad, 2025)",
+ "relevance": "Contemporary philosophical treatment directly addressing the paper's central question about AI and the future of mathematical practice."
+ }
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 0,
+ "justification": "Pure philosophical reflection with no tools, techniques, or methods that a practitioner could apply."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The Copernican framing of AI intelligence is a novel metaphor, but the overall positions (AI should be human-centered, has costs and benefits) are mainstream."
+ },
+ "fear_safety": {
+ "score": 1,
+ "justification": "Mentions existential risks, model collapse, digital divide, and environmental costs, but does not present novel threats or demonstrate specific attacks."
+ },
+ "drama_conflict": {
+ "score": 1,
+ "justification": "References the Faustian bargain and Luddite parallels, and critiques unchecked AI development, but avoids direct confrontation with specific companies or claims."
+ },
+ "demo_ability": {
+ "score": 0,
+ "justification": "No code, tool, or demo. The paper is entirely discursive."
+ },
+ "brand_recognition": {
+ "score": 3,
+ "justification": "Terence Tao is a Fields Medalist and one of the most recognized mathematicians alive; his name alone drives significant attention to any paper he co-authors."
+ }
+ },
+ "hn_data": {
+ "threads": [
+ {
+ "hn_id": "47572771",
+ "title": "Mathematical methods and human thought in the age of AI",
+ "points": 162,
+ "comments": 62,
+ "url": "https://news.ycombinator.com/item?id=47572771"
+ }
+ ],
+ "top_points": 162,
+ "total_points": 162,
+ "total_comments": 62
+ }
+}
+\ No newline at end of file
diff --git a/papers/metr-rct-2025/scan-v4.json b/papers/metr-rct-2025/scan-v4.json
@@ -0,0 +1,589 @@
+{
+ "scan_version": 4,
+ "paper_type": "empirical",
+ "paper": {
+ "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
+ "authors": [
+ "Joel Becker",
+ "Nate Rush",
+ "Beth Barnes",
+ "David Rein"
+ ],
+ "year": 2025,
+ "venue": "arXiv",
+ "arxiv_id": "2507.09089",
+ "doi": null
+ },
+ "checklist": {
+ "claims_and_evidence": {
+ "abstract_claims_supported": {
+ "applies": true,
+ "answer": true,
+ "justification": "The abstract claims (19% slowdown, developer forecast of 24% speedup, post-hoc estimate of 20% speedup, expert predictions of 38-39% speedup) are all supported by the results in the paper (Figure 1, Section 3.1, Table 6).",
+ "source": "opus"
+ },
+ "causal_claims_justified": {
+ "applies": true,
+ "answer": true,
+ "justification": "The paper makes causal claims ('AI tooling slowed developers down') and uses an RCT design with randomized treatment assignment, which is the gold standard for causal inference. Balance checks confirm successful randomization (Table 4). Issues are defined before randomization to prevent confounding.",
+ "source": "opus"
+ },
+ "generalization_bounded": {
+ "applies": true,
+ "answer": true,
+ "justification": "The paper explicitly bounds generalization in Section 4.1 ('Key Caveats'), Table 2 ('What the evidence does not show'), and throughout the discussion. It states results do not imply AI is unhelpful in other settings, with future models, or with better elicitation strategies.",
+ "source": "opus"
+ },
+ "alternative_explanations_discussed": {
+ "applies": true,
+ "answer": true,
+ "justification": "The paper systematically investigates 21 alternative explanations for the slowdown in Section 3.3 and Appendix C, categorized into direct productivity loss, experimental artifacts, factors raising developer performance, and factors limiting AI performance.",
+ "source": "opus"
+ },
+ "proxy_outcome_distinction": {
+ "applies": true,
+ "answer": true,
+ "justification": "The paper explicitly discusses the proxy-outcome distinction. Section 1 critiques prior work for using non-fixed outcome measures (lines of code, PRs) that may not correspond to productivity. It uses task completion time as its measure, notes this is 'a fixed outcome measure,' and discusses scope creep (Section C.2.3) as a potential gap between time and productivity.",
+ "source": "opus"
+ }
+ },
+ "limitations_and_scope": {
+ "limitations_section_present": {
+ "applies": true,
+ "answer": true,
+ "justification": "Section 4.1 'Key Caveats' serves as a dedicated limitations section, discussing setting-specific factors, AI-specific factors, and agent capabilities. Table 2 explicitly lists what the evidence does not show.",
+ "source": "opus"
+ },
+ "threats_to_validity_specific": {
+ "applies": true,
+ "answer": true,
+ "justification": "The paper discusses 21 specific threats grouped into four categories (Table 1, Appendix C). These are highly specific: the small sample is acknowledged ('Our sample of 16 developers'), along with experimentally driven overuse, sampling bias in recruitment, an unfamiliar development environment, and more.",
+ "source": "opus"
+ },
+ "scope_boundaries_stated": {
+ "applies": true,
+ "answer": true,
+ "justification": "Table 2 is a gold-standard scope boundary statement, listing six specific things the evidence does NOT show (e.g., 'AI systems do not currently speed up many or most software developers'). Section 4.1 adds specific caveats about setting-specific factors and AI-specific factors.",
+ "source": "opus"
+ }
+ },
+ "conflicts_of_interest": {
+ "funding_disclosed": {
+ "applies": true,
+ "answer": false,
+ "justification": "No funding source is disclosed. METR (Model Evaluation & Threat Research) is the authors' organization, but no grants, sponsors, or funding agencies are mentioned.",
+ "source": "opus"
+ },
+ "affiliations_disclosed": {
+ "applies": true,
+ "answer": true,
+ "justification": "Authors are identified as being from METR (Model Evaluation & Threat Research). METR is not a developer of the AI tools being evaluated (Cursor, Claude, GPT-4o).",
+ "source": "opus"
+ },
+ "funder_independent_of_outcome": {
+ "applies": true,
+ "answer": false,
+ "justification": "Funding source is not disclosed, so independence cannot be assessed. METR's organizational mission involves AI safety and capability evaluation, which could create incentives in either direction regarding AI productivity results.",
+ "source": "opus"
+ },
+ "financial_interests_declared": {
+ "applies": true,
+ "answer": false,
+ "justification": "No competing interests statement or financial interest disclosure is present in the paper.",
+ "source": "opus"
+ }
+ },
+ "scope_and_framing": {
+ "key_terms_defined": {
+ "applies": true,
+ "answer": true,
+ "justification": "Productivity is operationalized as task completion time with the speedup formula S defined mathematically in Section 2.3; 'AI-allowed/disallowed' conditions are precisely specified; 'speedup' and 'slowdown' are given quantitative definitions with noted abuse of language flagged.",
+ "source": "haiku"
+ },
+ "intended_contribution_clear": {
+ "applies": true,
+ "answer": true,
+ "justification": "Section 1 explicitly lists five ways this study complements existing literature: frontier models, live OSS tasks, fixed outcome measure, experienced engineers with repository familiarity, and rich data collection.",
+ "source": "haiku"
+ },
+ "engagement_with_prior_work": {
+ "applies": true,
+ "answer": true,
+ "justification": "Section 1.1 reviews relevant literature across five subsections; Table 3 systematically compares this study to six prior works on four key methodological dimensions, explaining how design choices account for divergent findings.",
+ "source": "haiku"
+ }
+ }
+ },
+ "type_checklist": {
+ "empirical": {
+ "artifacts": {
+ "code_released": {
+ "applies": true,
+ "answer": false,
+ "justification": "No repository URL, code archive, or analysis scripts are provided in the paper. The paper describes detailed data collection and regression analyses but does not release code.",
+ "source": "opus"
+ },
+ "data_released": {
+ "applies": true,
+ "answer": false,
+ "justification": "No dataset download link is provided. The paper describes collecting screen recordings, developer forecasts, and implementation times, but does not release the underlying data.",
+ "source": "opus"
+ },
+ "environment_specified": {
+ "applies": true,
+ "answer": false,
+ "justification": "No environment specifications, requirements files, or dependency information is provided for reproducing the analyses.",
+ "source": "opus"
+ },
+ "reproduction_instructions": {
+ "applies": true,
+ "answer": false,
+ "justification": "No step-by-step reproduction instructions are provided. The regression specification is described in Appendix D, but there are no scripts or instructions to replicate the analysis.",
+ "source": "opus"
+ }
+ },
+ "statistical_methodology": {
+ "confidence_intervals_or_error_bars": {
+ "applies": true,
+ "answer": true,
+ "justification": "95% confidence intervals are reported throughout, using HC3 standard errors (Section D.2, Figure 15). The paper also reports CIs from alternative uncertainty estimation procedures including clustered standard errors and hierarchical bootstrap.",
+ "source": "opus"
+ },
+ "significance_tests": {
+ "applies": true,
+ "answer": true,
+ "justification": "The paper uses regression-based inference with HC3 standard errors and reports p-values (e.g., Table 4 balance checks with Welch t-tests, Table 5 chi-square test). Figure 15 shows alternative uncertainty procedures.",
+ "source": "opus"
+ },
+ "effect_sizes_reported": {
+ "applies": true,
+ "answer": true,
+ "justification": "The primary result is reported as a 19% increase in completion time (from the log-linear regression coefficient transformed via exp(β)-1). Context is provided with baseline completion times (Figure 4) and the effect is reported with confidence intervals.",
+ "source": "opus"
+ },
+ "sample_size_justified": {
+ "applies": true,
+ "answer": false,
+ "justification": "No power analysis or sample size justification is provided. The study has 16 developers and 246 tasks. The paper acknowledges being 'not powered for statistically significant multiple comparisons when subsetting our data' (Section 3.3) but does not justify why 16 developers was the chosen sample size.",
+ "source": "opus"
+ },
+ "variance_reported": {
+ "applies": true,
+ "answer": true,
+ "justification": "Standard deviations are reported in Table 4 for forecasted times. Multiple uncertainty estimation procedures are compared in Figure 15. The paper reports variance across developers (Figure 17) and across subsets.",
+ "source": "opus"
+ }
+ },
+ "evaluation_design": {
+ "baselines_included": {
+ "applies": true,
+ "answer": true,
+ "justification": "The study design inherently includes a baseline: the AI-disallowed condition serves as the control/baseline against which AI-allowed performance is compared. The paper also compares results against prior literature (Table 3).",
+ "source": "opus"
+ },
+ "baselines_contemporary": {
+ "applies": true,
+ "answer": true,
+ "justification": "The comparison is between AI-allowed and AI-disallowed conditions within the same study, using contemporary AI tools (Claude 3.5/3.7 Sonnet, GPT-4o, Gemini 2.5 Pro). Prior work comparisons in Table 3 include contemporary studies.",
+ "source": "opus"
+ },
+ "ablation_study": {
+ "applies": false,
+ "answer": false,
+ "justification": "This is an RCT measuring a single treatment (AI allowed vs. not allowed), not a multi-component system. There is no system to ablate.",
+ "source": "opus"
+ },
+ "multiple_metrics": {
+ "applies": true,
+ "answer": true,
+ "justification": "The paper reports multiple outcome measures: self-reported implementation time, screen recording time, pre-review and post-review time, lines of code per hour, and activity time breakdowns (Figure 6, Figure 22). Alternative estimators are also compared (Figure 13).",
+ "source": "opus"
+ },
+ "human_evaluation": {
+ "applies": true,
+ "answer": true,
+ "justification": "The study includes extensive human evaluation: 143 hours of screen recordings were manually labeled with fine-grained activity labels (Section 2.4), exit interviews and surveys were conducted (Section G.5), and PR quality was assessed through the repositories' standard review processes.",
+ "source": "opus"
+ },
+ "held_out_test_set": {
+ "applies": false,
+ "answer": false,
+ "justification": "This is an RCT, not a prediction task. There is no train/test split concept applicable here.",
+ "source": "opus"
+ },
+ "per_category_breakdown": {
+ "applies": true,
+ "answer": true,
+ "justification": "Extensive breakdowns are provided: per-developer speedup (Figure 17), by prior task exposure and external resource needs (Figure 7), by AI experience (Figure 10), by scope creep (Figure 9), by month (Figure 23), and by activity type (Figures 6, 19-21).",
+ "source": "opus"
+ },
+ "failure_cases_discussed": {
+ "applies": true,
+ "answer": true,
+ "justification": "The entire paper is essentially a discussion of a failure case (AI slowing developers down). Section C.1.4 discusses low AI reliability and developers' experiences with failed AI generations. Qualitative examples of AI failures are provided throughout.",
+ "source": "opus"
+ },
+ "negative_results_reported": {
+ "applies": true,
+ "answer": true,
+ "justification": "The core finding is a negative result: in this setting, allowing AI tools increased experienced developers' completion time by 19%, contradicting expectations. The paper also reports which hypothesized contributing factors had evidence against them (Table 1, Section C.3).",
+ "source": "opus"
+ }
+ },
+ "setup_transparency": {
+ "model_versions_specified": {
+ "applies": true,
+ "answer": true,
+ "justification": "The paper specifies models used: Claude 3.7 Sonnet (thinking mode), Claude 3.7 Sonnet, Claude 3.5 Sonnet, GPT-4o, Gemini 2.5 Pro, and o1, with usage percentages (Section C.3.6). These are marketing names without snapshot dates, but for an RCT studying tool usage in the wild, this represents what developers actually used.",
+ "source": "opus"
+ },
+ "prompts_provided": {
+ "applies": false,
+ "answer": false,
+ "justification": "This is an RCT where developers use AI tools naturally. There are no researcher-designed prompts — developers prompt AI tools as they see fit. The study measures the effect of allowing AI usage, not of specific prompts.",
+ "source": "opus"
+ },
+ "hyperparameters_reported": {
+ "applies": false,
+ "answer": false,
+ "justification": "This is an RCT where developers use commercial AI tools (Cursor Pro) with default settings. The researchers do not control hyperparameters — they are studying the tools as used in practice.",
+ "source": "opus"
+ },
+ "scaffolding_described": {
+ "applies": false,
+ "answer": false,
+ "justification": "The study evaluates Cursor Pro as a third-party tool used as-is by developers. The authors cannot describe Cursor's internal scaffolding. Section F.2.2 provides a primer on Cursor's features (chat, agent mode, autocomplete) as understood from the user perspective.",
+ "source": "opus"
+ },
+ "data_preprocessing_documented": {
+ "applies": true,
+ "answer": true,
+ "justification": "The paper documents data preprocessing: imputation of post-review times for unreviewed issues (Section C.3.4), filtering criteria for screen recordings (>10% broken time, >20% discrepancy with self-reported time, cheating removal), resulting in 74 of 128 recordings (Section 2.4). Issue dropout is documented (Section C.3.3).",
+ "source": "opus"
+ }
+ },
+ "data_integrity": {
+ "raw_data_available": {
+ "applies": true,
+ "answer": false,
+ "justification": "Raw data (screen recordings, implementation times, forecasts, survey responses) is not publicly available. Only aggregated results and regression outputs are presented.",
+ "source": "opus"
+ },
+ "data_collection_described": {
+ "applies": true,
+ "answer": true,
+ "justification": "Data collection is described in detail: Section 2.2.2 covers screen recordings, Cursor analytics, implementation notes. Section 2.4 covers developer forecasts, expert forecasts, exit interviews, and fine-grained activity labels. Appendix G provides full developer instructions.",
+ "source": "opus"
+ },
+ "recruitment_methods_described": {
+ "applies": true,
+ "answer": true,
+ "justification": "Section G describes recruitment: outreach via professional networks, Reddit communities, and GitHub profiles; filtering for 500+ star repos, 5+ recent commits; 51 initial respondents filtered to ~20 then 16. Selection criteria are explicit.",
+ "source": "opus"
+ },
+ "data_pipeline_documented": {
+ "applies": true,
+ "answer": true,
+ "justification": "The pipeline is documented: 51 interested developers → ~20 meeting criteria → 16 final participants (3 dropped, Section G.6). 246 tasks completed (136 AI-allowed, 110 AI-disallowed). Screen recording filtering: 128 recordings → 74 valid after quality filters (Section 2.4). Imputation methods for missing post-review times are described (Section C.3.4).",
+ "source": "opus"
+ }
+ },
+ "contamination": {
+ "training_cutoff_stated": {
+ "applies": false,
+ "answer": false,
+ "justification": "This is an RCT measuring developer productivity with AI tools, not an evaluation of a pre-trained model's capability on a benchmark.",
+ "source": "opus"
+ },
+ "train_test_overlap_discussed": {
+ "applies": false,
+ "answer": false,
+ "justification": "This is an RCT measuring developer productivity, not a benchmark evaluation where train/test overlap is relevant.",
+ "source": "opus"
+ },
+ "benchmark_contamination_addressed": {
+ "applies": false,
+ "answer": false,
+ "justification": "This is an RCT, not a benchmark evaluation.",
+ "source": "opus"
+ }
+ },
+ "human_studies": {
+ "pre_registered": {
+ "applies": true,
+ "answer": false,
+ "justification": "No mention of pre-registration (OSF, AsPredicted, AEA registry, or similar) anywhere in the paper.",
+ "source": "opus"
+ },
+ "irb_or_ethics_approval": {
+ "applies": true,
+ "answer": false,
+ "justification": "No mention of IRB or ethics board approval. The study involves human participants (developers) completing tasks and being recorded, but no ethics review is mentioned.",
+ "source": "opus"
+ },
+ "demographics_reported": {
+ "applies": true,
+ "answer": true,
+ "justification": "Developer demographics are reported: typically over a decade of software experience, 5 years average on their repository, 1,500 commits on average, 59% of repository lifetime. AI experience levels: 93% used LLMs, 44% used Cursor (Section 2.1). Table 7 provides per-developer statistics.",
+ "source": "opus"
+ },
+ "inclusion_exclusion_criteria": {
+ "applies": true,
+ "answer": true,
+ "justification": "Section G.2.1 lists explicit eligibility criteria: 1+ year professional experience, 6+ months as active maintainer, repository must be open source with 500+ stars and 3000+ LOC, must have a list of issues. Section G describes further filtering steps.",
+ "source": "opus"
+ },
+ "randomization_described": {
+ "applies": true,
+ "answer": true,
+ "justification": "Randomization is described: issues randomized via 'simulated fair coin flip' (Section 2.2). 25 early issues used block randomization before switching (Section E.3). Balance checks confirm successful randomization (Table 4, Table 5).",
+ "source": "opus"
+ },
+ "blinding_described": {
+ "applies": true,
+ "answer": true,
+ "justification": "The paper explicitly states 'this study is not blinded' (Section C.2.1) — developers know which condition they are in. This is acknowledged as a limitation and investigated as a potential confound (experimentally driven overuse, Section C.2.1).",
+ "source": "opus"
+ },
+ "attrition_reported": {
+ "applies": true,
+ "answer": true,
+ "justification": "Section G.6 reports 3 developers were dropped (2 due to repository pausing development, 1 due to cheating). Section C.3.3 reports issue dropout: 7 AI-disallowed and 6 AI-allowed issues not completed, plus 24 intentionally dropped issues with reasons. Robustness checks on dropout subsets are provided.",
+ "source": "opus"
+ }
+ },
+ "cost_and_practicality": {
+ "inference_cost_reported": {
+ "applies": true,
+ "answer": false,
+ "justification": "No AI inference costs or token consumption is reported. The paper mentions providing Cursor Pro subscriptions and paying developers $150/hour but does not report API costs or token usage.",
+ "source": "opus"
+ },
+ "compute_budget_stated": {
+ "applies": true,
+ "answer": false,
+ "justification": "Total study cost is partially disclosed ($150/hour for developers, with ~40 hour cap per developer) but total computational budget, API costs, and infrastructure costs are not stated.",
+ "source": "opus"
+ }
+ }
+ }
+ },
+ "claims": [
+ {
+ "claim": "AI tools increased task completion time by 19% for experienced OSS developers using Cursor Pro with Claude 3.5/3.7 Sonnet",
+ "evidence": "Log-linear regression on 246 tasks (136 AI-allowed, 110 AI-disallowed) with 95% CIs using HC3 standard errors; robust across five alternative estimators including simple ratio, alternative regression specifications, and screen-recording-based times",
+ "supported": "strong"
+ },
+ {
+ "claim": "Developers predicted 24% speedup before the study and estimated 20% speedup after, despite experiencing 19% slowdown",
+ "evidence": "Pre-task developer forecasts collected for all issues before randomization; post-study exit survey with weighted mean methodology (Equations 14-15); both directions confirmed with confidence intervals in Figure 1",
+ "supported": "strong"
+ },
+ {
+ "claim": "Economics experts predicted 39% speedup and ML experts predicted 38% speedup, dramatically overestimating AI productivity benefits",
+ "evidence": "Incentivized forecasts from 34 economics experts and 54 ML experts; Table 6 shows full distribution statistics including min, quartiles, max",
+ "supported": "strong"
+ },
+ {
+ "claim": "High developer familiarity with repositories contributes to the slowdown effect",
+ "evidence": "Moderate evidence from subgroup analyses (Figure 7): higher prior task exposure and lower external resource needs correlate with greater slowdown; qualitative developer reports corroborate; analysis is underpowered for statistical significance",
+ "supported": "moderate"
+ },
+ {
+ "claim": "AI code generation acceptance rate below 44% and high cleanup burden indicate low AI reliability in this setting",
+ "evidence": "Cursor analytics from 13 of 16 developers show <44% acceptance rate; 56% of developers report needing major changes to AI code; 100% report needing to modify AI-generated code; 9% of time spent reviewing/cleaning outputs",
+ "supported": "strong"
+ },
+ {
+ "claim": "75% of individual developers experienced slowdown when AI usage was allowed",
+ "evidence": "Per-developer speedup estimates in Figure 17 using heterogeneous treatment effects methodology (Equations 8-13); 12 of 16 developers show point estimates indicating slowdown",
+ "supported": "strong"
+ },
+ {
+ "claim": "Fully autonomous agents can implement core issue functionality but fail on quality requirements (documentation, linting, tests)",
+ "evidence": "Described as 'preliminary evidence (forthcoming)' in Section 4.1 for Claude 3.7 Sonnet on study repositories; not yet published and not directly supported by this paper's data",
+ "supported": "weak"
+ }
+ ],
+ "methodology_tags": [
+ "rct"
+ ],
+ "key_findings": "An RCT with 16 experienced open-source developers (5-year average repository tenure) completing 246 real tasks found that frontier AI tools (Cursor Pro with Claude 3.5/3.7 Sonnet) increased task completion time by 19%, directly contradicting developer predictions of 24% speedup and expert forecasts of 38-39% speedup from 88 economics and ML researchers. Developers maintained their mistaken perception of AI benefit even after completing the study, with post-hoc estimates of 20% speedup. Contributing factors identified include high developer familiarity making AI less useful, large/complex repository contexts exceeding AI context window reliability (<44% code acceptance rate), and over-optimism leading to continued AI use despite diminishing returns. Results are robust across multiple estimators, outcome measures, and analysis subsets, though 16-developer sample size limits subgroup power.",
+ "red_flags": [
+ {
+ "flag": "No pre-registration",
+ "detail": "The 21-factor analysis in Appendix C is described as 'a priori' but no pre-registration on OSF, AEA, or similar registry is cited; the boundary between genuinely pre-specified and post-hoc hypotheses cannot be verified."
+ },
+ {
+ "flag": "Small sample (n=16 developers)",
+ "detail": "Only 16 developers participate; the paper acknowledges being 'not powered for statistically significant multiple comparisons when subsetting our data,' yet multiple subgroup conclusions are drawn with wide confidence intervals."
+ },
+ {
+ "flag": "Non-blinded design",
+ "detail": "Developers know their condition assignment; Section C.2.1 finds mixed evidence about experimentally-driven AI overuse but cannot rule it out; screen-recording labelers are also not blinded to condition."
+ },
+ {
+ "flag": "Self-reported primary outcome",
+ "detail": "Completion times are self-reported per issue; while validated against screen recording durations on a subset (25% vs 24% slowdown), systematic bias from time-tracking burden in the study context cannot be eliminated."
+ },
+ {
+ "flag": "No IRB or ethics disclosure",
+ "detail": "Human subjects research involving paid participants, screen recordings of work, and exit interviews contains no mention of institutional ethics review or informed consent procedures."
+ },
+ {
+ "flag": "No data or code release",
+ "detail": "Neither raw data nor analysis code is made publicly available, preventing independent verification of the primary statistical claims despite detailed methodological appendices."
+ },
+ {
+ "flag": "No funding disclosure",
+ "detail": "METR's funding sources are not disclosed despite the organization evaluating tools from Anthropic (Claude) and Anysphere (Cursor); potential funder relationships to evaluated companies cannot be assessed."
+ }
+ ],
+ "cited_papers": [
+ {
+ "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
+ "relevance": "Primary methodological comparison point; found 56% speedup on synthetic tasks using non-frontier models, contrasting with this study's 19% slowdown on real tasks"
+ },
+ {
+ "title": "How Much Does AI Impact Development Speed? An Enterprise-Based Randomized Controlled Trial",
+ "relevance": "Another RCT finding 21% speedup using synthetic tasks; used as comparison for study design differences (synthetic vs. real tasks, non-fixed vs. fixed outcomes)"
+ },
+ {
+ "title": "The Effects of Generative AI on High-Skilled Work: Evidence from Three Field Experiments with Software Developers",
+ "relevance": "Three field experiments finding 26% output increase using non-fixed outcome measures (PRs); key methodological contrast for fixed vs. non-fixed outcome measure debate"
+ },
+ {
+ "title": "Generative AI at Work",
+ "relevance": "Customer service AI study finding benefits concentrated among less experienced workers; cited for heterogeneous effects framework motivating focus on expert developers"
+ },
+ {
+ "title": "Experimental Evidence on the Productivity Effects of Generative Artificial Intelligence",
+ "relevance": "Non-software domain (writing) RCT finding productivity gains; provides cross-domain comparison for AI productivity effects"
+ },
+ {
+ "title": "Generative AI and Labour Productivity: A Field Experiment on Coding",
+ "relevance": "BIS field experiment finding 55% output increase measured in PRs; cited as example of non-fixed outcome measure limitations"
+ },
+ {
+ "title": "Artificial Intelligence: The Ambiguous Labor Market Impact of Automating Prediction",
+ "relevance": "Agrawal et al. theoretical framework treating AI as prediction cost reduction with distributional consequences; motivates heterogeneous effects by experience level"
+ },
+ {
+ "title": "Measuring AI Ability to Complete Long Tasks",
+ "relevance": "METR's related work on autonomous task completion benchmarks; provides context on AI capability measurement and benchmark vs. real-world performance gap"
+ }
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 3,
+ "justification": "Directly measures impact of currently available tools (Cursor Pro, Claude 3.5/3.7 Sonnet) on real developer workflows, immediately actionable for practitioners and managers evaluating AI tool adoption."
+ },
+ "surprise_contrarian": {
+ "score": 3,
+ "justification": "Finding is dramatically contrary to near-universal expert and practitioner consensus; 19% slowdown vs. 38-39% predicted speedup from 88 experts represents a complete reversal of expectations with rigorous methodology."
+ },
+ "fear_safety": {
+ "score": 1,
+ "justification": "Indirectly relevant to AI safety by demonstrating that benchmark performance substantially overestimates real-world AI capabilities, suggesting capability assessments for AI risk may be miscalibrated."
+ },
+ "drama_conflict": {
+ "score": 2,
+ "justification": "Results directly contradict multiple prior published studies and expert consensus, creating methodological controversy about which study designs are appropriate for measuring AI productivity impact."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Cannot replicate the RCT; tools (Cursor Pro, Claude) are publicly available for personal experimentation but no interactive demo or replication dataset is released."
+ },
+ "brand_recognition": {
+ "score": 2,
+ "justification": "METR is a recognized AI safety evaluation organization; study evaluates Claude (Anthropic) and Cursor Pro (Anysphere), lending credibility through well-known tool and lab associations."
+ }
+ },
+ "hn_data": {
+ "threads": [
+ {
+ "hn_id": "36781015",
+ "title": "How is ChatGPT's behavior changing over time?",
+ "points": 289,
+ "comments": 178,
+ "url": "https://news.ycombinator.com/item?id=36781015",
+ "created_at": "2023-07-19T01:06:12Z"
+ },
+ {
+ "hn_id": "41215631",
+ "title": "Ask HN: Has degradation in the quality of ChatGPT and Claude been proven?",
+ "points": 42,
+ "comments": 40,
+ "url": "https://news.ycombinator.com/item?id=41215631",
+ "created_at": "2024-08-11T12:12:39Z"
+ },
+ {
+ "hn_id": "42764969",
+ "title": "Evolving Deeper LLM Thinking",
+ "points": 12,
+ "comments": 0,
+ "url": "https://news.ycombinator.com/item?id=42764969",
+ "created_at": "2025-01-20T04:24:10Z"
+ },
+ {
+ "hn_id": "45661775",
+ "title": "Measuring the Impact of Early-2025 AI on Experienced Developer Productivity",
+ "points": 4,
+ "comments": 2,
+ "url": "https://news.ycombinator.com/item?id=45661775",
+ "created_at": "2025-10-21T21:12:22Z"
+ },
+ {
+ "hn_id": "37265952",
+ "title": "The AI Reproducibility Crisis",
+ "points": 4,
+ "comments": 3,
+ "url": "https://news.ycombinator.com/item?id=37265952",
+ "created_at": "2023-08-25T19:15:29Z"
+ },
+ {
+ "hn_id": "45497568",
+ "title": "Fine-Tuning Small Language Models with Low-Rank Adapters to Mimic User Behaviors",
+ "points": 3,
+ "comments": 0,
+ "url": "https://news.ycombinator.com/item?id=45497568",
+ "created_at": "2025-10-06T23:40:50Z"
+ },
+ {
+ "hn_id": "45249175",
+ "title": "What do the fundamental constants of physics tell us about life?",
+ "points": 3,
+ "comments": 0,
+ "url": "https://news.ycombinator.com/item?id=45249175",
+ "created_at": "2025-09-15T13:02:16Z"
+ },
+ {
+ "hn_id": "44593569",
+ "title": "Measuring the Impact of Early-2025 AI on Experienced Developer Productivity",
+ "points": 2,
+ "comments": 0,
+ "url": "https://news.ycombinator.com/item?id=44593569",
+ "created_at": "2025-07-17T14:03:27Z"
+ },
+ {
+ "hn_id": "44783441",
+ "title": "Measuring the Impact of AI on Experienced Open-Source Developer Productivity",
+ "points": 1,
+ "comments": 1,
+ "url": "https://news.ycombinator.com/item?id=44783441",
+ "created_at": "2025-08-04T09:03:06Z"
+ },
+ {
+ "hn_id": "46254932",
+ "title": "Measuring Impact of Early-2025 AI on Experienced Open-Source Dev Productivity",
+ "points": 1,
+ "comments": 0,
+ "url": "https://news.ycombinator.com/item?id=46254932",
+ "created_at": "2025-12-13T14:54:29Z"
+ }
+ ],
+ "top_points": 289,
+ "total_points": 361,
+ "total_comments": 224
+ }
+}
+\ No newline at end of file
diff --git a/schema/scan-v4.schema.json b/schema/scan-v4.schema.json
@@ -0,0 +1,1040 @@
+{
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$id": "scan-v4.schema.json",
+ "title": "V4 Paper Scan Result",
+ "description": "Type-routed scan instrument. Shared core (15q) + type-specific module. Two-field boolean design: applies + answer.",
+ "type": "object",
+ "required": [
+ "scan_version",
+ "paper_type",
+ "paper",
+ "checklist",
+ "type_checklist",
+ "claims",
+ "methodology_tags",
+ "key_findings",
+ "red_flags",
+ "cited_papers",
+ "engagement_factors",
+ "hn_data"
+ ],
+ "properties": {
+ "scan_version": {
+ "type": "integer",
+ "const": 4
+ },
+ "paper_type": {
+ "type": "string",
+ "enum": [
+ "empirical",
+ "benchmark-creation",
+ "survey",
+ "position",
+ "theoretical"
+ ]
+ },
+ "paper": {
+ "type": "object",
+ "required": [
+ "title",
+ "authors",
+ "year"
+ ],
+ "properties": {
+ "title": {
+ "type": "string"
+ },
+ "authors": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "year": {
+ "type": "integer"
+ },
+ "venue": {
+ "type": "string"
+ },
+ "arxiv_id": {
+ "type": "string"
+ },
+ "doi": {
+ "type": "string"
+ }
+ }
+ },
+ "checklist": {
+ "type": "object",
+ "description": "Shared core checklist — applies to ALL paper types.",
+ "required": [
+ "claims_and_evidence",
+ "limitations_and_scope",
+ "conflicts_of_interest",
+ "scope_and_framing"
+ ],
+ "properties": {
+ "claims_and_evidence": {
+ "type": "object",
+ "description": "Do the claims stay within what the evidence supports?",
+ "required": [
+ "abstract_claims_supported",
+ "causal_claims_justified",
+ "generalization_bounded",
+ "alternative_explanations_discussed",
+ "proxy_outcome_distinction"
+ ],
+ "properties": {
+ "abstract_claims_supported": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are all claims in the abstract supported by the paper's content? Check each claim against the evidence presented."
+ },
+ "causal_claims_justified": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "If the paper makes causal claims ('X improves Y', 'X causes Y'), is the study design adequate for causal inference? Ablation studies count as causal claims. NA if no causal claims."
+ },
+ "generalization_bounded": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are generalizations bounded to the tested/argued setting? Broad titles or conclusions beyond the scope of the evidence = NO."
+ },
+ "alternative_explanations_discussed": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are alternative explanations or interpretations discussed? If only one interpretation is presented without considering alternatives = NO. NA for pure theoretical papers with no empirical claims."
+ },
+ "proxy_outcome_distinction": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Does the paper distinguish between what was measured and what is claimed? If claims match measurement granularity, YES. If 'lines of code' is called 'productivity' without discussion, NO."
+ }
+ }
+ },
+ "limitations_and_scope": {
+ "type": "object",
+ "description": "Does the paper honestly discuss what it does not show?",
+ "required": [
+ "limitations_section_present",
+ "threats_to_validity_specific",
+ "scope_boundaries_stated"
+ ],
+ "properties": {
+ "limitations_section_present": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is there a dedicated limitations or threats-to-validity section? A single sentence in the conclusion does not count."
+ },
+ "threats_to_validity_specific": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are specific threats discussed (not just boilerplate)? 'Our results may not generalize' is NO. 'Our sample of 16 developers is too small for subgroup analysis' is YES."
+ },
+ "scope_boundaries_stated": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are explicit scope boundaries stated — what the results/arguments do NOT show? Generic disclaimers don't count."
+ }
+ }
+ },
+ "conflicts_of_interest": {
+ "type": "object",
+ "description": "Are potential biases from funding, affiliation, or financial interest disclosed?",
+ "required": [
+ "funding_disclosed",
+ "affiliations_disclosed",
+ "funder_independent_of_outcome",
+ "financial_interests_declared"
+ ],
+ "properties": {
+ "funding_disclosed": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is the funding source disclosed? No mention of funding = NO. NA only if clearly unfunded independent work."
+ },
+ "affiliations_disclosed": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are author affiliations with the evaluated product or company disclosed?"
+ },
+ "funder_independent_of_outcome": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is the funder independent of the outcome? Company employees evaluating their own product = NO. NA if unfunded."
+ },
+ "financial_interests_declared": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are financial interests (patents, equity, consulting) declared? No competing interests statement = NO."
+ }
+ }
+ },
+ "scope_and_framing": {
+ "type": "object",
+ "description": "Is the paper's contribution clearly framed and situated?",
+ "required": [
+ "key_terms_defined",
+ "intended_contribution_clear",
+ "engagement_with_prior_work"
+ ],
+ "properties": {
+ "key_terms_defined": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are key terms defined precisely? If the paper uses terms like 'agent', 'productivity', 'alignment' without defining what they mean in context, NO."
+ },
+ "intended_contribution_clear": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is the intended contribution explicitly stated? The reader should know what the paper claims to add — a tool, a finding, a framework, a dataset, an argument."
+ },
+ "engagement_with_prior_work": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Does the paper engage with prior work in its area? Not just a related work section listing papers, but showing how this work relates to, builds on, or differs from existing contributions."
+ }
+ }
+ }
+ }
+ },
+ "type_checklist": {
+ "type": "object",
+ "description": "Type-specific checklist module. Exactly one key matching paper_type.",
+ "properties": {
+ "empirical": {
+ "type": "object",
+ "properties": {
+ "artifacts": {
+ "type": "object",
+ "description": "Can someone reproduce this work from what was released?",
+ "required": [
+ "code_released",
+ "data_released",
+ "environment_specified",
+ "reproduction_instructions"
+ ],
+ "properties": {
+ "code_released": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is source code released? A promise of future release or 'available upon request' = NO."
+ },
+ "data_released": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is the dataset released or publicly available? Standard public benchmarks used unmodified = YES."
+ },
+ "environment_specified": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are environment/dependency specs provided (requirements.txt, Dockerfile, etc.)? 'Python 3.x' alone is NOT enough."
+ },
+ "reproduction_instructions": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are step-by-step reproduction instructions included? Must be specific enough to follow without guessing."
+ }
+ }
+ },
+ "statistical_methodology": {
+ "type": "object",
+ "description": "Are the numbers treated with appropriate rigor?",
+ "required": [
+ "confidence_intervals_or_error_bars",
+ "significance_tests",
+ "effect_sizes_reported",
+ "sample_size_justified",
+ "variance_reported"
+ ],
+ "properties": {
+ "confidence_intervals_or_error_bars": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are CIs or error bars reported for main results?"
+ },
+ "significance_tests": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are statistical significance tests used where comparative claims are made? NA if no comparative claims."
+ },
+ "effect_sizes_reported": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are effect sizes reported, not just p-values? Percentage improvement with baseline context counts."
+ },
+ "sample_size_justified": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is sample size justified or power analysis discussed?"
+ },
+ "variance_reported": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is variance/std dev reported across runs? Medians without spread = NO."
+ }
+ }
+ },
+ "evaluation_design": {
+ "type": "object",
+ "description": "Is the evaluation designed to actually test the claims?",
+ "required": [
+ "baselines_included",
+ "baselines_contemporary",
+ "ablation_study",
+ "multiple_metrics",
+ "human_evaluation",
+ "held_out_test_set",
+ "per_category_breakdown",
+ "failure_cases_discussed",
+ "negative_results_reported"
+ ],
+ "properties": {
+ "baselines_included": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are baseline comparisons included?"
+ },
+ "baselines_contemporary": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are baselines contemporary and competitive? Suspiciously old/weak baselines = NO."
+ },
+ "ablation_study": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is there an ablation study? NA if only one component."
+ },
+ "multiple_metrics": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are multiple evaluation metrics used?"
+ },
+ "human_evaluation": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is human evaluation included? Must evaluate system outputs, not just dataset construction. NA if clearly irrelevant."
+ },
+ "held_out_test_set": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are results on a held-out test set? NA if not a prediction task."
+ },
+ "per_category_breakdown": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are per-category or per-task breakdowns provided?"
+ },
+ "failure_cases_discussed": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are failure cases shown or discussed?"
+ },
+ "negative_results_reported": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are negative results reported?"
+ }
+ }
+ },
+ "setup_transparency": {
+ "type": "object",
+ "description": "Is the setup described well enough to understand what was tested?",
+ "required": [
+ "model_versions_specified",
+ "prompts_provided",
+ "hyperparameters_reported",
+ "scaffolding_described",
+ "data_preprocessing_documented"
+ ],
+ "properties": {
+ "model_versions_specified": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are exact model versions specified? Marketing names without snapshot dates = NO."
+ },
+ "prompts_provided": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are actual prompts/system instructions provided? Templates with placeholders = NO unless fill values also given."
+ },
+ "hyperparameters_reported": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are hyperparameters reported (temperature, top-p, etc.)?"
+ },
+ "scaffolding_described": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is agentic scaffolding described in detail? NA if no scaffolding or evaluating black-box tools."
+ },
+ "data_preprocessing_documented": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are data preprocessing and filtering steps documented?"
+ }
+ }
+ },
+ "data_integrity": {
+ "type": "object",
+ "description": "Can the underlying data be verified?",
+ "required": [
+ "raw_data_available",
+ "data_collection_described",
+ "recruitment_methods_described",
+ "data_pipeline_documented"
+ ],
+ "properties": {
+ "raw_data_available": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is raw data available for independent verification?"
+ },
+ "data_collection_described": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is the data collection procedure described in detail?"
+ },
+ "recruitment_methods_described": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are participant/sample recruitment methods described? NA if standard benchmark with no recruitment."
+ },
+ "data_pipeline_documented": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is the full data pipeline from collection to analysis documented?"
+ }
+ }
+ },
+ "contamination": {
+ "type": "object",
+ "description": "Could the model have seen the test data during training?",
+ "required": [
+ "training_cutoff_stated",
+ "train_test_overlap_discussed",
+ "benchmark_contamination_addressed"
+ ],
+ "properties": {
+ "training_cutoff_stated": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is the model's training data cutoff stated? NA if not evaluating model capabilities on benchmarks."
+ },
+ "train_test_overlap_discussed": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is potential train/test overlap discussed? NA if not evaluating model capabilities on benchmarks."
+ },
+ "benchmark_contamination_addressed": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Does the paper address whether benchmark examples were available before the model's training cutoff? NA if benchmark created after cutoff or not evaluating model on benchmarks."
+ }
+ }
+ },
+ "human_studies": {
+ "type": "object",
+ "description": "For papers involving human participants. All NA if no human subjects.",
+ "required": [
+ "pre_registered",
+ "irb_or_ethics_approval",
+ "demographics_reported",
+ "inclusion_exclusion_criteria",
+ "randomization_described",
+ "blinding_described",
+ "attrition_reported"
+ ],
+ "properties": {
+ "pre_registered": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is the study pre-registered? NA if no human participants."
+ },
+ "irb_or_ethics_approval": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is IRB/ethics approval mentioned? NA if no human participants."
+ },
+ "demographics_reported": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are participant demographics reported? NA if no human participants."
+ },
+ "inclusion_exclusion_criteria": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are inclusion/exclusion criteria stated? NA if no human participants."
+ },
+ "randomization_described": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is randomization described? NA if not experimental or no human participants."
+ },
+ "blinding_described": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is blinding described? NA if not feasible or no human participants."
+ },
+ "attrition_reported": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is attrition/dropout reported? NA if no human participants."
+ }
+ }
+ },
+ "cost_and_practicality": {
+ "type": "object",
+ "description": "Is the practical cost reported?",
+ "required": [
+ "inference_cost_reported",
+ "compute_budget_stated"
+ ],
+ "properties": {
+ "inference_cost_reported": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is inference cost or latency reported? NA if clearly irrelevant."
+ },
+ "compute_budget_stated": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is the total computational budget stated?"
+ }
+ }
+ }
+ }
+ },
+ "benchmark-creation": {
+ "type": "object",
+ "properties": {
+ "construct_design": {
+ "type": "object",
+ "description": "Is the benchmark designed to measure what it claims?",
+ "required": [
+ "construct_validity_argued",
+ "difficulty_distribution_characterized",
+ "ceiling_floor_effects_checked",
+ "human_baseline_included",
+ "scoring_rubric_justified"
+ ],
+ "properties": {
+ "construct_validity_argued": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Does the paper argue why this benchmark measures the claimed capability? Not just 'we test X' but 'X measures Y because Z'."
+ },
+ "difficulty_distribution_characterized": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is the difficulty distribution of benchmark items characterized? Are there easy, medium, hard tiers? Is difficulty measured or just assumed?"
+ },
+ "ceiling_floor_effects_checked": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are ceiling/floor effects checked? If all models score >90% or <10%, the benchmark isn't discriminating."
+ },
+ "human_baseline_included": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is there a human baseline? How do humans perform on this benchmark?"
+ },
+ "scoring_rubric_justified": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is the scoring rubric justified? Why this metric and not another? Are edge cases in scoring addressed?"
+ }
+ }
+ },
+ "robustness": {
+ "type": "object",
+ "description": "Will this benchmark remain useful over time?",
+ "required": [
+ "contamination_resistance_designed",
+ "temporal_robustness_discussed",
+ "failure_modes_discussed",
+ "baseline_implementations_provided"
+ ],
+ "properties": {
+ "contamination_resistance_designed": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is contamination resistance designed in? Temporal splits, canary strings, dynamic generation, or other anti-gaming measures?"
+ },
+ "temporal_robustness_discussed": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is temporal robustness discussed? Will this benchmark be gamed or obsoleted in 6 months? Is there a plan for updates?"
+ },
+ "failure_modes_discussed": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are failure modes of the benchmark itself discussed? What doesn't it measure? What could game it?"
+ },
+ "baseline_implementations_provided": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are baseline implementations provided so others can reproduce the reported numbers?"
+ }
+ }
+ },
+ "documentation": {
+ "type": "object",
+ "description": "Is the benchmark documented for reuse?",
+ "required": [
+ "dataset_documentation_complete",
+ "licensing_and_access_clear",
+ "intended_use_specified"
+ ],
+ "properties": {
+ "dataset_documentation_complete": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is dataset documentation complete? Data card, source description, collection methodology, preprocessing steps."
+ },
+ "licensing_and_access_clear": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is licensing and access clear? Can others actually use this benchmark? Under what terms?"
+ },
+ "intended_use_specified": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is the intended use specified? What should and should NOT be concluded from benchmark results?"
+ }
+ }
+ }
+ }
+ },
+ "survey": {
+ "type": "object",
+ "properties": {
+ "search_and_selection": {
+ "type": "object",
+ "description": "Is the review process systematic and reproducible?",
+ "required": [
+ "search_strategy_reproducible",
+ "inclusion_exclusion_explicit",
+ "prisma_or_structured_protocol",
+ "search_terms_provided",
+ "databases_listed",
+ "screening_process_documented",
+ "review_scope_justified"
+ ],
+ "properties": {
+ "search_strategy_reproducible": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is the search strategy reproducible? Could someone re-run the same searches and get the same initial result set?"
+ },
+ "inclusion_exclusion_explicit": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are inclusion/exclusion criteria explicit and applied consistently?"
+ },
+ "prisma_or_structured_protocol": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Does the survey follow PRISMA or another structured review protocol?"
+ },
+ "search_terms_provided": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are the actual search terms/queries provided?"
+ },
+ "databases_listed": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are the databases/sources searched listed explicitly?"
+ },
+ "screening_process_documented": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is the screening process documented with counts at each stage?"
+ },
+ "review_scope_justified": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is the review scope justified? Why these years, venues, topics?"
+ }
+ }
+ },
+ "synthesis_quality": {
+ "type": "object",
+ "description": "Does the synthesis add value beyond listing papers?",
+ "required": [
+ "conflicting_findings_acknowledged",
+ "quality_assessment_of_sources",
+ "publication_bias_discussed",
+ "quantitative_synthesis_present",
+ "recommendations_supported_by_evidence"
+ ],
+ "properties": {
+ "conflicting_findings_acknowledged": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are conflicting findings across reviewed papers acknowledged and discussed?"
+ },
+ "quality_assessment_of_sources": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Does the survey assess the quality of its source papers? A quality rubric, risk-of-bias assessment, or structured evaluation?"
+ },
+ "publication_bias_discussed": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is publication bias discussed? Does the survey acknowledge that published papers skew positive?"
+ },
+ "quantitative_synthesis_present": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is there quantitative synthesis (meta-analysis, vote counting, effect size aggregation) or just narrative?"
+ },
+ "recommendations_supported_by_evidence": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are recommendations supported by the reviewed evidence, not just author opinion?"
+ }
+ }
+ }
+ }
+ },
+ "position": {
+ "type": "object",
+ "properties": {
+ "argument_quality": {
+ "type": "object",
+ "description": "Is the argument well-constructed?",
+ "required": [
+ "argument_internally_consistent",
+ "counterarguments_addressed",
+ "analogies_appropriate",
+ "prescriptions_proportional",
+ "evidence_for_claims_cited",
+ "alternatives_discussed",
+ "historical_context_accurate"
+ ],
+ "properties": {
+ "argument_internally_consistent": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is the argument internally consistent? Do the conclusions follow from the premises? Are there contradictions?"
+ },
+ "counterarguments_addressed": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are counterarguments addressed — the strongest version, not a strawman? Does the paper engage with the best opposing view?"
+ },
+ "analogies_appropriate": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are analogies appropriate? If the paper draws parallels (e.g., 'AI is like electricity'), are the parallels valid or are they false equivalences?"
+ },
+ "prescriptions_proportional": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are prescriptive claims proportional to the argument? Sweeping policy recommendations require stronger support than narrow suggestions."
+ },
+ "evidence_for_claims_cited": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is evidence cited for factual claims? Assertions presented as fact should reference sources."
+ },
+ "alternatives_discussed": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are alternatives to the proposed viewpoint/framework discussed? Not just 'our view is right' but 'here are other views and why we prefer ours'."
+ },
+ "historical_context_accurate": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is historical context accurate? If the paper references historical events, technologies, or intellectual traditions, are those references correct?"
+ }
+ }
+ },
+ "clarity_and_scope": {
+ "type": "object",
+ "description": "Is the paper's scope and audience clear?",
+ "required": [
+ "key_terms_defined_precisely",
+ "engages_with_existing_literature",
+ "intended_audience_clear",
+ "assumptions_stated",
+ "scope_of_applicability_discussed"
+ ],
+ "properties": {
+ "key_terms_defined_precisely": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are key terms defined precisely in context? Not just used but defined — what does 'agent' or 'alignment' mean in this paper specifically?"
+ },
+ "engages_with_existing_literature": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Does it engage substantively with existing literature on this position? Not just citing but discussing, comparing, building on."
+ },
+ "intended_audience_clear": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is the intended audience clear? Is this for policymakers, researchers, practitioners, the public?"
+ },
+ "assumptions_stated": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are the paper's assumptions stated explicitly? What must the reader accept for the argument to work?"
+ },
+ "scope_of_applicability_discussed": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is the scope of applicability discussed? Where does this argument apply and where doesn't it?"
+ }
+ }
+ }
+ }
+ },
+ "theoretical": {
+ "type": "object",
+ "properties": {
+ "formal_quality": {
+ "type": "object",
+ "description": "Is the formal work rigorous?",
+ "required": [
+ "assumptions_stated_explicitly",
+ "proofs_complete_or_sketched",
+ "bounds_tight_or_discussed",
+ "counterexamples_explored",
+ "notation_consistent",
+ "constructive_vs_existence_noted"
+ ],
+ "properties": {
+ "assumptions_stated_explicitly": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are all assumptions stated explicitly? Hidden assumptions in proofs or models = NO."
+ },
+ "proofs_complete_or_sketched": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are proofs complete, or clearly sketched with references to full versions? Proof 'left to reader' without sketch = NO."
+ },
+ "bounds_tight_or_discussed": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are bounds tight, or is tightness discussed? If a bound is loose, is that acknowledged?"
+ },
+ "counterexamples_explored": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are counterexamples or edge cases explored? Does the paper test the limits of its own results?"
+ },
+ "notation_consistent": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is notation consistent throughout? Overloaded symbols or inconsistent conventions = NO."
+ },
+ "constructive_vs_existence_noted": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is it noted whether results are constructive or existence-only? Can you compute the thing proved to exist?"
+ }
+ }
+ },
+ "connections": {
+ "type": "object",
+ "description": "Is the theoretical work connected to practice and prior work?",
+ "required": [
+ "connection_to_practice_discussed",
+ "relationship_to_prior_work_clear",
+ "computational_complexity_discussed",
+ "limitations_of_formal_model_stated"
+ ],
+ "properties": {
+ "connection_to_practice_discussed": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is the connection to practice discussed? What does this theorem mean for practitioners?"
+ },
+ "relationship_to_prior_work_clear": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is the relationship to prior theoretical work clear? What does this extend, generalize, or contradict?"
+ },
+ "computational_complexity_discussed": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Is computational complexity discussed where relevant? Is the algorithm tractable?"
+ },
+ "limitations_of_formal_model_stated": {
+ "$ref": "#/$defs/checklist_item",
+ "description": "Are limitations of the formal model stated? What does the model NOT capture about reality?"
+ }
+ }
+ }
+ }
+ }
+ }
+ },
+ "claims": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "required": [
+ "claim",
+ "evidence",
+ "supported"
+ ],
+ "properties": {
+ "claim": {
+ "type": "string"
+ },
+ "evidence": {
+ "type": "string"
+ },
+ "supported": {
+ "type": "string",
+ "enum": [
+ "strong",
+ "moderate",
+ "weak",
+ "unsupported"
+ ]
+ }
+ }
+ }
+ },
+ "methodology_tags": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": [
+ "rct",
+ "observational",
+ "benchmark-eval",
+ "case-study",
+ "meta-analysis",
+ "theoretical",
+ "qualitative"
+ ]
+ }
+ },
+ "key_findings": {
+ "type": "string"
+ },
+ "red_flags": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "required": [
+ "flag",
+ "detail"
+ ],
+ "properties": {
+ "flag": {
+ "type": "string"
+ },
+ "detail": {
+ "type": "string"
+ }
+ }
+ }
+ },
+ "cited_papers": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "required": [
+ "title",
+ "relevance"
+ ],
+ "properties": {
+ "title": {
+ "type": "string"
+ },
+ "authors": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "year": {
+ "type": "integer"
+ },
+ "arxiv_id": {
+ "type": "string"
+ },
+ "doi": {
+ "type": "string"
+ },
+ "relevance": {
+ "type": "string"
+ }
+ }
+ }
+ },
+ "engagement_factors": {
+ "type": "object",
+ "required": [
+ "practical_relevance",
+ "surprise_contrarian",
+ "fear_safety",
+ "drama_conflict",
+ "demo_ability",
+ "brand_recognition"
+ ],
+ "properties": {
+ "practical_relevance": {
+ "type": "object",
+ "required": [
+ "score",
+ "justification"
+ ],
+ "properties": {
+ "score": {
+ "type": "integer",
+ "minimum": 0,
+ "maximum": 3
+ },
+ "justification": {
+ "type": "string"
+ }
+ }
+ },
+ "surprise_contrarian": {
+ "type": "object",
+ "required": [
+ "score",
+ "justification"
+ ],
+ "properties": {
+ "score": {
+ "type": "integer",
+ "minimum": 0,
+ "maximum": 3
+ },
+ "justification": {
+ "type": "string"
+ }
+ }
+ },
+ "fear_safety": {
+ "type": "object",
+ "required": [
+ "score",
+ "justification"
+ ],
+ "properties": {
+ "score": {
+ "type": "integer",
+ "minimum": 0,
+ "maximum": 3
+ },
+ "justification": {
+ "type": "string"
+ }
+ }
+ },
+ "drama_conflict": {
+ "type": "object",
+ "required": [
+ "score",
+ "justification"
+ ],
+ "properties": {
+ "score": {
+ "type": "integer",
+ "minimum": 0,
+ "maximum": 3
+ },
+ "justification": {
+ "type": "string"
+ }
+ }
+ },
+ "demo_ability": {
+ "type": "object",
+ "required": [
+ "score",
+ "justification"
+ ],
+ "properties": {
+ "score": {
+ "type": "integer",
+ "minimum": 0,
+ "maximum": 3
+ },
+ "justification": {
+ "type": "string"
+ }
+ }
+ },
+ "brand_recognition": {
+ "type": "object",
+ "required": [
+ "score",
+ "justification"
+ ],
+ "properties": {
+ "score": {
+ "type": "integer",
+ "minimum": 0,
+ "maximum": 3
+ },
+ "justification": {
+ "type": "string"
+ }
+ }
+ }
+ }
+ },
+ "hn_data": {
+ "type": "object",
+ "properties": {
+ "threads": {
+ "type": "array"
+ },
+ "top_points": {
+ "type": "integer"
+ },
+ "total_points": {
+ "type": "integer"
+ },
+ "total_comments": {
+ "type": "integer"
+ }
+ }
+ }
+ },
+ "$defs": {
+ "checklist_item": {
+ "type": "object",
+ "required": [
+ "applies",
+ "answer",
+ "justification"
+ ],
+ "properties": {
+ "applies": {
+ "type": "boolean",
+ "description": "Does this criterion apply to this paper type?"
+ },
+ "answer": {
+ "type": "boolean",
+ "description": "Does the paper satisfy this criterion? false when applies=false."
+ },
+ "justification": {
+ "type": "string",
+ "description": "1-3 sentences explaining the answer."
+ },
+ "source": {
+ "type": "string",
+ "enum": [
+ "opus",
+ "haiku"
+ ],
+ "description": "Which model produced this answer."
+ }
+ }
+ }
+ }
+}
diff --git a/scripts/gen-v4-schema.py b/scripts/gen-v4-schema.py
@@ -0,0 +1,412 @@
+#!/usr/bin/env python3
+"""Generate the v4 scan schema as JSON from a structured Python definition.
+
+This is easier to maintain than hand-editing a giant JSON file.
+Run: python3 scripts/gen-v4-schema.py > schema/scan-v4.schema.json
+"""
+
+import json
+
+def q(desc):
+    """Return one checklist question: a ``$ref`` to the shared checklist_item def, with ``desc`` as its description."""
+    return dict(zip(("$ref", "description"), ("#/$defs/checklist_item", desc)))
+
+CHECKLIST_ITEM = {  # schema for one checklist answer; matches $defs/checklist_item, the target of q()'s $ref
+    "type": "object",
+    "required": ["applies", "answer", "justification"],  # "source" is deliberately optional
+    "properties": {
+        "applies": {"type": "boolean", "description": "Does this criterion apply to this paper type?"},
+        "answer": {"type": "boolean", "description": "Does the paper satisfy this criterion? false when applies=false."},
+        "justification": {"type": "string", "description": "1-3 sentences explaining the answer."},
+        "source": {"type": "string", "enum": ["opus", "haiku", "sonnet"], "description": "Which model produced this answer."},  # sonnet: fallback model for large papers
+    },
+}
+
+ENGAGEMENT_FACTOR = dict(  # schema for one engagement factor: a score plus its justification
+    type="object",
+    required=["score", "justification"],
+    properties=dict(
+        score=dict(type="integer", minimum=0, maximum=3),  # integer scale, 0 to 3 inclusive
+        justification=dict(type="string"),
+    ),
+)
+
+# ── Shared Core (15 questions, 4 categories) ──────────────────────────
+
+# Each category's "required" list is derived from its question keys, so the two cannot drift apart.
+SHARED_CORE = {
+    category: {"type": "object", "description": summary, "required": list(questions), "properties": questions}
+    for category, summary, questions in (
+        (
+            "claims_and_evidence",
+            "Do the claims stay within what the evidence supports?",
+            {
+                "abstract_claims_supported": q("Are all claims in the abstract supported by the paper's content? Check each claim against the evidence presented."),
+                "causal_claims_justified": q("If the paper makes causal claims ('X improves Y', 'X causes Y'), is the study design adequate for causal inference? Ablation studies count as causal claims. NA if no causal claims."),
+                "generalization_bounded": q("Are generalizations bounded to the tested/argued setting? Broad titles or conclusions beyond the scope of the evidence = NO."),
+                "alternative_explanations_discussed": q("Are alternative explanations or interpretations discussed? If only one interpretation is presented without considering alternatives = NO. NA for pure theoretical papers with no empirical claims."),
+                "proxy_outcome_distinction": q("Does the paper distinguish between what was measured and what is claimed? If claims match measurement granularity, YES. If 'lines of code' is called 'productivity' without discussion, NO."),
+            },
+        ),
+        (
+            "limitations_and_scope",
+            "Does the paper honestly discuss what it does not show?",
+            {
+                "limitations_section_present": q("Is there a dedicated limitations or threats-to-validity section? A single sentence in the conclusion does not count."),
+                "threats_to_validity_specific": q("Are specific threats discussed (not just boilerplate)? 'Our results may not generalize' is NO. 'Our sample of 16 developers is too small for subgroup analysis' is YES."),
+                "scope_boundaries_stated": q("Are explicit scope boundaries stated — what the results/arguments do NOT show? Generic disclaimers don't count."),
+            },
+        ),
+        (
+            "conflicts_of_interest",
+            "Are potential biases from funding, affiliation, or financial interest disclosed?",
+            {
+                "funding_disclosed": q("Is the funding source disclosed? No mention of funding = NO. NA only if clearly unfunded independent work."),
+                "affiliations_disclosed": q("Are author affiliations with the evaluated product or company disclosed?"),
+                "funder_independent_of_outcome": q("Is the funder independent of the outcome? Company employees evaluating their own product = NO. NA if unfunded."),
+                "financial_interests_declared": q("Are financial interests (patents, equity, consulting) declared? No competing interests statement = NO."),
+            },
+        ),
+        (
+            "scope_and_framing",
+            "Is the paper's contribution clearly framed and situated?",
+            {
+                "key_terms_defined": q("Are key terms defined precisely? If the paper uses terms like 'agent', 'productivity', 'alignment' without defining what it means in context, NO."),
+                "intended_contribution_clear": q("Is the intended contribution explicitly stated? The reader should know what the paper claims to add — a tool, a finding, a framework, a dataset, an argument."),
+                "engagement_with_prior_work": q("Does the paper engage with prior work in its area? Not just a related work section listing papers, but showing how this work relates to, builds on, or differs from existing contributions."),
+            },
+        ),
+    )
+}
+
+# ── Type-Specific Modules ─────────────────────────────────────────────
+
+# Type-specific module for empirical papers: 8 categories, 39 questions in total.
+# Each entry is (category key, category description, {question key: q(question text)}).
+# The comprehension expands every entry into a JSON Schema object and derives
+# "required" from the question keys in declaration order, so every question
+# stays mandatory and the two lists cannot drift apart.
+EMPIRICAL_MODULE = {
+    category: {"type": "object", "description": summary, "required": list(questions), "properties": questions}
+    for category, summary, questions in (
+        (
+            "artifacts",
+            "Can someone reproduce this work from what was released?",
+            {
+                "code_released": q("Is source code released? A promise of future release or 'available upon request' = NO."),
+                "data_released": q("Is the dataset released or publicly available? Standard public benchmarks used unmodified = YES."),
+                "environment_specified": q("Are environment/dependency specs provided (requirements.txt, Dockerfile, etc.)? 'Python 3.x' alone is NOT enough."),
+                "reproduction_instructions": q("Are step-by-step reproduction instructions included? Must be specific enough to follow without guessing."),
+            },
+        ),
+        (
+            "statistical_methodology",
+            "Are the numbers treated with appropriate rigor?",
+            {
+                "confidence_intervals_or_error_bars": q("Are CIs or error bars reported for main results?"),
+                "significance_tests": q("Are statistical significance tests used where comparative claims are made? NA if no comparative claims."),
+                "effect_sizes_reported": q("Are effect sizes reported, not just p-values? Percentage improvement with baseline context counts."),
+                "sample_size_justified": q("Is sample size justified or power analysis discussed?"),
+                "variance_reported": q("Is variance/std dev reported across runs? Medians without spread = NO."),
+            },
+        ),
+        (
+            "evaluation_design",
+            "Is the evaluation designed to actually test the claims?",
+            {
+                "baselines_included": q("Are baseline comparisons included?"),
+                "baselines_contemporary": q("Are baselines contemporary and competitive? Suspiciously old/weak baselines = NO."),
+                "ablation_study": q("Is there an ablation study? NA if only one component."),
+                "multiple_metrics": q("Are multiple evaluation metrics used?"),
+                "human_evaluation": q("Is human evaluation included? Must evaluate system outputs, not just dataset construction. NA if clearly irrelevant."),
+                "held_out_test_set": q("Are results on a held-out test set? NA if not a prediction task."),
+                "per_category_breakdown": q("Are per-category or per-task breakdowns provided?"),
+                "failure_cases_discussed": q("Are failure cases shown or discussed?"),
+                "negative_results_reported": q("Are negative results reported?"),
+            },
+        ),
+        (
+            "setup_transparency",
+            "Is the setup described well enough to understand what was tested?",
+            {
+                "model_versions_specified": q("Are exact model versions specified? Marketing names without snapshot dates = NO."),
+                "prompts_provided": q("Are actual prompts/system instructions provided? Templates with placeholders = NO unless fill values also given."),
+                "hyperparameters_reported": q("Are hyperparameters reported (temperature, top-p, etc.)?"),
+                "scaffolding_described": q("Is agentic scaffolding described in detail? NA if no scaffolding or evaluating black-box tools."),
+                "data_preprocessing_documented": q("Are data preprocessing and filtering steps documented?"),
+            },
+        ),
+        (
+            "data_integrity",
+            "Can the underlying data be verified?",
+            {
+                "raw_data_available": q("Is raw data available for independent verification?"),
+                "data_collection_described": q("Is the data collection procedure described in detail?"),
+                "recruitment_methods_described": q("Are participant/sample recruitment methods described? NA if standard benchmark with no recruitment."),
+                "data_pipeline_documented": q("Is the full data pipeline from collection to analysis documented?"),
+            },
+        ),
+        (
+            "contamination",
+            "Could the model have seen the test data during training?",
+            {
+                "training_cutoff_stated": q("Is the model's training data cutoff stated? NA if not evaluating model capabilities on benchmarks."),
+                "train_test_overlap_discussed": q("Is potential train/test overlap discussed? NA same as above."),
+                "benchmark_contamination_addressed": q("Were benchmark examples available before training cutoff? NA if benchmark created after cutoff or not evaluating model on benchmarks."),
+            },
+        ),
+        (
+            "human_studies",
+            "For papers involving human participants. All NA if no human subjects.",
+            {
+                "pre_registered": q("Is the study pre-registered? NA if no human participants."),
+                "irb_or_ethics_approval": q("Is IRB/ethics approval mentioned? NA if no human participants."),
+                "demographics_reported": q("Are participant demographics reported? NA if no human participants."),
+                "inclusion_exclusion_criteria": q("Are inclusion/exclusion criteria stated? NA if no human participants."),
+                "randomization_described": q("Is randomization described? NA if not experimental or no human participants."),
+                "blinding_described": q("Is blinding described? NA if not feasible or no human participants."),
+                "attrition_reported": q("Is attrition/dropout reported? NA if no human participants."),
+            },
+        ),
+        (
+            "cost_and_practicality",
+            "Is the practical cost reported?",
+            {
+                "inference_cost_reported": q("Is inference cost or latency reported? NA if clearly irrelevant."),
+                "compute_budget_stated": q("Is the total computational budget stated?"),
+            },
+        ),
+    )
+}
+
+BENCHMARK_MODULE = {  # type-specific module for benchmark papers: 3 categories, 12 questions
+    category: {"type": "object", "description": summary, "required": list(questions), "properties": questions}
+    for category, summary, questions in (  # "required" is derived from each category's question keys
+        (
+            "construct_design",
+            "Is the benchmark designed to measure what it claims?",
+            {
+                "construct_validity_argued": q("Does the paper argue why this benchmark measures the claimed capability? Not just 'we test X' but 'X measures Y because Z'."),
+                "difficulty_distribution_characterized": q("Is the difficulty distribution of benchmark items characterized? Are there easy, medium, hard tiers? Is difficulty measured or just assumed?"),
+                "ceiling_floor_effects_checked": q("Are ceiling/floor effects checked? If all models score >90% or <10%, the benchmark isn't discriminating."),
+                "human_baseline_included": q("Is there a human baseline? How do humans perform on this benchmark?"),
+                "scoring_rubric_justified": q("Is the scoring rubric justified? Why this metric and not another? Are edge cases in scoring addressed?"),
+            },
+        ),
+        (
+            "robustness",
+            "Will this benchmark remain useful over time?",
+            {
+                "contamination_resistance_designed": q("Is contamination resistance designed in? Temporal splits, canary strings, dynamic generation, or other anti-gaming measures?"),
+                "temporal_robustness_discussed": q("Is temporal robustness discussed? Will this benchmark be gamed or obsoleted in 6 months? Is there a plan for updates?"),
+                "failure_modes_discussed": q("Are failure modes of the benchmark itself discussed? What doesn't it measure? What could game it?"),
+                "baseline_implementations_provided": q("Are baseline implementations provided so others can reproduce the reported numbers?"),
+            },
+        ),
+        (
+            "documentation",
+            "Is the benchmark documented for reuse?",
+            {
+                "dataset_documentation_complete": q("Is dataset documentation complete? Data card, source description, collection methodology, preprocessing steps."),
+                "licensing_and_access_clear": q("Is licensing and access clear? Can others actually use this benchmark? Under what terms?"),
+                "intended_use_specified": q("Is the intended use specified? What should and should NOT be concluded from benchmark results?"),
+            },
+        ),
+    )
+}
+
# Type-specific module for survey/review papers.
# Every category shares one shape, so the module is generated from
# (description, {question_name: question_text}) pairs. The order of the
# questions fixes both the "required" list and the "properties" mapping.
SURVEY_MODULE = {
    category: {
        "type": "object",
        "description": summary,
        "required": list(questions),
        "properties": {name: q(text) for name, text in questions.items()},
    }
    for category, (summary, questions) in {
        "search_and_selection": (
            "Is the review process systematic and reproducible?",
            {
                "search_strategy_reproducible": "Is the search strategy reproducible? Could someone re-run the same searches and get the same initial result set?",
                "inclusion_exclusion_explicit": "Are inclusion/exclusion criteria explicit and applied consistently?",
                "prisma_or_structured_protocol": "Does the survey follow PRISMA or another structured review protocol?",
                "search_terms_provided": "Are the actual search terms/queries provided?",
                "databases_listed": "Are the databases/sources searched listed explicitly?",
                "screening_process_documented": "Is the screening process documented with counts at each stage?",
                "review_scope_justified": "Is the review scope justified? Why these years, venues, topics?",
            },
        ),
        "synthesis_quality": (
            "Does the synthesis add value beyond listing papers?",
            {
                "conflicting_findings_acknowledged": "Are conflicting findings across reviewed papers acknowledged and discussed?",
                "quality_assessment_of_sources": "Does the survey assess the quality of its source papers? A quality rubric, risk-of-bias assessment, or structured evaluation?",
                "publication_bias_discussed": "Is publication bias discussed? Does the survey acknowledge that published papers skew positive?",
                "quantitative_synthesis_present": "Is there quantitative synthesis (meta-analysis, vote counting, effect size aggregation) or just narrative?",
                "recommendations_supported_by_evidence": "Are recommendations supported by the reviewed evidence, not just author opinion?",
            },
        ),
    }.items()
}
+
# Type-specific module for position papers.
# Categories are generated from (description, {question_name: question_text})
# pairs; question order fixes both "required" and "properties".
POSITION_MODULE = {
    category: {
        "type": "object",
        "description": summary,
        "required": list(questions),
        "properties": {name: q(text) for name, text in questions.items()},
    }
    for category, (summary, questions) in {
        "argument_quality": (
            "Is the argument well-constructed?",
            {
                "argument_internally_consistent": "Is the argument internally consistent? Do the conclusions follow from the premises? Are there contradictions?",
                "counterarguments_addressed": "Are counterarguments addressed — the strongest version, not a strawman? Does the paper engage with the best opposing view?",
                "analogies_appropriate": "Are analogies appropriate? If the paper draws parallels (e.g., 'AI is like electricity'), are the parallels valid or are they false equivalences?",
                "prescriptions_proportional": "Are prescriptive claims proportional to the argument? Sweeping policy recommendations require stronger support than narrow suggestions.",
                "evidence_for_claims_cited": "Is evidence cited for factual claims? Assertions presented as fact should reference sources.",
                "alternatives_discussed": "Are alternatives to the proposed viewpoint/framework discussed? Not just 'our view is right' but 'here are other views and why we prefer ours'.",
                "historical_context_accurate": "Is historical context accurate? If the paper references historical events, technologies, or intellectual traditions, are those references correct?",
            },
        ),
        "clarity_and_scope": (
            "Is the paper's scope and audience clear?",
            {
                "key_terms_defined_precisely": "Are key terms defined precisely in context? Not just used but defined — what does 'agent' or 'alignment' mean in this paper specifically?",
                "engages_with_existing_literature": "Does it engage substantively with existing literature on this position? Not just citing but discussing, comparing, building on.",
                "intended_audience_clear": "Is the intended audience clear? Is this for policymakers, researchers, practitioners, the public?",
                "assumptions_stated": "Are the paper's assumptions stated explicitly? What must the reader accept for the argument to work?",
                "scope_of_applicability_discussed": "Is the scope of applicability discussed? Where does this argument apply and where doesn't it?",
            },
        ),
    }.items()
}
+
# Type-specific module for theoretical papers.
# Categories are generated from (description, {question_name: question_text})
# pairs; question order fixes both "required" and "properties".
THEORETICAL_MODULE = {
    category: {
        "type": "object",
        "description": summary,
        "required": list(questions),
        "properties": {name: q(text) for name, text in questions.items()},
    }
    for category, (summary, questions) in {
        "formal_quality": (
            "Is the formal work rigorous?",
            {
                "assumptions_stated_explicitly": "Are all assumptions stated explicitly? Hidden assumptions in proofs or models = NO.",
                "proofs_complete_or_sketched": "Are proofs complete, or clearly sketched with references to full versions? Proof 'left to reader' without sketch = NO.",
                "bounds_tight_or_discussed": "Are bounds tight, or is tightness discussed? If a bound is loose, is that acknowledged?",
                "counterexamples_explored": "Are counterexamples or edge cases explored? Does the paper test the limits of its own results?",
                "notation_consistent": "Is notation consistent throughout? Overloaded symbols or inconsistent conventions = NO.",
                "constructive_vs_existence_noted": "Is it noted whether results are constructive or existence-only? Can you compute the thing proved to exist?",
            },
        ),
        "connections": (
            "Is the theoretical work connected to practice and prior work?",
            {
                "connection_to_practice_discussed": "Is the connection to practice discussed? What does this theorem mean for practitioners?",
                "relationship_to_prior_work_clear": "Is the relationship to prior theoretical work clear? What does this extend, generalize, or contradict?",
                "computational_complexity_discussed": "Is computational complexity discussed where relevant? Is the algorithm tractable?",
                "limitations_of_formal_model_stated": "Are limitations of the formal model stated? What does the model NOT capture about reality?",
            },
        ),
    }.items()
}
+
+# ── Full Schema Assembly ──────────────────────────────────────────────
+
# Complete JSON Schema (draft 2020-12) for one v4 scan result: the shared core
# checklist, one type-specific checklist module, and the free-form result
# fields (claims, tags, findings, red flags, citations, engagement, HN data).
schema = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "$id": "scan-v4.schema.json",
    "title": "V4 Paper Scan Result",
    "description": "Type-routed scan instrument. Shared core (15q) + type-specific module. Two-field boolean design: applies + answer.",
    "type": "object",
    "required": ["scan_version", "paper_type", "paper", "checklist", "type_checklist", "claims", "methodology_tags", "key_findings", "red_flags", "cited_papers", "engagement_factors", "hn_data"],
    "properties": {
        "scan_version": {"type": "integer", "const": 4},
        "paper_type": {"type": "string", "enum": ["empirical", "benchmark-creation", "survey", "position", "theoretical"]},
        "paper": {
            "type": "object",
            "required": ["title", "authors", "year"],
            "properties": {
                "title": {"type": "string"},
                "authors": {"type": "array", "items": {"type": "string"}},
                "year": {"type": "integer"},
                "venue": {"type": "string"},
                "arxiv_id": {"type": "string"},
                "doi": {"type": "string"},
            },
        },
        "checklist": {
            "type": "object",
            "description": "Shared core checklist — applies to ALL paper types.",
            "required": list(SHARED_CORE.keys()),
            "properties": SHARED_CORE,
        },
        "type_checklist": {
            "type": "object",
            "description": "Type-specific checklist module. Exactly one key matching paper_type.",
            # Enforce the "exactly one key" contract stated in the description:
            # a single property, and only one of the known module names.
            "minProperties": 1,
            "maxProperties": 1,
            "additionalProperties": False,
            "properties": {
                "empirical": {"type": "object", "properties": EMPIRICAL_MODULE},
                "benchmark-creation": {"type": "object", "properties": BENCHMARK_MODULE},
                "survey": {"type": "object", "properties": SURVEY_MODULE},
                "position": {"type": "object", "properties": POSITION_MODULE},
                "theoretical": {"type": "object", "properties": THEORETICAL_MODULE},
            },
        },
        "claims": {
            "type": "array",
            "items": {
                "type": "object",
                "required": ["claim", "evidence", "supported"],
                "properties": {
                    "claim": {"type": "string"},
                    "evidence": {"type": "string"},
                    "supported": {"type": "string", "enum": ["strong", "moderate", "weak", "unsupported"]},
                },
            },
        },
        "methodology_tags": {
            "type": "array",
            "items": {"type": "string", "enum": ["rct", "observational", "benchmark-eval", "case-study", "meta-analysis", "theoretical", "qualitative"]},
        },
        "key_findings": {"type": "string"},
        "red_flags": {
            "type": "array",
            "items": {
                "type": "object",
                "required": ["flag", "detail"],
                "properties": {"flag": {"type": "string"}, "detail": {"type": "string"}},
            },
        },
        "cited_papers": {
            "type": "array",
            "items": {
                "type": "object",
                "required": ["title", "relevance"],
                "properties": {
                    "title": {"type": "string"},
                    "authors": {"type": "array", "items": {"type": "string"}},
                    "year": {"type": "integer"},
                    "arxiv_id": {"type": "string"},
                    "doi": {"type": "string"},
                    "relevance": {"type": "string"},
                },
            },
        },
        "engagement_factors": {
            "type": "object",
            "required": ["practical_relevance", "surprise_contrarian", "fear_safety", "drama_conflict", "demo_ability", "brand_recognition"],
            "properties": {
                "practical_relevance": ENGAGEMENT_FACTOR,
                "surprise_contrarian": ENGAGEMENT_FACTOR,
                "fear_safety": ENGAGEMENT_FACTOR,
                "drama_conflict": ENGAGEMENT_FACTOR,
                "demo_ability": ENGAGEMENT_FACTOR,
                "brand_recognition": ENGAGEMENT_FACTOR,
            },
        },
        "hn_data": {
            "type": "object",
            "properties": {
                "threads": {"type": "array"},
                "top_points": {"type": "integer"},
                "total_points": {"type": "integer"},
                "total_comments": {"type": "integer"},
            },
        },
    },
    "$defs": {
        "checklist_item": CHECKLIST_ITEM,
    },
}

if __name__ == "__main__":
    # Run as a script: print the assembled schema as JSON on stdout.
    print(json.dumps(schema, indent=2, ensure_ascii=False))
diff --git a/scripts/run-scan-v4-haiku.py b/scripts/run-scan-v4-haiku.py
@@ -0,0 +1,515 @@
+#!/usr/bin/env python3
+"""
+V4 Haiku scan: fast coverage pass for all papers.
+
+For each paper with paper.txt and paper_type.json:
+1. Read paper text + paper_type
+ 2. Run Haiku (Sonnet for papers over 50K characters, or as a retry when Haiku exits non-zero) to answer shared core + type-specific questions
+3. If existing v2/v3 Opus scan exists, merge: Opus answers override Haiku
+4. Write scan-v4.json (separate from scan.json to preserve v2/v3 data)
+
+Usage:
+ python3 scripts/run-scan-v4-haiku.py # All unscanned
+ python3 scripts/run-scan-v4-haiku.py --limit 10 # First N
+ python3 scripts/run-scan-v4-haiku.py --parallel 8 # Concurrent (Haiku is fast)
+ python3 scripts/run-scan-v4-haiku.py --id metr-rct-2025 # Specific paper
+ python3 scripts/run-scan-v4-haiku.py --force # Re-scan all
+"""
+
+import json
+import subprocess
+import sys
+import urllib.parse
+import urllib.request
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+
# Repository paths, resolved relative to this script's parent directory.
ROOT = Path(__file__).resolve().parent.parent
PAPERS_DIR = ROOT / "papers"
SCHEMA_PATH = ROOT / "schema" / "scan-v4.schema.json"

# Load schema for question descriptions.
# The question text contains non-ASCII punctuation, so decode explicitly as
# UTF-8 instead of relying on the platform's locale default.
with open(SCHEMA_PATH, encoding="utf-8") as f:
    SCHEMA = json.load(f)
+
+# ── Build prompt from schema ──────────────────────────────────────────
+
def build_questions_text(category_obj):
    """Render the questions of one schema category as a bullet list.

    Args:
        category_obj: A category sub-schema. Its "properties" mapping goes
            from question name to that question's schema definition.

    Returns:
        One " - **name**: description" line per question, joined with
        newlines, or "" when the category has no properties.
    """
    props = category_obj.get("properties", {})
    # Fall back to the question name whenever the description is missing or
    # empty, so the prompt never contains a bullet with nothing after the colon.
    return "\n".join(
        f" - **{qname}**: {qdef.get('description') or qname}"
        for qname, qdef in props.items()
    )
+
+
def build_prompt(paper_type, paper_text, paper_id, registry_entry, hn_data):
    """Assemble the full v4 scan prompt for one paper.

    Args:
        paper_type: Key selecting the type-specific module in the schema's
            "type_checklist"; an unknown key yields an empty module section.
        paper_text: Paper text, appended verbatim at the end of the prompt.
        paper_id: Accepted for the caller's convenience; not used here.
        registry_entry: Registry record embedded as JSON ("{}" when falsy).
        hn_data: Hacker News data embedded as JSON in the output template.

    Returns:
        The prompt string.
    """
    schema_props = SCHEMA["properties"]
    core_cats = schema_props["checklist"]["properties"]
    type_cats = schema_props["type_checklist"]["properties"].get(paper_type, {}).get("properties", {})

    def render(categories):
        # One "### name / description / question bullets" chunk per category.
        return "".join(
            f"\n### {name}\n{definition.get('description', name)}\n{build_questions_text(definition)}\n"
            for name, definition in categories.items()
        )

    core_section = render(core_cats)
    type_section = render(type_cats)

    if registry_entry:
        reg_json = json.dumps(registry_entry, indent=2, ensure_ascii=False)
    else:
        reg_json = "{}"
    module_title = paper_type.title()
    hn_json = json.dumps(hn_data)

    return f"""You are scanning a research paper for methodological quality. This is a {paper_type} paper.

Answer every question with a JSON object containing:
- "applies": true/false (is this criterion relevant to this paper?)
- "answer": true/false (does the paper satisfy it? false when applies=false)
- "justification": "1-2 sentences citing specific evidence"
- "source": "haiku"

Be strict. Absence of evidence = answer: false. Do not be generous.

## Registry Entry
```json
{reg_json}
```

## Shared Core Questions (answer ALL of these)
{core_section}

## {module_title} Module Questions (answer ALL of these)
{type_section}

## Additional Required Fields

### Claims
Extract 3-8 key empirical claims. For each: {{"claim": "...", "evidence": "...", "supported": "strong|moderate|weak|unsupported"}}

### Key Findings
2-4 sentence summary of the paper's most important findings.

### Red Flags
List methodological concerns: {{"flag": "short label", "detail": "explanation"}}

### Methodology Tags
Assign from: rct, observational, benchmark-eval, case-study, meta-analysis, theoretical, qualitative

### Cited Papers
Extract 3-10 survey-relevant references: {{"title": "...", "relevance": "..."}}

### Engagement Factors
Rate 0-3 on each dimension:
- practical_relevance: Can practitioners use this?
- surprise_contrarian: Challenges conventional wisdom?
- fear_safety: Raises AI risk concerns?
- drama_conflict: Controversy angle?
- demo_ability: Can someone try it now?
- brand_recognition: Famous lab or product?
Each: {{"score": 0-3, "justification": "1 sentence"}}

## Output

Respond with a single JSON object:
{{
"scan_version": 4,
"paper_type": "{paper_type}",
"paper": {{"title": "...", "authors": [...], "year": ..., "venue": "...", "arxiv_id": "...", "doi": "..."}},
"checklist": {{<shared core categories with questions>}},
"type_checklist": {{"{paper_type}": {{<type-specific categories with questions>}}}},
"claims": [...],
"methodology_tags": [...],
"key_findings": "...",
"red_flags": [...],
"cited_papers": [...],
"engagement_factors": {{...}},
"hn_data": {hn_json}
}}

## Paper Text
{paper_text}
"""
+
+
+# ── V2/V3 → V4 Merge Logic ───────────────────────────────────────────
+
+# Map v2/v3 category.question → v4 location
# Core questions keep their category and name between v2/v3 and v4: each
# "category.question" key maps to ("checklist", category, question).
V2_TO_V4_CORE = {
    f"{category}.{question}": ("checklist", category, question)
    for category, question_names in (
        (
            "claims_and_evidence",
            (
                "abstract_claims_supported",
                "causal_claims_justified",
                "generalization_bounded",
                "alternative_explanations_discussed",
                "proxy_outcome_distinction",
            ),
        ),
        (
            "limitations_and_scope",
            (
                "limitations_section_present",
                "threats_to_validity_specific",
                "scope_boundaries_stated",
            ),
        ),
        (
            "conflicts_of_interest",
            (
                "funding_disclosed",
                "affiliations_disclosed",
                "funder_independent_of_outcome",
                "financial_interests_declared",
            ),
        ),
    )
    for question in question_names
}
+
+# v2/v3 empirical questions → v4 type_checklist.empirical
# Each v2/v3 "category.question" maps to the same category and question name
# under type_checklist.empirical in v4.
V2_TO_V4_EMPIRICAL = {}

# Get question names from v2 schema. Parse it once, before the loop, instead
# of reopening the file for every category.
v2_schema_path = ROOT / "schema" / "scan.schema.json"
with open(v2_schema_path, encoding="utf-8") as f:
    v2_schema = json.load(f)
_v2_categories = v2_schema["properties"]["checklist"]["properties"]

for cat in (
    "artifacts", "statistical_methodology", "evaluation_design", "setup_transparency",
    "data_integrity", "contamination", "human_studies", "cost_and_practicality",
    # Also map conditional modules
    "experimental_rigor", "data_leakage", "survey_methodology",
):
    # Categories absent from the v2 schema simply contribute no entries.
    for qname in _v2_categories.get(cat, {}).get("properties", {}):
        V2_TO_V4_EMPIRICAL[f"{cat}.{qname}"] = ("type_checklist", "empirical", cat, qname)
+
+
def _overlay_opus_answer(v4_scan, v2_checklist, v2_key, path):
    """Overlay one Opus answer onto v4_scan at the location given by path.

    Args:
        v4_scan: Scan dict being updated in place.
        v2_checklist: The "checklist" mapping of the v2/v3 scan.
        v2_key: "category.question" key locating the Opus answer.
        path: Tuple of dict keys in v4_scan; the last element is the question.

    Returns:
        (overridden, agreement). overridden is False when there is no usable
        Opus answer. agreement is True/False when a prior v4 answer existed
        and could be compared, otherwise None.
    """
    v2_cat, v2_qname = v2_key.split(".", 1)
    v2_category = v2_checklist.get(v2_cat)
    v2_answer = v2_category.get(v2_qname) if isinstance(v2_category, dict) else None
    # Skip answers that are missing or lack either boolean field, rather than
    # failing the whole paper on a KeyError.
    if not isinstance(v2_answer, dict) or "applies" not in v2_answer or "answer" not in v2_answer:
        return False, None

    # Walk to the category dict, creating (or replacing non-dict) nodes on the way.
    *parents, qname = path
    node = v4_scan
    for key in parents:
        child = node.get(key)
        if not isinstance(child, dict):
            child = {}
            node[key] = child
        node = child

    agreement = None
    existing = node.get(qname)
    if existing and isinstance(existing, dict):
        agreement = (
            existing.get("applies") == v2_answer["applies"]
            and existing.get("answer") == v2_answer["answer"]
        )

    node[qname] = {
        "applies": v2_answer["applies"],
        "answer": v2_answer["answer"],
        "justification": v2_answer.get("justification", ""),
        "source": "opus",
    }
    return True, agreement


def merge_opus_answers(v4_scan, v2_scan, paper_type):
    """Overlay Opus v2/v3 answers onto a v4 scan; Opus always wins.

    Core questions are merged for every paper type; the empirical module is
    merged only when paper_type == "empirical". Engagement factors are copied
    from the older scan when its scan_version is at least 3.

    Args:
        v4_scan: The v4 scan dict; it is modified in place.
        v2_scan: The existing v2/v3 scan dict.
        paper_type: Paper type of the scan being merged.

    Returns:
        (v4_scan, stats) where stats has "agreements", "disagreements" and
        "opus_overrides" counts. Agreement is only counted for questions the
        v4 scan had already answered.
    """
    stats = {"agreements": 0, "disagreements": 0, "opus_overrides": 0}

    v2_checklist = v2_scan.get("checklist", {})
    if not isinstance(v2_checklist, dict):
        v2_checklist = {}

    mappings = [V2_TO_V4_CORE]
    if paper_type == "empirical":
        mappings.append(V2_TO_V4_EMPIRICAL)

    for mapping in mappings:
        for v2_key, path in mapping.items():
            overridden, agreement = _overlay_opus_answer(v4_scan, v2_checklist, v2_key, path)
            if not overridden:
                continue
            stats["opus_overrides"] += 1
            if agreement is True:
                stats["agreements"] += 1
            elif agreement is False:
                stats["disagreements"] += 1

    # Also merge engagement factors from v3 if Opus-generated
    v2_ef = v2_scan.get("engagement_factors")
    if v2_ef and v2_scan.get("scan_version", 1) >= 3:
        v4_scan["engagement_factors"] = v2_ef

    return v4_scan, stats
+
+
+# ── HN Fetch ──────────────────────────────────────────────────────────
+
def fetch_hn(paper_id, arxiv_id=""):
    """Fetch Hacker News data for a paper, compatible with the hn_data schema.

    A cached papers/<paper_id>/hn.json is returned as-is when present.
    Otherwise the HN Algolia search API is queried for stories matching
    arxiv_id.

    Args:
        paper_id: Paper directory name under PAPERS_DIR.
        arxiv_id: Search query; when empty, no request is made.

    Returns:
        Dict with "threads" (sorted by points, highest first), "top_points",
        "total_points" and "total_comments". All-zero/empty when there is no
        arxiv_id or the lookup fails.
    """
    hn_path = PAPERS_DIR / paper_id / "hn.json"
    if hn_path.exists():
        with open(hn_path, encoding="utf-8") as f:
            return json.load(f)

    empty = {"threads": [], "top_points": 0, "total_points": 0, "total_comments": 0}
    if not arxiv_id:
        return empty

    try:
        params = urllib.parse.urlencode({"query": arxiv_id, "tags": "story", "hitsPerPage": 10})
        req = urllib.request.Request(f"https://hn.algolia.com/api/v1/search?{params}",
                                     headers={"User-Agent": "research-survey/1.0"})
        # Context manager closes the HTTP response even if parsing fails.
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read())
        threads = [
            {
                "hn_id": h.get("objectID", ""),
                "title": h.get("title", ""),
                # The API can return null for counts; normalise to 0.
                "points": h.get("points", 0) or 0,
                "comments": h.get("num_comments", 0) or 0,
                "url": f"https://news.ycombinator.com/item?id={h.get('objectID', '')}",
            }
            for h in data.get("hits", [])
        ]
        threads.sort(key=lambda t: t["points"], reverse=True)
        return {
            "threads": threads,
            "top_points": threads[0]["points"] if threads else 0,
            "total_points": sum(t["points"] for t in threads),
            "total_comments": sum(t["comments"] for t in threads),
        }
    except Exception:
        # Deliberately best-effort: HN data is optional enrichment, so any
        # network or parsing failure degrades to "no threads".
        return empty
+
+
+# ── Scan One Paper ────────────────────────────────────────────────────
+
def load_registry():
    """Load registry.jsonl from the repository root.

    Returns:
        Dict mapping each entry's "id" to the parsed entry. Blank lines are
        skipped; a later entry with the same id replaces an earlier one.

    Raises:
        KeyError: If an entry has no "id" field.
    """
    entries = {}
    # Explicit UTF-8, matching how paper.txt is read in this script.
    with open(ROOT / "registry.jsonl", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                entry = json.loads(line)
                entries[entry["id"]] = entry
    return entries
+
+
def _run_claude(prompt, model):
    """Run the `claude` CLI once for `model`, passing `prompt` on stdin.

    Returns:
        The subprocess.CompletedProcess with captured text output.

    Raises:
        subprocess.TimeoutExpired: If the call exceeds 600 seconds.
    """
    return subprocess.run(
        ["claude", "-p", "-", "--model", model, "--max-turns", "1"],
        input=prompt,
        capture_output=True,
        text=True,
        timeout=600,
        cwd=str(ROOT),
    )


def _stamp_source(v4_scan, model):
    """Set "source" to `model` on every checklist answer in v4_scan.

    The prompt tells the model to emit "source": "haiku" regardless of which
    model runs, so the label is overwritten rather than filled in only when
    missing. Non-dict nodes from malformed output are skipped.
    """
    categories = []
    core = v4_scan.get("checklist")
    if isinstance(core, dict):
        categories.extend(core.values())
    typed = v4_scan.get("type_checklist")
    if isinstance(typed, dict):
        for module in typed.values():
            if isinstance(module, dict):
                categories.extend(module.values())

    for cat_data in categories:
        if not isinstance(cat_data, dict):
            continue
        for qd in cat_data.values():
            if isinstance(qd, dict) and "applies" in qd:
                qd["source"] = model


def scan_one(paper_id, registry, force=False):
    """Run the v4 scan on one paper and write papers/<paper_id>/scan-v4.json.

    Args:
        paper_id: Paper directory name under PAPERS_DIR.
        registry: Mapping of paper id to registry entry.
        force: Re-scan even when scan-v4.json already exists.

    Returns:
        (paper_id, ok, reason, stats). stats is the merge statistics dict
        when an existing scan.json was merged, otherwise {}.
    """
    paper_dir = PAPERS_DIR / paper_id
    v4_path = paper_dir / "scan-v4.json"
    if v4_path.exists() and not force:
        return paper_id, True, "already scanned", {}

    txt_path = paper_dir / "paper.txt"
    type_path = paper_dir / "paper_type.json"

    if not txt_path.exists():
        return paper_id, False, "no paper.txt", {}
    if not type_path.exists():
        return paper_id, False, "no paper_type.json", {}

    # A malformed paper_type.json is reported as a failure for this paper
    # instead of raising out of the function.
    try:
        with open(type_path, encoding="utf-8") as f:
            paper_type = json.load(f).get("paper_type")
    except (ValueError, AttributeError):
        paper_type = None
    if not paper_type:
        return paper_id, False, "invalid paper_type", {}

    paper_text = txt_path.read_text(encoding="utf-8").replace("\x00", "")
    reg_entry = registry.get(paper_id, {})
    arxiv_id = reg_entry.get("arxiv_id", "")

    # Fetch HN data
    hn_data = fetch_hn(paper_id, arxiv_id)

    # Build and run prompt
    prompt = build_prompt(paper_type, paper_text, paper_id, reg_entry, hn_data)

    # Pick model: haiku for most papers, sonnet for large ones
    model = "sonnet" if len(paper_text) > 50000 else "haiku"

    try:
        result = _run_claude(prompt, model)
        if result.returncode != 0:
            if model != "haiku":
                return paper_id, False, f"claude exit {result.returncode}", {}
            # Retry with sonnet if haiku failed
            model = "sonnet"
            result = _run_claude(prompt, model)
            if result.returncode != 0:
                return paper_id, False, f"claude exit {result.returncode} (sonnet fallback)", {}

        # Take the outermost {...} span so surrounding prose or code fences
        # around the JSON are ignored.
        output = result.stdout.strip()
        json_start = output.find("{")
        json_end = output.rfind("}") + 1
        if json_start == -1 or json_end == 0:
            return paper_id, False, "no JSON in output", {}

        v4_scan = json.loads(output[json_start:json_end])

        # Ensure required fields
        v4_scan["scan_version"] = 4
        v4_scan["paper_type"] = paper_type
        v4_scan["hn_data"] = hn_data

        # Mark all answers with the model that produced them (haiku or sonnet)
        _stamp_source(v4_scan, model)

        # Merge Opus answers if v2/v3 scan exists
        merge_stats = {}
        v2_path = paper_dir / "scan.json"
        if v2_path.exists():
            with open(v2_path, encoding="utf-8") as f:
                v2_scan = json.load(f)
            if v2_scan.get("scan_version", 1) >= 2:
                v4_scan, merge_stats = merge_opus_answers(v4_scan, v2_scan, paper_type)

        # Write v4 scan. ensure_ascii=False emits raw non-ASCII characters,
        # so the file encoding must be explicit.
        with open(v4_path, "w", encoding="utf-8") as f:
            json.dump(v4_scan, f, ensure_ascii=False, indent=2)

        opus_n = merge_stats.get("opus_overrides", 0)
        agree = merge_stats.get("agreements", 0)
        disagree = merge_stats.get("disagreements", 0)
        coverage = f"{model}-only" if opus_n == 0 else f"merged({model}+opus={opus_n},agree={agree},disagree={disagree})"
        return paper_id, True, coverage, merge_stats

    except json.JSONDecodeError as e:
        return paper_id, False, f"JSON error: {e}", {}
    except subprocess.TimeoutExpired:
        return paper_id, False, "timeout", {}
    except Exception as e:
        return paper_id, False, f"error: {e}", {}
+
+
+# ── Main ──────────────────────────────────────────────────────────────
+
def main():
    """Command-line entry point: select papers, scan them, print a summary.

    Options: --force, --limit N, --id PAPER_ID, --parallel N. Candidates are
    paper directories holding both paper_type.json and paper.txt. Papers that
    already have scan-v4.json are skipped unless --force or --id is given.
    """
    # Function-scope import: argparse is not among this module's top-level imports.
    import argparse

    parser = argparse.ArgumentParser(description="V4 Haiku scan: fast coverage pass for all papers.")
    parser.add_argument("--force", action="store_true", help="re-scan papers that already have scan-v4.json")
    parser.add_argument("--limit", type=int, default=None, help="scan only the first N candidates")
    parser.add_argument("--id", dest="specific_id", default=None, help="scan only this paper id")
    parser.add_argument("--parallel", type=int, default=1, help="number of concurrent scans")
    opts = parser.parse_args()
    force = opts.force
    limit = opts.limit
    specific_id = opts.specific_id
    parallel = opts.parallel

    registry = load_registry()

    # Collect candidates: papers with paper.txt + paper_type.json
    candidates = []
    for type_path in sorted(PAPERS_DIR.glob("*/paper_type.json")):
        paper_dir = type_path.parent
        pid = paper_dir.name
        if specific_id and pid != specific_id:
            continue
        if (paper_dir / "scan-v4.json").exists() and not force and not specific_id:
            continue
        if not (paper_dir / "paper.txt").exists():
            continue
        candidates.append(pid)

    # "is not None" so that an explicit --limit 0 selects nothing.
    if limit is not None:
        candidates = candidates[:limit]

    if not candidates:
        print("No papers to scan.")
        return

    print(f"V4 Haiku scan: {len(candidates)} papers"
          f"{f' (parallel={parallel})' if parallel > 1 else ''}:\n")

    totals = {"ok": 0, "fail": 0, "agree": 0, "disagree": 0}

    def run(pid):
        # One paper raising must not abort the rest of the batch.
        try:
            return scan_one(pid, registry, force)
        except Exception as exc:
            return pid, False, f"error: {exc}", {}

    def tally(outcome, announce_merged):
        pid, ok, reason, stats = outcome
        if not ok:
            totals["fail"] += 1
            print(f" FAIL: {pid} — {reason}")
            return
        totals["ok"] += 1
        totals["agree"] += stats.get("agreements", 0)
        totals["disagree"] += stats.get("disagreements", 0)
        if announce_merged and "merged" in reason:
            print(f" OK: {pid} — {reason}")

    if parallel > 1:
        with ThreadPoolExecutor(max_workers=parallel) as executor:
            futures = [executor.submit(run, pid) for pid in candidates]
            for future in as_completed(futures):
                tally(future.result(), announce_merged=True)
    else:
        for position, pid in enumerate(candidates, start=1):
            if position % 20 == 0:
                print(f" ... {position}/{len(candidates)}")
            tally(run(pid), announce_merged=False)

    print(f"\nDone. OK: {totals['ok']}, Failed: {totals['fail']}")
    compared = totals["agree"] + totals["disagree"]
    if compared > 0:
        rate = totals["agree"] / compared * 100
        print(f"Haiku-Opus agreement: {totals['agree']}/{compared} ({rate:.1f}%)")


if __name__ == "__main__":
    main()