commit d6d31c6cb0ff5d41b80f2e4eaeb2e992c3702dd8
parent 781cf7f2cc3cdd41d5fea630408c3cd59982712e
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Mon, 23 Mar 2026 13:26:54 +0100
Add v3 scan instrument with engagement factors, catchup script
v3 extends v2 with 6 engagement factor dimensions (0-3 each):
- practical_relevance, surprise_contrarian, fear_safety,
drama_conflict, demo_ability, brand_recognition
Two components:
- scripts/catchup-v3.py: upgrades existing v2 scans to v3 by running
Opus classification on title + key_findings + claims (cheap, no full
paper text needed). Supports --parallel, --limit, --id.
- agents/scan-agent.md: updated to produce v3 directly for new scans.
Tested on 13 papers. Scores align with manual prototype (values listed
in the dimension order above): Codex [3,2,1,1,2,3],
Agents of Chaos [2,2,3,2,1,2], Chain-of-Thought [3,2,0,0,2,2].
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
15 files changed, 866 insertions(+), 74 deletions(-)
diff --git a/agents/scan-agent.md b/agents/scan-agent.md
@@ -1,6 +1,6 @@
# Scan Agent
-**Model: Sonnet** (boolean checklist is factual lookup, not subjective judgment. Opus used for calibration subset only.)
+**Model: Opus**
You are a research paper scan agent. Your job is to read a research paper and answer a structured boolean checklist about its methodological quality.
@@ -169,6 +169,23 @@ Your output must be valid JSON conforming to `schema/scan.schema.json`:
- `cited_papers` must each have at least `title` and `relevance`
- `red_flags` must each have `flag` and `detail`
+### 8. Classify Engagement Factors (v3)
+
+Rate the paper on each of 6 dimensions that predict social and media attention, using a 0-3 scale. These are NOT quality judgments — they measure what makes a paper likely to be discussed on Hacker News, Reddit, or tech newsletters.
+
+Add an `engagement_factors` object to the output with these keys:
+
+- **`practical_relevance`** (0-3): Can a practitioner use this at work? 0=pure theory, 3=immediately usable tool/technique.
+- **`surprise_contrarian`** (0-3): Does this challenge conventional wisdom? 0=confirms expectations, 3=directly contradicts a widely-held belief.
+- **`fear_safety`** (0-3): Does this raise AI risk/security concerns? 0=none, 3=demonstrates a novel attack or existential concern.
+- **`drama_conflict`** (0-3): Is there controversy? 0=none, 3=major "benchmarks are fake" or "company X is lying" angle.
+- **`demo_ability`** (0-3): Can someone try it now? 0=no code/demo, 3=live demo or pip-installable tool.
+- **`brand_recognition`** (0-3): Famous lab or product? 0=unknown, 3=about ChatGPT/Copilot or from OpenAI/Anthropic.
+
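+Each factor is an object with a `score` (integer 0-3) and a `justification` (1 sentence). Scores of 1 and 2 are intermediate points between the 0 and 3 anchors described above.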
+
+Set `scan_version` to `3` in the output.
+
## Guidelines
- Be fair but strict. A false is not an insult; it is information.
diff --git a/papers/2025-ai-agent-2026/scan.json b/papers/2025-ai-agent-2026/scan.json
@@ -16,7 +16,7 @@
"venue": "arXiv",
"arxiv_id": "2602.17753"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"survey_methodology"
],
@@ -479,5 +479,31 @@
"year": 2025,
"relevance": "Provides characterization framework for AI agents used in the Index's agency inclusion criteria."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "Useful as a reference for comparing agent platforms but doesn't provide actionable techniques practitioners can directly apply."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "The finding that 25/30 agents disclose no internal safety results and the 'safety washing' framing challenge the industry's public safety narratives."
+ },
+ "fear_safety": {
+ "score": 2,
+ "justification": "Safety is a major theme with concrete findings about transparency gaps, browser agents operating at high autonomy without evaluations, and foundation model concentration as single points of failure."
+ },
+ "drama_conflict": {
+ "score": 2,
+ "justification": "Directly names specific companies and products as lacking safety transparency, with the 'safety washing' framing creating a clear 'emperor has no clothes' angle."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "The dataset of 1,350 annotations is presumably available but requires effort to explore; no interactive demo or tool to try."
+ },
+ "brand_recognition": {
+ "score": 2,
+ "justification": "Covers products millions use (ChatGPT, Claude, Gemini, Copilot) and names major companies, though the authors themselves are not from a famous lab."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/3dshape2vecset-3d-shape-2023/scan.json b/papers/3dshape2vecset-3d-shape-2023/scan.json
@@ -1,5 +1,5 @@
{
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -520,5 +520,31 @@
"arxiv_id": "2210.06978",
"relevance": "Related diffusion model for 3D point cloud generation."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Provides a usable 3D shape representation and generation pipeline applicable to graphics practitioners working with neural fields and diffusion models."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "Coordinate-free latent set representation is a novel design choice but doesn't challenge any widely-held belief."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk angle in 3D shape representation research."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "Straightforward benchmarking improvements with no controversy or challenges to specific companies."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "SIGGRAPH paper likely has code released but 3D generation pipelines require significant GPU setup and dependencies."
+ },
+ "brand_recognition": {
+ "score": 1,
+ "justification": "TU Munich (Niessner) and KAUST (Wonka) are recognized in graphics but not household names in broader tech circles."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/a2hcoder-llmdriven-coding-2025/scan.json b/papers/a2hcoder-llmdriven-coding-2025/scan.json
@@ -10,7 +10,7 @@
"year": 2025,
"arxiv_id": "2508.10904"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor"
],
@@ -480,5 +480,31 @@
"year": 2023,
"relevance": "Foundational work on generative multi-agent systems."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "FPGA/HLS automation is useful for a narrow hardware engineering audience, but the approach is not generalizable or released as a tool."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The finding that algorithm-level restructuring matters more than direct LLM translation is mildly interesting but not deeply surprising."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk angle is present in this hardware design automation work."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy or conflict; the paper doesn't challenge any specific company's claims or popular beliefs."
+ },
+ "demo_ability": {
+ "score": 0,
+ "justification": "No code, no demo, no reproducibility artifacts released, and the LLM version isn't even specified."
+ },
+ "brand_recognition": {
+ "score": 1,
+ "justification": "Unknown authors and lab; the mention of Claude Code adds slight recognition but the paper itself is from an obscure group."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/aart-aiassisted-redteaming-2023/scan.json b/papers/aart-aiassisted-redteaming-2023/scan.json
@@ -11,7 +11,7 @@
"venue": "arXiv",
"arxiv_id": "2311.08592"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [],
"methodology_tags": [
"case-study",
@@ -421,5 +421,31 @@
"year": 2021,
"relevance": "LLM risk taxonomy foundational to red-teaming and safety evaluation approaches."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Describes a structured red-teaming pipeline that safety teams could adapt for their own LLM application testing workflows."
+ },
+ "surprise_contrarian": {
+ "score": 0,
+ "justification": "Confirms the expected finding that structured generation produces more diverse adversarial prompts than repurposing existing datasets."
+ },
+ "fear_safety": {
+ "score": 1,
+ "justification": "Red-teaming is safety-adjacent but the paper focuses on the generation pipeline rather than demonstrating novel attacks or vulnerabilities."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy or conflict; it's a straightforward tool paper from Google presenting their internal methodology."
+ },
+ "demo_ability": {
+ "score": 0,
+ "justification": "No code release, no demo, and the pipeline depends on Google's PaLM API with no public reproduction path."
+ },
+ "brand_recognition": {
+ "score": 2,
+ "justification": "All authors are from Google Research and the work discusses enabling Google product launches."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/acar-adaptive-complexity-2026/scan.json b/papers/acar-adaptive-complexity-2026/scan.json
@@ -7,7 +7,7 @@
"year": 2026,
"arxiv_id": "2602.21231"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -485,5 +485,31 @@
"year": 2022,
"relevance": "Attribution methodology for ML model contributions, relevant to ACAR's failed attribution proxy experiments."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "The routing concept is potentially useful for multi-model deployments but the modest accuracy gains and specific benchmark setup limit immediate applicability."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The negative results on retrieval augmentation and attribution proxies are mildly surprising but the core finding that simple routing can approximate ensembles is expected."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk angle is present in this work."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy or challenge to specific companies or widely-held beliefs; this is a straightforward systems optimization paper."
+ },
+ "demo_ability": {
+ "score": 0,
+ "justification": "No mention of released code, a demo, or reproducible tooling; the work comes from a single unknown author."
+ },
+ "brand_recognition": {
+ "score": 0,
+ "justification": "Single unknown author with no institutional affiliation mentioned and no venue publication."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/agentic-bug-reproduction-2025/scan.json b/papers/agentic-bug-reproduction-2025/scan.json
@@ -1,5 +1,5 @@
{
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -557,5 +557,31 @@
"year": 2023,
"relevance": "LLM-as-a-Judge methodology used in this paper for sampling plausible BRTs for RQ2."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Demonstrates a concrete technique (agentic bug reproduction tests for APR) that practitioners could adapt, though the specific tooling is Google-internal."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The finding that auto-generated bug reproduction tests improve APR is intuitive rather than surprising, though the 30% improvement magnitude is notable."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk angle in automated program repair research."
+ },
+ "drama_conflict": {
+ "score": 1,
+ "justification": "The red flags around Google employees evaluating their own non-reproducible system invite mild skepticism but the paper doesn't challenge external claims."
+ },
+ "demo_ability": {
+ "score": 0,
+ "justification": "Entirely built on Google-internal infrastructure, proprietary models, and private codebase with no public code or demo."
+ },
+ "brand_recognition": {
+ "score": 2,
+ "justification": "Google authorship and 'at Google' in the title draws attention, plus it involves Gemini fine-tuning on production-scale codebases."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/agentic-refactoring-empirical-2025/scan.json b/papers/agentic-refactoring-empirical-2025/scan.json
@@ -13,7 +13,7 @@
"venue": "arXiv preprint",
"arxiv_id": "2511.04824"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [],
"methodology_tags": [
"observational"
@@ -466,5 +466,31 @@
"year": 2024,
"relevance": "Empirical study of ChatGPT refactoring showing inconsistency and unnecessary edits."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "Findings about agent refactoring patterns are informative but don't provide actionable techniques developers can directly apply to their workflows."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "The finding that AI agents mostly do trivial renaming rather than meaningful architectural refactoring, and that smell counts don't actually improve, challenges the narrative that AI coding tools meaningfully improve code quality."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk angle in this empirical software engineering study."
+ },
+ "drama_conflict": {
+ "score": 1,
+ "justification": "Mildly deflating for AI coding tool hype by showing agents do shallow refactoring with negligible quality impact, but doesn't directly challenge any company's claims."
+ },
+ "demo_ability": {
+ "score": 0,
+ "justification": "Mining study with no tool, demo, or code artifact that others can try."
+ },
+ "brand_recognition": {
+ "score": 1,
+ "justification": "Involves OpenAI Codex prominently but authors are from less widely-known academic institutions, and the paper is an arXiv preprint not from a major lab."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/agents-of-chaos-2026/scan.json b/papers/agents-of-chaos-2026/scan.json
@@ -45,7 +45,7 @@
"venue": "arXiv",
"arxiv_id": "2602.20021"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [],
"checklist": {
"artifacts": {
@@ -408,103 +408,187 @@
"cited_papers": [
{
"title": "HAICosystem: An ecosystem for sandboxing safety risks in human-AI interactions",
- "authors": ["Xuhui Zhou", "Hyunwoo Kim", "Faeze Brahman"],
+ "authors": [
+ "Xuhui Zhou",
+ "Hyunwoo Kim",
+ "Faeze Brahman"
+ ],
"year": 2025,
"arxiv_id": "2409.16427",
"relevance": "Multi-turn safety evaluation framework for agentic AI systems covering operational, content, societal, and legal risks — a key benchmark for agent safety."
},
{
"title": "OpenAgentSafety: A comprehensive framework for evaluating real-world AI agent safety",
- "authors": ["Sanidhya Vijayvargiya", "Aditya Bharat Soni", "Xuhui Zhou"],
+ "authors": [
+ "Sanidhya Vijayvargiya",
+ "Aditya Bharat Soni",
+ "Xuhui Zhou"
+ ],
"year": 2026,
"arxiv_id": "2507.06134",
"relevance": "Runs agents in containerized sandboxes with real tools across 350+ multi-turn tasks for safety evaluation, combining rule-based and LLM-as-judge approaches."
},
{
"title": "AgentHarm: A benchmark for measuring harmfulness of LLM agents",
- "authors": ["Maksym Andriushchenko", "Alexandra Souly", "Mateusz Dziemian"],
+ "authors": [
+ "Maksym Andriushchenko",
+ "Alexandra Souly",
+ "Mateusz Dziemian"
+ ],
"year": 2025,
"arxiv_id": "2410.09024",
"relevance": "Benchmarks malicious multi-step agent tasks across harm categories, measuring both refusal behavior and robustness to jailbreak attacks."
},
{
"title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
- "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
+ "authors": [
+ "Evan Hubinger",
+ "Carson Denison",
+ "Jesse Mu"
+ ],
"year": 2024,
"arxiv_id": "2401.05566",
"relevance": "Demonstrates that deceptive behaviors can persist through safety training, directly relevant to the persistence of injected instructions in Case Study #10."
},
{
"title": "Agentic misalignment: How LLMs could be insider threats",
- "authors": ["Aengus Lynch", "Benjamin Wright", "Caleb Larson"],
+ "authors": [
+ "Aengus Lynch",
+ "Benjamin Wright",
+ "Caleb Larson"
+ ],
"year": 2025,
"arxiv_id": "2510.05179",
"relevance": "Reports insider-style harmful actions by models with access to sensitive information under goal conflict — directly relevant to agent safety and autonomy failures."
},
{
"title": "Frontier models are capable of in-context scheming",
- "authors": ["Alexander Meinke", "Bronson Schoen", "Jérémy Scheurer"],
+ "authors": [
+ "Alexander Meinke",
+ "Bronson Schoen",
+ "Jérémy Scheurer"
+ ],
"year": 2025,
"arxiv_id": "2412.04984",
"relevance": "Provides evidence that LLMs can engage in goal-directed, multi-step scheming behaviors using in-context reasoning alone."
},
{
"title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
- "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra"],
+ "authors": [
+ "Kai Greshake",
+ "Sahar Abdelnabi",
+ "Shailesh Mishra"
+ ],
"year": 2023,
"arxiv_id": "2302.12173",
"relevance": "Foundational work on indirect prompt injection in LLM-integrated applications, directly instantiated in this paper's Case Studies #8 and #10."
},
{
"title": "Agent Skills enable a new class of realistic and trivially simple prompt injections",
- "authors": ["David Schmotz", "Sahar Abdelnabi", "Maksym Andriushchenko"],
+ "authors": [
+ "David Schmotz",
+ "Sahar Abdelnabi",
+ "Maksym Andriushchenko"
+ ],
"year": 2025,
"arxiv_id": "2510.26328",
"relevance": "Shows that markdown skill files loaded into agent context enable realistic prompt injections including data exfiltration — matches the constitution attack vector in Case Study #10."
},
{
"title": "Why do multi-agent LLM systems fail?",
- "authors": ["Mert Cemri", "Melissa Z Pan", "Shuyi Yang"],
+ "authors": [
+ "Mert Cemri",
+ "Melissa Z Pan",
+ "Shuyi Yang"
+ ],
"year": 2025,
"relevance": "Finds circular exchanges and token-consuming spirals across seven multi-agent frameworks, complementing this paper's Case Study #4 on agent looping."
},
{
"title": "Generative agents: Interactive simulacra of human behavior",
- "authors": ["Joon Sung Park", "Joseph C. O'Brien", "Carrie J. Cai"],
+ "authors": [
+ "Joon Sung Park",
+ "Joseph C. O'Brien",
+ "Carrie J. Cai"
+ ],
"year": 2023,
"arxiv_id": "2304.03442",
"relevance": "Demonstrates emergent goal-directed behavior in multi-agent settings, suggesting misalignment need not be deliberate to be consequential."
},
{
"title": "Breaking agents: Compromising autonomous LLM agents through malfunction amplification",
- "authors": ["Boyang Zhang", "Yicong Tan", "Yun Shen"],
+ "authors": [
+ "Boyang Zhang",
+ "Yicong Tan",
+ "Yun Shen"
+ ],
"year": 2025,
"relevance": "Shows that prompt injection can induce infinite action loops in agents with over 80% success, directly relevant to looping and resource waste findings."
},
{
"title": "Governing AI agents",
- "authors": ["Noam Kolt"],
+ "authors": [
+ "Noam Kolt"
+ ],
"year": 2025,
"relevance": "Legal framework for AI agent governance identifying information asymmetry, discretionary authority, and absence of loyalty mechanisms — directly instantiated by this paper's case studies."
},
{
"title": "The landscape of emerging AI agent architectures for reasoning, planning, and tool calling: A survey",
- "authors": ["Tula Masterman", "Sandi Besen", "Mason Sawtell"],
+ "authors": [
+ "Tula Masterman",
+ "Sandi Besen",
+ "Mason Sawtell"
+ ],
"year": 2024,
"relevance": "Survey of agent architecture patterns relevant to understanding the scaffolding vulnerabilities documented in this red-teaming study."
},
{
"title": "Auditing language models for hidden objectives",
- "authors": ["Samuel Marks", "Johannes Treutlein", "Trenton Bricken"],
+ "authors": [
+ "Samuel Marks",
+ "Johannes Treutlein",
+ "Trenton Bricken"
+ ],
"year": 2025,
"arxiv_id": "2503.10965",
"relevance": "Introduces a testbed for detecting hidden objectives in language models through blind auditing, relevant to alignment auditing of agent systems."
},
{
"title": "Practices for governing agentic AI systems",
- "authors": ["Yonadav Shavit", "Sandhini Agarwal", "Miles Brundage"],
+ "authors": [
+ "Yonadav Shavit",
+ "Sandhini Agarwal",
+ "Miles Brundage"
+ ],
"year": 2023,
"relevance": "Enumerates seven operational practices for safe agent deployment including constrained action spaces, human approval, logging, and interruptibility — several of which this paper's agents demonstrably lack."
}
- ]
-}
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "The 11 vulnerability categories and three structural deficits provide actionable security checklists for anyone deploying autonomous agents in multi-user environments."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "The finding that social attack surfaces (display name spoofing, emotional manipulation) pose greater threats than technical jailbreaks challenges the focus on prompt injection as the primary agent risk."
+ },
+ "fear_safety": {
+ "score": 3,
+ "justification": "Concrete demonstrations of full system takeover via identity spoofing, unauthorized data exfiltration of 124 email records, and persistent behavioral control through memory injection are viscerally alarming for anyone considering agent deployment."
+ },
+ "drama_conflict": {
+ "score": 2,
+ "justification": "Directly demonstrates that Claude Opus 4.6 and Kimi K2.5 agents fail basic security checks in deployed settings, with Kimi additionally censoring politically sensitive topics — naming specific products and their failures."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "The OpenClaw framework exists but replicating the multi-agent Discord deployment with 20 participants over two weeks requires significant infrastructure and coordination."
+ },
+ "brand_recognition": {
+ "score": 2,
+ "justification": "Tests Claude Opus 4.6 (Anthropic's flagship) and Kimi K2.5 (Moonshot AI), both recognizable names, though the authors themselves and the OpenClaw framework are less well-known."
+ }
+ }
+}
+\ No newline at end of file
diff --git a/papers/ai-ides-vs-agents-impact-2026/scan.json b/papers/ai-ides-vs-agents-impact-2026/scan.json
@@ -1,5 +1,5 @@
{
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [],
"paper": {
"title": "AI IDEs or Autonomous Agents? Measuring the Impact of Coding Agents on Software Development",
@@ -459,5 +459,31 @@
"year": 2021,
"relevance": "Methodological foundation — the imputation-based DiD estimator used as the primary causal inference method."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Directly informs engineering managers' decisions about adopting coding agents vs AI IDEs, with quantified trade-offs on velocity and code quality."
+ },
+ "surprise_contrarian": {
+ "score": 3,
+ "justification": "Directly contradicts the hype that AI coding agents universally boost productivity — shows gains vanish if you already use an AI IDE, while quality degradation persists regardless."
+ },
+ "fear_safety": {
+ "score": 1,
+ "justification": "Raises concerns about agent-induced technical debt (+18% warnings, +39% complexity) but this is a maintainability risk rather than a safety or security vulnerability."
+ },
+ "drama_conflict": {
+ "score": 2,
+ "justification": "Challenges the narrative pushed by AI coding tool vendors that agents deliver compounding productivity gains, revealing a speed-maintainability trade-off and diminishing returns."
+ },
+ "demo_ability": {
+ "score": 0,
+ "justification": "Observational study with no tool, demo, or code artifact that practitioners can try themselves."
+ },
+ "brand_recognition": {
+ "score": 1,
+ "justification": "CMU authors are well-recognized in software engineering research but not household names; the topic (coding agents like Copilot/Cursor) is famous but the paper itself doesn't come from those companies."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/chain-of-thought-prompting-2022/scan.json b/papers/chain-of-thought-prompting-2022/scan.json
@@ -16,8 +16,11 @@
"venue": "NeurIPS 2022",
"arxiv_id": "2201.11903"
},
- "scan_version": 2,
- "active_modules": ["experimental_rigor", "data_leakage"],
+ "scan_version": 3,
+ "active_modules": [
+ "experimental_rigor",
+ "data_leakage"
+ ],
"checklist": {
"artifacts": {
"code_released": {
@@ -398,7 +401,9 @@
"supported": "moderate"
}
],
- "methodology_tags": ["benchmark-eval"],
+ "methodology_tags": [
+ "benchmark-eval"
+ ],
"key_findings": "Chain-of-thought prompting—providing intermediate reasoning steps in few-shot exemplars—dramatically improves large language model performance on arithmetic, commonsense, and symbolic reasoning tasks, but only at model scales of ~100B+ parameters. PaLM 540B with CoT achieved state-of-the-art on GSM8K (56.9%) surpassing finetuned approaches. Ablation studies show the benefit comes from the semantic content of reasoning steps, not merely from additional computation or knowledge activation. The approach is robust across different annotators, exemplar sets, and model families.",
"red_flags": [
{
@@ -421,83 +426,157 @@
"cited_papers": [
{
"title": "Language Models are Few-Shot Learners",
- "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
+ "authors": [
+ "Tom Brown",
+ "Benjamin Mann",
+ "Nick Ryder"
+ ],
"year": 2020,
"relevance": "Foundational work on few-shot prompting with GPT-3, the baseline approach that CoT prompting extends."
},
{
"title": "Training Verifiers to Solve Math Word Problems",
- "authors": ["Karl Cobbe", "Vineet Kosaraju", "Mohammad Bavarian"],
+ "authors": [
+ "Karl Cobbe",
+ "Vineet Kosaraju",
+ "Mohammad Bavarian"
+ ],
"year": 2021,
"arxiv_id": "2110.14168",
"relevance": "Introduced GSM8K benchmark and the finetuned GPT-3 verifier approach that CoT prompting surpasses."
},
{
"title": "Evaluating Large Language Models Trained on Code",
- "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
+ "authors": [
+ "Mark Chen",
+ "Jerry Tworek",
+ "Heewoo Jun"
+ ],
"year": 2021,
"arxiv_id": "2107.03374",
"relevance": "Introduces Codex and code evaluation methodology; one of the five model families evaluated with CoT prompting."
},
{
"title": "Emergent Abilities of Large Language Models",
- "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani"],
+ "authors": [
+ "Jason Wei",
+ "Yi Tay",
+ "Rishi Bommasani"
+ ],
"year": 2022,
"relevance": "Provides theoretical framing for the emergence of CoT reasoning at scale, directly tied to this paper's core finding."
},
{
"title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models",
- "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"],
+ "authors": [
+ "Xuezhi Wang",
+ "Jason Wei",
+ "Dale Schuurmans"
+ ],
"year": 2022,
"arxiv_id": "2203.11171",
"relevance": "Follow-up work showing majority voting over sampled CoT generations further improves performance."
},
{
"title": "Show Your Work: Scratchpads for Intermediate Computation with Language Models",
- "authors": ["Maxwell Nye", "Anders Johan Andreassen", "Guy Gur-Ari"],
+ "authors": [
+ "Maxwell Nye",
+ "Anders Johan Andreassen",
+ "Guy Gur-Ari"
+ ],
"year": 2021,
"arxiv_id": "2112.00114",
"relevance": "Closest prior work using intermediate computation steps for program execution; CoT generalizes this to natural language."
},
{
"title": "Program Induction by Rationale Generation: Learning to Solve and Explain Algebraic Word Problems",
- "authors": ["Wang Ling", "Dani Yogatama", "Chris Dyer"],
+ "authors": [
+ "Wang Ling",
+ "Dani Yogatama",
+ "Chris Dyer"
+ ],
"year": 2017,
"relevance": "Pioneered natural language rationales for math problem solving, the training-based predecessor to CoT prompting."
},
{
"title": "Do As I Can, Not As I Say: Grounding Language in Robotic Affordances",
- "authors": ["Michael Ahn", "Anthony Brohan", "Noah Brown"],
+ "authors": [
+ "Michael Ahn",
+ "Anthony Brohan",
+ "Noah Brown"
+ ],
"year": 2022,
"arxiv_id": "2204.01691",
"relevance": "SayCan robot planning benchmark used to evaluate CoT prompting for commonsense reasoning in robotic instruction following."
},
{
"title": "Scaling Language Models: Methods, Analysis & Insights from Training Gopher",
- "authors": ["Jack W. Rae", "Sebastian Borgeaud", "Trevor Cai"],
+ "authors": [
+ "Jack W. Rae",
+ "Sebastian Borgeaud",
+ "Trevor Cai"
+ ],
"year": 2021,
"arxiv_id": "2112.11446",
"relevance": "Documented that scaling alone is insufficient for reasoning tasks, motivating CoT prompting as an alternative approach."
},
{
"title": "STaR: Bootstrapping Reasoning with Reasoning",
- "authors": ["Eric Zelikman", "Yuhuai Wu", "Noah D. Goodman"],
+ "authors": [
+ "Eric Zelikman",
+ "Yuhuai Wu",
+ "Noah D. Goodman"
+ ],
"year": 2022,
"arxiv_id": "2203.14465",
"relevance": "Extends CoT idea to self-training: models generate rationales, filter correct ones, and finetune on them."
},
{
"title": "Finetuned Language Models Are Zero-Shot Learners",
- "authors": ["Jason Wei", "Maarten Bosma", "Vincent Y. Zhao"],
+ "authors": [
+ "Jason Wei",
+ "Maarten Bosma",
+ "Vincent Y. Zhao"
+ ],
"year": 2022,
"relevance": "Instruction tuning work (FLAN) that augments inputs with task instructions; CoT takes the orthogonal approach of augmenting outputs."
},
{
"title": "Program Synthesis with Large Language Models",
- "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
+ "authors": [
+ "Jacob Austin",
+ "Augustus Odena",
+ "Maxwell Nye"
+ ],
"year": 2021,
"arxiv_id": "2108.07732",
"relevance": "Evaluates LLMs for code generation, related to using intermediate steps in program synthesis."
}
- ]
-}
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 3,
+ "justification": "Chain-of-thought prompting is a directly usable technique that any developer can apply immediately to their LLM prompts with zero tooling changes."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "The finding that simply adding reasoning steps to prompts unlocks dramatic performance gains—and that this is an emergent property of scale—was genuinely surprising when published."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "The paper focuses on improving reasoning capabilities with no discussion of safety risks or misuse potential."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy or conflict; the paper presents a new technique without challenging specific claims or rivals."
+ },
+ "demo_ability": {
+ "score": 2,
+ "justification": "Anyone with API access to a large language model can reproduce the technique immediately by modifying their prompts, though the flagship results require access to 100B+ parameter models."
+ },
+ "brand_recognition": {
+ "score": 2,
+ "justification": "Google Brain is a major research lab, and the paper features PaLM and GPT-3, both well-known models in the AI community."
+ }
+ }
+}
+\ No newline at end of file
diff --git a/papers/codex-humaneval-2021/scan.json b/papers/codex-humaneval-2021/scan.json
@@ -65,8 +65,11 @@
"venue": "arXiv",
"arxiv_id": "2107.03374"
},
- "scan_version": 2,
- "active_modules": ["experimental_rigor", "data_leakage"],
+ "scan_version": 3,
+ "active_modules": [
+ "experimental_rigor",
+ "data_leakage"
+ ],
"checklist": {
"artifacts": {
"code_released": {
@@ -462,7 +465,9 @@
"supported": "strong"
}
],
- "methodology_tags": ["benchmark-eval"],
+ "methodology_tags": [
+ "benchmark-eval"
+ ],
"key_findings": "Codex, a GPT model fine-tuned on 159 GB of GitHub Python code, solves 28.8% of hand-written HumanEval problems with a single sample, vastly outperforming GPT-3 (0%) and GPT-J (11.4%). Repeated sampling is surprisingly effective: 100 samples per problem yield 77.5% pass rate with Codex-S. The paper establishes functional correctness (pass@k) as the appropriate metric over BLEU, demonstrates power law scaling with model size, and provides extensive analysis of limitations including exponential degradation with docstring complexity, misalignment that worsens with scale, and frequent generation of insecure code.",
"red_flags": [
{
@@ -481,86 +486,174 @@
"cited_papers": [
{
"title": "Language Models are Few-Shot Learners",
- "authors": ["Brown, T. B.", "Mann, B.", "Ryder, N.", "et al."],
+ "authors": [
+ "Brown, T. B.",
+ "Mann, B.",
+ "Ryder, N.",
+ "et al."
+ ],
"year": 2020,
"arxiv_id": "2005.14165",
"relevance": "Foundation GPT-3 model that Codex is fine-tuned from; baseline for code generation capability comparison."
},
{
"title": "Measuring Coding Challenge Competence with APPS",
- "authors": ["Hendrycks, D.", "Basart, S.", "Kadavath, S.", "et al."],
+ "authors": [
+ "Hendrycks, D.",
+ "Basart, S.",
+ "Kadavath, S.",
+ "et al."
+ ],
"year": 2021,
"arxiv_id": "2105.09938",
"relevance": "Coding challenge benchmark used to evaluate Codex alongside HumanEval; measures functional correctness on competitive programming tasks."
},
{
"title": "GPT-J-6B: A 6 Billion Parameter Autoregressive Language Model",
- "authors": ["Wang, B.", "Komatsuzaki, A."],
+ "authors": [
+ "Wang, B.",
+ "Komatsuzaki, A."
+ ],
"year": 2021,
"relevance": "Open-source language model baseline for code generation, trained on The Pile with 8% GitHub code."
},
{
"title": "GPT-Neo: Large Scale Autoregressive Language Modeling with Mesh-Tensorflow",
- "authors": ["Black, S.", "Gao, L.", "Wang, P.", "Leahy, C.", "Biderman, S."],
+ "authors": [
+ "Black, S.",
+ "Gao, L.",
+ "Wang, P.",
+ "Leahy, C.",
+ "Biderman, S."
+ ],
"year": 2021,
"relevance": "Open-source GPT-style model serving as baseline for code generation; trained on The Pile."
},
{
"title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages",
- "authors": ["Feng, Z.", "Guo, D.", "Tang, D.", "et al."],
+ "authors": [
+ "Feng, Z.",
+ "Guo, D.",
+ "Tang, D.",
+ "et al."
+ ],
"year": 2020,
"relevance": "Pre-trained code representation model using BERT objective on docstring-function pairs."
},
{
"title": "Scaling Laws for Neural Language Models",
- "authors": ["Kaplan, J.", "McCandlish, S.", "Henighan, T.", "et al."],
+ "authors": [
+ "Kaplan, J.",
+ "McCandlish, S.",
+ "Henighan, T.",
+ "et al."
+ ],
"year": 2020,
"arxiv_id": "2003.05950",
"relevance": "Establishes power law scaling relationships that Codex's performance also follows after code fine-tuning."
},
{
"title": "SPoC: Search-based Pseudocode to Code",
- "authors": ["Kulal, S.", "Pasupat, P.", "Chandra, K.", "et al."],
+ "authors": [
+ "Kulal, S.",
+ "Pasupat, P.",
+ "Chandra, K.",
+ "et al."
+ ],
"year": 2019,
"relevance": "Introduced the pass@k metric for evaluating functional correctness of synthesized code."
},
{
"title": "Unsupervised Translation of Programming Languages",
- "authors": ["Lachaux, M.-A.", "Rozière, B.", "Chanussot, L.", "Lample, G."],
+ "authors": [
+ "Lachaux, M.-A.",
+ "Rozière, B.",
+ "Chanussot, L.",
+ "Lample, G."
+ ],
"year": 2020,
"arxiv_id": "2006.03511",
"relevance": "Demonstrated functional correctness as a better evaluation metric than BLEU for code translation."
},
{
"title": "Extracting Training Data from Large Language Models",
- "authors": ["Carlini, N.", "Tramèr, F.", "Wallace, E.", "et al."],
+ "authors": [
+ "Carlini, N.",
+ "Tramèr, F.",
+ "Wallace, E.",
+ "et al."
+ ],
"year": 2021,
"relevance": "Demonstrates privacy risks of training data memorization in large language models, applicable to code models trained on public repositories."
},
{
"title": "You Autocomplete Me: Poisoning Vulnerabilities in Neural Code Completion",
- "authors": ["Schuster, R.", "Song, C.", "Tromer, E.", "Shmatikov, V."],
+ "authors": [
+ "Schuster, R.",
+ "Song, C.",
+ "Tromer, E.",
+ "Shmatikov, V."
+ ],
"year": 2020,
"relevance": "Demonstrates data poisoning attacks on code completion models, a supply chain security risk for code generation."
},
{
"title": "In-IDE Code Generation from Natural Language: Promise and Challenges",
- "authors": ["Xu, F. F.", "Vasilescu, B.", "Neubig, G."],
+ "authors": [
+ "Xu, F. F.",
+ "Vasilescu, B.",
+ "Neubig, G."
+ ],
"year": 2021,
"arxiv_id": "2101.11149",
"relevance": "Evaluates capabilities and challenges of code generation in IDE settings, directly relevant to code generation evaluation."
},
{
"title": "Learning Autocompletion from Real-World Datasets",
- "authors": ["Aye, G. A.", "Kim, S.", "Li, H."],
+ "authors": [
+ "Aye, G. A.",
+ "Kim, S.",
+ "Li, H."
+ ],
"year": 2021,
"relevance": "Reports on Facebook's internal code autocomplete tool, providing industry perspective on code generation deployment."
},
{
"title": "The Pile: An 800GB Dataset of Diverse Text for Language Modeling",
- "authors": ["Gao, L.", "Biderman, S.", "Black, S.", "et al."],
+ "authors": [
+ "Gao, L.",
+ "Biderman, S.",
+ "Black, S.",
+ "et al."
+ ],
"year": 2020,
"relevance": "Training dataset for GPT-Neo and GPT-J baselines, containing 8% GitHub code that enables programming capabilities."
}
- ]
-}
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 3,
+ "justification": "Codex directly powers GitHub Copilot, making this the foundational paper for a tool millions of developers use daily."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+      "justification": "The effectiveness of repeated sampling (28.8% with one sample → 77.5% with 100 samples) and the finding that BLEU scores are unreliable for code were genuinely surprising at publication."
+ },
+ "fear_safety": {
+ "score": 1,
+ "justification": "The paper mentions insecure code generation and misalignment worsening with scale, but these are secondary to the main capability story."
+ },
+ "drama_conflict": {
+ "score": 1,
+ "justification": "OpenAI evaluating its own commercial product raises mild conflict-of-interest concerns, but the paper doesn't directly challenge competitors."
+ },
+ "demo_ability": {
+ "score": 2,
+ "justification": "HumanEval benchmark is publicly released and Codex was accessible via API, though the model weights and training data were not released."
+ },
+ "brand_recognition": {
+ "score": 3,
+ "justification": "OpenAI paper about the model powering GitHub Copilot — two of the most recognized names in AI and developer tools."
+ }
+ }
+}
+\ No newline at end of file
diff --git a/papers/coding-agents-generating-2026/scan.json b/papers/coding-agents-generating-2026/scan.json
@@ -10,7 +10,7 @@
"arxiv_id": "2602.00409",
"doi": "10.1145/3793302.3793362"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [],
"methodology_tags": [
"observational"
@@ -446,5 +446,31 @@
"arxiv_id": "2507.10422",
"relevance": "Study of how developers self-report LLM usage in open-source, complementary approach to agent trace mining."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Directly relevant to teams using coding agents like Copilot/Cursor for testing — actionable insight to review and constrain agent mocking behavior."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "The finding that agents use mocks 95% of the time while ignoring fakes and spies is a surprising narrowness that challenges assumptions about agent sophistication."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety or security angle — this is about code quality, not risk."
+ },
+ "drama_conflict": {
+ "score": 1,
+ "justification": "Mildly questions the quality of AI-generated tests but stops short of naming specific tools or making strong 'agents are harmful' claims."
+ },
+ "demo_ability": {
+ "score": 0,
+ "justification": "Observational study with no tool, demo, or code artifact that practitioners can try."
+ },
+ "brand_recognition": {
+ "score": 1,
+ "justification": "Published at MSR (respected but niche venue) by authors from recognized but not headline-grabbing institutions; topic touches well-known tools without naming them."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/copilot-productivity-controlled-2023/scan.json b/papers/copilot-productivity-controlled-2023/scan.json
@@ -11,7 +11,7 @@
"venue": "arXiv",
"arxiv_id": "2302.06590"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor"
],
@@ -489,5 +489,31 @@
"year": 2022,
"relevance": "Research agenda for studying economic impacts of AI code generation, directly framing this paper's contribution."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Directly measures the productivity impact of a tool millions of developers already use or are considering adopting."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The 55.8% speed improvement is larger than most expected but the direction (Copilot helps) confirms prevailing beliefs."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk angle is present in this productivity study."
+ },
+ "drama_conflict": {
+ "score": 2,
+ "justification": "Microsoft/GitHub authors evaluating their own product with a non-representative sample and 63% attrition invites strong 'the study is compromised' discourse."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Anyone with GitHub Copilot access can try it, but the study itself provides no code, demo, or reproducible experimental setup."
+ },
+ "brand_recognition": {
+ "score": 3,
+ "justification": "GitHub Copilot is used by millions of developers and the study is from Microsoft Research, making it instantly recognizable."
+ }
+ }
}
\ No newline at end of file
diff --git a/scripts/catchup-v3.py b/scripts/catchup-v3.py
@@ -0,0 +1,256 @@
+#!/usr/bin/env python3
+"""
+Catch-up script: add v3 engagement factors to existing v2 scans.
+
+Reads each v2 scan.json, sends title + key findings + claims to Opus
+for engagement factor classification, merges results into v3 scan.json.
+
+Usage:
+ python3 scripts/catchup-v3.py # All v2 scans without v3
+ python3 scripts/catchup-v3.py --limit 10 # First N
+ python3 scripts/catchup-v3.py --parallel 4 # Concurrent
+ python3 scripts/catchup-v3.py --id metr-rct-2025 # Specific paper
+"""
+
+import json
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parent.parent  # directory above this script's folder; used as cwd for the claude CLI
+PAPERS_DIR = ROOT / "papers"  # scans are read from PAPERS_DIR/<paper_id>/scan.json
+
+ENGAGEMENT_PROMPT = """You are classifying a research paper on 6 dimensions that predict social media engagement (Hacker News, Reddit, tech newsletters). Rate each dimension 0-3 based on the paper's content, framing, and likely audience appeal.
+
+## Dimensions
+
+1. **practical_relevance** (0-3): Can a developer/practitioner use this at work? Tools, techniques, frameworks they can apply immediately.
+   - 0: Pure theory, no practical application
+   - 1: Potentially useful but requires significant adaptation
+   - 2: Contains actionable techniques or findings practitioners can apply
+   - 3: Directly usable tool, library, or technique with immediate workflow impact
+
+2. **surprise_contrarian** (0-3): Does this challenge conventional wisdom or reveal something unexpected?
+   - 0: Confirms what everyone already believes
+   - 1: Minor unexpected finding buried in expected results
+   - 2: Main finding is somewhat surprising or counterintuitive
+   - 3: Directly contradicts a widely-held belief with evidence
+
+3. **fear_safety** (0-3): Does this raise concerns about AI risks, security vulnerabilities, or misuse potential?
+   - 0: No safety/risk angle
+   - 1: Mentions risks as a secondary concern
+   - 2: Safety/risk is a major theme with concrete demonstrations
+   - 3: Demonstrates a novel attack, vulnerability, or existential concern
+
+4. **drama_conflict** (0-3): Is there a controversy, company rivalry, or "the emperor has no clothes" angle?
+   - 0: No controversy or conflict
+   - 1: Mild tension (e.g., questions a popular approach)
+   - 2: Directly challenges a specific company's claims or a popular benchmark
+   - 3: Major controversy — "benchmarks are fake", "company X is lying", replication failure
+
+5. **demo_ability** (0-3): Can someone try this themselves right now?
+   - 0: No code, no demo, no way to interact
+   - 1: Code exists but requires significant setup
+   - 2: Reproducible with moderate effort, clear instructions
+   - 3: Live demo, web app, or pip-installable tool you can try in minutes
+
+6. **brand_recognition** (0-3): Is this from a famous lab or about a famous product?
+   - 0: Unknown lab/authors, obscure topic
+   - 1: Recognized institution but not a household name in tech
+   - 2: Major tech company or famous research lab (Google Brain, FAIR, etc.)
+   - 3: About a product millions use (ChatGPT, Copilot, Cursor) or from OpenAI/Anthropic/DeepMind
+
+## Paper Information
+
+Title: {title}
+Authors: {authors}
+Year: {year}
+Venue: {venue}
+Tags: {tags}
+
+Key Findings:
+{key_findings}
+
+Claims:
+{claims}
+
+Red Flags:
+{red_flags}
+
+## Output
+
+Respond with ONLY a JSON object, no other text:
+{{
+  "engagement_factors": {{
+    "practical_relevance": {{"score": <0-3>, "justification": "<1 sentence>"}},
+    "surprise_contrarian": {{"score": <0-3>, "justification": "<1 sentence>"}},
+    "fear_safety": {{"score": <0-3>, "justification": "<1 sentence>"}},
+    "drama_conflict": {{"score": <0-3>, "justification": "<1 sentence>"}},
+    "demo_ability": {{"score": <0-3>, "justification": "<1 sentence>"}},
+    "brand_recognition": {{"score": <0-3>, "justification": "<1 sentence>"}}
+  }}
+}}"""  # filled in via str.format(); the doubled braces above render as literal { and }
+
+
+def classify_one(paper_id):
+    """Classify one paper's engagement factors and upgrade its scan to v3.
+
+    Builds a prompt from fields already stored in the paper's scan.json,
+    pipes it to the ``claude`` CLI, and rewrites the scan (scan_version 3
+    plus ``engagement_factors``) only after the reply passes validation.
+
+    Returns ``(paper_id, ok, reason)``. ``ok`` is False for a missing or v1
+    scan and for any CLI, parse, or validation failure; a scan that is
+    already v3 is left untouched and reported as ok.
+    """
+    scan_path = PAPERS_DIR / paper_id / "scan.json"
+    if not scan_path.exists():
+        return paper_id, False, "no scan.json"
+
+    # Scans contain non-ASCII text, so never rely on the locale's encoding.
+    with open(scan_path, encoding="utf-8") as f:
+        scan = json.load(f)
+    if scan.get("scan_version", 1) < 2:
+        return paper_id, False, "v1 scan, skip"
+    if scan.get("scan_version") == 3:
+        return paper_id, True, "already v3"
+
+    # Build prompt from existing scan data
+    paper = scan.get("paper", {})
+    claims_text = "\n".join(f"- [{c.get('supported', '?')}] {c.get('claim', '')}"
+                            for c in scan.get("claims", []))
+    red_flags_text = "\n".join(f"- {r.get('flag', '')}: {r.get('detail', '')}"
+                               for r in scan.get("red_flags", []))
+    prompt = ENGAGEMENT_PROMPT.format(
+        title=paper.get("title", ""),
+        authors=", ".join(paper.get("authors", [])[:5]),
+        year=paper.get("year", ""),
+        venue=paper.get("venue", ""),
+        tags=", ".join(scan.get("methodology_tags", [])),
+        key_findings=scan.get("key_findings", ""),
+        claims=claims_text or "(none)",
+        red_flags=red_flags_text or "(none)",
+    )
+    required = ("practical_relevance", "surprise_contrarian", "fear_safety",
+                "drama_conflict", "demo_ability", "brand_recognition")
+
+    try:
+        result = subprocess.run(
+            ["claude", "-p", "-", "--model", "opus", "--max-turns", "1"],
+            input=prompt,
+            capture_output=True, text=True, timeout=120,
+            cwd=str(ROOT),
+        )
+        if result.returncode != 0:
+            return paper_id, False, f"claude exit {result.returncode}"
+        # The reply may wrap the JSON in prose; take the first "{" to last "}".
+        output = result.stdout.strip()
+        json_start = output.find("{")
+        json_end = output.rfind("}") + 1
+        if json_start == -1 or json_end == 0:
+            return paper_id, False, "no JSON in output"
+        parsed = json.loads(output[json_start:json_end])
+        factors = parsed.get("engagement_factors", parsed)
+
+        # Validate before writing; a score must be a real int (not bool) in 0-3.
+        for key in required:
+            if not isinstance(factors, dict) or key not in factors:
+                return paper_id, False, f"missing factor: {key}"
+            entry = factors[key]
+            if not isinstance(entry, dict) or "score" not in entry:
+                return paper_id, False, f"missing score in {key}"
+            score = entry["score"]
+            if type(score) is not int or not 0 <= score <= 3:
+                return paper_id, False, f"invalid score in {key}: {score!r}"
+
+        # Merge into v3, keeping only the six known factors in canonical order
+        scan["scan_version"] = 3
+        scan["engagement_factors"] = {k: factors[k] for k in required}
+        with open(scan_path, "w", encoding="utf-8") as f:
+            json.dump(scan, f, ensure_ascii=False, indent=2)
+        scores = ",".join(str(factors[k]["score"]) for k in required)
+        return paper_id, True, f"v3 [{scores}]"
+    except json.JSONDecodeError as e:
+        return paper_id, False, f"JSON parse error: {e}"
+    except subprocess.TimeoutExpired:
+        return paper_id, False, "timeout"
+    except Exception as e:
+        return paper_id, False, f"error: {e}"
+
+
+def main():
+    """Upgrade v2 scans under papers/ to v3, sequentially or in a thread pool.
+
+    Flags: --limit N (first N candidates), --id PAPER (only that paper; it is
+    queued even if already v3), --parallel N (N concurrent classifications).
+    Unreadable scan files are reported as failures instead of aborting the run.
+    """
+    import argparse  # local import: only the CLI entry point needs it
+
+    parser = argparse.ArgumentParser(description="Add v3 engagement factors to v2 scans.")
+    parser.add_argument("--limit", type=int, help="only the first N candidates")
+    parser.add_argument("--id", dest="paper_id", help="only this paper id")
+    parser.add_argument("--parallel", type=int, default=1,
+                        help="number of concurrent classifications")
+    opts = parser.parse_args(sys.argv[1:])
+
+    # Collect candidates; scans that cannot be read or parsed become failures.
+    candidates = []
+    failures = []
+    for scan_path in sorted(PAPERS_DIR.glob("*/scan.json")):
+        pid = scan_path.parent.name
+        if opts.paper_id and pid != opts.paper_id:
+            continue
+        try:
+            with open(scan_path, encoding="utf-8") as f:
+                version = json.load(f).get("scan_version", 1)
+        except (OSError, ValueError) as e:
+            failures.append((pid, f"unreadable scan.json: {e}"))
+            continue
+        if version < 2 or (version == 3 and not opts.paper_id):
+            continue
+        candidates.append(pid)
+
+    if opts.limit is not None:  # "--limit 0" means zero papers, not "no limit"
+        candidates = candidates[:max(opts.limit, 0)]
+    if not candidates and not failures:
+        print("No papers to classify.")
+        return
+
+    print(f"Classifying engagement factors for {len(candidates)} papers"
+          f"{f' (parallel={opts.parallel})' if opts.parallel > 1 else ''}:\n")
+
+    ok_count = 0
+    if opts.parallel > 1:
+        with ThreadPoolExecutor(max_workers=opts.parallel) as executor:
+            futures = [executor.submit(classify_one, pid) for pid in candidates]
+            for future in as_completed(futures):
+                pid, ok, reason = future.result()
+                if ok:
+                    ok_count += 1
+                    if reason != "already v3":
+                        print(f"  OK: {pid} — {reason}")
+                else:
+                    failures.append((pid, reason))
+                    print(f"  FAIL: {pid} — {reason}")
+    else:
+        for i, pid in enumerate(candidates, start=1):
+            print(f"[{i}/{len(candidates)}] {pid}")
+            _, ok, reason = classify_one(pid)
+            if ok:
+                ok_count += 1
+                print(f"  {reason}")
+            else:
+                failures.append((pid, reason))
+                print(f"  FAIL: {reason}")
+
+    print(f"\nDone. OK: {ok_count}, Failed: {len(failures)}")
+    if failures:
+        print("Failures:")
+        for pid, reason in failures[:20]:
+            print(f"    {pid}: {reason}")
+
+
+if __name__ == "__main__":
+    main()