Add 3 new tensions, expand keyword matching for existing 3 - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

commit ddde6369343ce6a1c7129bb5e8093318815ae2a7
parent 372ecdeaa4a2719d071db645776f137c093a7bc5
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Tue, 24 Mar 2026 06:49:13 +0100

Add 3 new tensions, expand keyword matching for existing 3

New tensions:
- Security Arms Race: 379 defense vs 546 attack claims
- Code Quality Paradox: 363 LLMs-help vs 251 LLMs-hurt
- Scaling Debate: 152 efficient vs 452 limits

Expanded keywords for existing tensions:
- Benchmarks: added pass@, accuracy, f1 score, performance on, state-of-the-art, sota (103→315 pos)
- Agents: added agentic, tool use, planning, chain-of-thought, reasoning capability (72→110 pos)
- Productivity: added developer productivity, coding efficiency, development time, time savings, code faster (minor)

Each tension gets a butterfly chart with bar width encoding.
Total claim coverage: 858→3600 (out of 5115 total claims).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M explorer/src/data.ts  | 15 ++++++++++++---
M explorer/src/views/tensions.ts  | 15 +++++++++++++++
M explorer/tests/explorer.spec.ts  | 2 +-
M scripts/build-explorer-data.py  | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------

4 files changed, 82 insertions(+), 22 deletions(-)
diff --git a/explorer/src/data.ts b/explorer/src/data.ts
@@ -123,10 +123,19 @@ export interface TensionClaim {
   year: number | null;
 }
 
+export interface TensionSides {
+  positive: TensionClaim[];
+  nuanced: TensionClaim[];
+}
+
 export interface Tensions {
-  productivity: { positive: TensionClaim[]; nuanced: TensionClaim[] };
-  benchmarks: { positive: TensionClaim[]; nuanced: TensionClaim[] };
-  agents: { positive: TensionClaim[]; nuanced: TensionClaim[] };
+  productivity: TensionSides;
+  benchmarks: TensionSides;
+  agents: TensionSides;
+  security: TensionSides;
+  code_quality: TensionSides;
+  scaling: TensionSides;
+  [key: string]: TensionSides;
 }
 
 export interface NetNode {
diff --git a/explorer/src/views/tensions.ts b/explorer/src/views/tensions.ts
@@ -16,6 +16,21 @@ const TENSION_META: Record<string, { title: string; positive: string; nuanced: s
     positive: 'Agents succeed at tasks',
     nuanced: 'Agents fail in deployment',
   },
+  security: {
+    title: 'Security Arms Race',
+    positive: 'Defenses work',
+    nuanced: 'Attacks succeed',
+  },
+  code_quality: {
+    title: 'Code Quality Paradox',
+    positive: 'LLMs improve code',
+    nuanced: 'LLMs introduce defects',
+  },
+  scaling: {
+    title: 'Scaling Debate',
+    positive: 'Scaling is efficient',
+    nuanced: 'Scaling hits limits',
+  },
 };
 
 function meanScore(claims: TensionClaim[]): number {
diff --git a/explorer/tests/explorer.spec.ts b/explorer/tests/explorer.spec.ts
@@ -167,7 +167,7 @@ test.describe('Tensions', () => {
   test('shows three tension groups', async ({ page }) => {
     await page.goto('/#/tensions');
     await expect(page.locator('.tension-group').first()).toBeVisible({ timeout: 10000 });
-    expect(await page.locator('.tension-group').count()).toBe(3);
+    expect(await page.locator('.tension-group').count()).toBe(6);
   });
 
   test('shows tension claims', async ({ page }) => {
diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py
@@ -253,6 +253,9 @@ def build():
         "productivity": {"positive": [], "nuanced": []},
         "benchmarks": {"positive": [], "nuanced": []},
         "agents": {"positive": [], "nuanced": []},
+        "security": {"positive": [], "nuanced": []},
+        "code_quality": {"positive": [], "nuanced": []},
+        "scaling": {"positive": [], "nuanced": []},
     }
 
     for scan_path in sorted(PAPERS_DIR.glob("*/scan.json")):
@@ -365,24 +368,57 @@ def build():
         # Tension classification
         for claim in claims:
             ct = claim.get("claim", "").lower()
-            if any(k in ct for k in ["productivity", "developer speed", "completion time", "speedup", "faster"]):
-                bucket = "positive" if any(k in ct for k in ["faster", "speedup", "improves", "increases", "gain"]) else "nuanced"
-                tensions["productivity"][bucket].append({
-                    "paper_id": paper_id, "claim": claim["claim"],
-                    "supported": claim.get("supported", ""), "score": score_pct, "year": year,
-                })
-            if any(k in ct for k in ["benchmark", "evaluation", "leaderboard", "swe-bench"]):
-                bucket = "positive" if any(k in ct for k in ["state-of-the-art", "outperforms", "achieves", "best"]) else "nuanced"
-                tensions["benchmarks"][bucket].append({
-                    "paper_id": paper_id, "claim": claim["claim"],
-                    "supported": claim.get("supported", ""), "score": score_pct, "year": year,
-                })
-            if any(k in ct for k in ["agent", "autonomous", "multi-agent"]):
-                bucket = "positive" if any(k in ct for k in ["solves", "achieves", "succeeds", "capable", "outperforms"]) else "nuanced"
-                tensions["agents"][bucket].append({
-                    "paper_id": paper_id, "claim": claim["claim"],
-                    "supported": claim.get("supported", ""), "score": score_pct, "year": year,
-                })
+            entry = {"paper_id": paper_id, "claim": claim["claim"],
+                     "supported": claim.get("supported", ""), "score": score_pct, "year": year}
+
+            # Productivity
+            if any(k in ct for k in ["productivity", "developer speed", "completion time", "speedup",
+                                      "faster", "developer productivity", "coding efficiency",
+                                      "development time", "time savings", "code faster"]):
+                bucket = "positive" if any(k in ct for k in ["faster", "speedup", "improves", "increases",
+                                                              "gain", "savings", "efficient"]) else "nuanced"
+                tensions["productivity"][bucket].append(entry)
+
+            # Benchmarks (expanded)
+            if any(k in ct for k in ["benchmark", "evaluation", "leaderboard", "swe-bench",
+                                      "pass@", "accuracy", "f1 score", "performance on",
+                                      "state-of-the-art", "sota"]):
+                bucket = "positive" if any(k in ct for k in ["state-of-the-art", "outperforms", "achieves",
+                                                              "best", "surpasses", "exceeds"]) else "nuanced"
+                tensions["benchmarks"][bucket].append(entry)
+
+            # Agents (expanded)
+            if any(k in ct for k in ["agent", "autonomous", "multi-agent", "agentic",
+                                      "tool use", "planning", "chain-of-thought",
+                                      "reasoning capability"]):
+                bucket = "positive" if any(k in ct for k in ["solves", "achieves", "succeeds", "capable",
+                                                              "outperforms", "enables", "improves"]) else "nuanced"
+                tensions["agents"][bucket].append(entry)
+
+            # Security arms race (NEW)
+            if any(k in ct for k in ["attack", "defense", "jailbreak", "injection", "adversarial",
+                                      "vulnerability", "safety", "alignment", "harmful", "toxic",
+                                      "secure", "exploit", "bypass", "mitigat"]):
+                bucket = "positive" if any(k in ct for k in ["defense", "protect", "mitigat", "detect",
+                                                              "prevent", "secure", "effective", "reduces",
+                                                              "robust"]) else "nuanced"
+                tensions["security"][bucket].append(entry)
+
+            # Code quality (NEW)
+            if any(k in ct for k in ["code quality", "bug", "vulnerability", "error", "defect",
+                                      "repair", "fix", "correct", "hallucin", "incorrect code",
+                                      "insecure code", "code generation"]):
+                bucket = "positive" if any(k in ct for k in ["repair", "fix", "correct", "improve",
+                                                              "reduc", "effective", "resolve"]) else "nuanced"
+                tensions["code_quality"][bucket].append(entry)
+
+            # Scaling debate (NEW)
+            if any(k in ct for k in ["scaling", "scale", "cost", "efficient", "latency",
+                                      "token", "compute", "inference", "smaller model",
+                                      "distill", "compress"]):
+                bucket = "positive" if any(k in ct for k in ["efficient", "reduc", "cheaper", "faster",
+                                                              "smaller", "compet", "saving"]) else "nuanced"
+                tensions["scaling"][bucket].append(entry)
 
         cat_scores_pct = {k: round(v * 100, 1) for k, v in cat_scores.items()}

	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs

M	explorer/src/data.ts	\|	15	++++++++++++---
M	explorer/src/views/tensions.ts	\|	15	+++++++++++++++
M	explorer/tests/explorer.spec.ts	\|	2	+-
M	scripts/build-explorer-data.py	\|	72	++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------