commit ddde6369343ce6a1c7129bb5e8093318815ae2a7
parent 372ecdeaa4a2719d071db645776f137c093a7bc5
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Tue, 24 Mar 2026 06:49:13 +0100
Add 3 new tensions, expand keyword matching for existing 3
New tensions:
- Security Arms Race: 379 defense vs 546 attack claims
- Code Quality Paradox: 363 LLMs-help vs 251 LLMs-hurt
- Scaling Debate: 152 efficient vs 452 limits
Expanded keywords for existing tensions:
- Benchmarks: added match keywords pass@, accuracy, f1 score, performance on, state-of-the-art, sota; added positive keywords surpasses, exceeds (103→315 pos)
- Agents: added match keywords agentic, tool use, planning, chain-of-thought, reasoning capability; added positive keywords enables, improves (72→110 pos)
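- Productivity: added match keywords developer productivity, coding efficiency, development time, time savings, code faster; added positive keywords savings, efficient (minor)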
Each tension gets a butterfly chart with bar width encoding.
Total claim coverage: 858→3600 (from 5115 total claims).
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
4 files changed, 82 insertions(+), 22 deletions(-)
diff --git a/explorer/src/data.ts b/explorer/src/data.ts
@@ -123,10 +123,19 @@ export interface TensionClaim {
year: number | null;
}
+export interface TensionSides {
+ positive: TensionClaim[];
+ nuanced: TensionClaim[];
+}
+
export interface Tensions {
- productivity: { positive: TensionClaim[]; nuanced: TensionClaim[] };
- benchmarks: { positive: TensionClaim[]; nuanced: TensionClaim[] };
- agents: { positive: TensionClaim[]; nuanced: TensionClaim[] };
+ productivity: TensionSides;
+ benchmarks: TensionSides;
+ agents: TensionSides;
+ security: TensionSides;
+ code_quality: TensionSides;
+ scaling: TensionSides;
+ [key: string]: TensionSides;
}
export interface NetNode {
diff --git a/explorer/src/views/tensions.ts b/explorer/src/views/tensions.ts
@@ -16,6 +16,21 @@ const TENSION_META: Record<string, { title: string; positive: string; nuanced: s
positive: 'Agents succeed at tasks',
nuanced: 'Agents fail in deployment',
},
+ security: {
+ title: 'Security Arms Race',
+ positive: 'Defenses work',
+ nuanced: 'Attacks succeed',
+ },
+ code_quality: {
+ title: 'Code Quality Paradox',
+ positive: 'LLMs improve code',
+ nuanced: 'LLMs introduce defects',
+ },
+ scaling: {
+ title: 'Scaling Debate',
+ positive: 'Scaling is efficient',
+ nuanced: 'Scaling hits limits',
+ },
};
function meanScore(claims: TensionClaim[]): number {
diff --git a/explorer/tests/explorer.spec.ts b/explorer/tests/explorer.spec.ts
@@ -167,7 +167,7 @@ test.describe('Tensions', () => {
test('shows three tension groups', async ({ page }) => {
await page.goto('/#/tensions');
await expect(page.locator('.tension-group').first()).toBeVisible({ timeout: 10000 });
- expect(await page.locator('.tension-group').count()).toBe(3);
+ expect(await page.locator('.tension-group').count()).toBe(6);
});
test('shows tension claims', async ({ page }) => {
diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py
@@ -253,6 +253,9 @@ def build():
"productivity": {"positive": [], "nuanced": []},
"benchmarks": {"positive": [], "nuanced": []},
"agents": {"positive": [], "nuanced": []},
+ "security": {"positive": [], "nuanced": []},
+ "code_quality": {"positive": [], "nuanced": []},
+ "scaling": {"positive": [], "nuanced": []},
}
for scan_path in sorted(PAPERS_DIR.glob("*/scan.json")):
@@ -365,24 +368,57 @@ def build():
# Tension classification
for claim in claims:
ct = claim.get("claim", "").lower()
- if any(k in ct for k in ["productivity", "developer speed", "completion time", "speedup", "faster"]):
- bucket = "positive" if any(k in ct for k in ["faster", "speedup", "improves", "increases", "gain"]) else "nuanced"
- tensions["productivity"][bucket].append({
- "paper_id": paper_id, "claim": claim["claim"],
- "supported": claim.get("supported", ""), "score": score_pct, "year": year,
- })
- if any(k in ct for k in ["benchmark", "evaluation", "leaderboard", "swe-bench"]):
- bucket = "positive" if any(k in ct for k in ["state-of-the-art", "outperforms", "achieves", "best"]) else "nuanced"
- tensions["benchmarks"][bucket].append({
- "paper_id": paper_id, "claim": claim["claim"],
- "supported": claim.get("supported", ""), "score": score_pct, "year": year,
- })
- if any(k in ct for k in ["agent", "autonomous", "multi-agent"]):
- bucket = "positive" if any(k in ct for k in ["solves", "achieves", "succeeds", "capable", "outperforms"]) else "nuanced"
- tensions["agents"][bucket].append({
- "paper_id": paper_id, "claim": claim["claim"],
- "supported": claim.get("supported", ""), "score": score_pct, "year": year,
- })
+ entry = {"paper_id": paper_id, "claim": claim["claim"],
+ "supported": claim.get("supported", ""), "score": score_pct, "year": year}
+
+ # Productivity
+ if any(k in ct for k in ["productivity", "developer speed", "completion time", "speedup",
+ "faster", "developer productivity", "coding efficiency",
+ "development time", "time savings", "code faster"]):
+ bucket = "positive" if any(k in ct for k in ["faster", "speedup", "improves", "increases",
+ "gain", "savings", "efficient"]) else "nuanced"
+ tensions["productivity"][bucket].append(entry)
+
+ # Benchmarks (expanded)
+ if any(k in ct for k in ["benchmark", "evaluation", "leaderboard", "swe-bench",
+ "pass@", "accuracy", "f1 score", "performance on",
+ "state-of-the-art", "sota"]):
+ bucket = "positive" if any(k in ct for k in ["state-of-the-art", "outperforms", "achieves",
+ "best", "surpasses", "exceeds"]) else "nuanced"
+ tensions["benchmarks"][bucket].append(entry)
+
+ # Agents (expanded)
+ if any(k in ct for k in ["agent", "autonomous", "multi-agent", "agentic",
+ "tool use", "planning", "chain-of-thought",
+ "reasoning capability"]):
+ bucket = "positive" if any(k in ct for k in ["solves", "achieves", "succeeds", "capable",
+ "outperforms", "enables", "improves"]) else "nuanced"
+ tensions["agents"][bucket].append(entry)
+
+ # Security arms race (NEW)
+ if any(k in ct for k in ["attack", "defense", "jailbreak", "injection", "adversarial",
+ "vulnerability", "safety", "alignment", "harmful", "toxic",
+ "secure", "exploit", "bypass", "mitigat"]):
+ bucket = "positive" if any(k in ct for k in ["defense", "protect", "mitigat", "detect",
+ "prevent", "secure", "effective", "reduces",
+ "robust"]) else "nuanced"
+ tensions["security"][bucket].append(entry)
+
+ # Code quality (NEW)
+ if any(k in ct for k in ["code quality", "bug", "vulnerability", "error", "defect",
+ "repair", "fix", "correct", "hallucin", "incorrect code",
+ "insecure code", "code generation"]):
+ bucket = "positive" if any(k in ct for k in ["repair", "fix", "correct", "improve",
+ "reduc", "effective", "resolve"]) else "nuanced"
+ tensions["code_quality"][bucket].append(entry)
+
+ # Scaling debate (NEW)
+ if any(k in ct for k in ["scaling", "scale", "cost", "efficient", "latency",
+ "token", "compute", "inference", "smaller model",
+ "distill", "compress"]):
+ bucket = "positive" if any(k in ct for k in ["efficient", "reduc", "cheaper", "faster",
+ "smaller", "compet", "saving"]) else "nuanced"
+ tensions["scaling"][bucket].append(entry)
cat_scores_pct = {k: round(v * 100, 1) for k, v in cat_scores.items()}