ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

commit ddde6369343ce6a1c7129bb5e8093318815ae2a7
parent 372ecdeaa4a2719d071db645776f137c093a7bc5
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Tue, 24 Mar 2026 06:49:13 +0100

Add 3 new tensions, expand keyword matching for existing 3

New tensions:
- Security Arms Race: 379 defense vs 546 attack claims
- Code Quality Paradox: 363 LLMs-help vs 251 LLMs-hurt
- Scaling Debate: 152 efficient vs 452 limits

Expanded keywords for existing tensions:
- Benchmarks: added pass@, accuracy, f1, performance on, sota (103→315 pos)
- Agents: added agentic, tool use, planning, chain-of-thought (72→110 pos)
- Productivity: added developer productivity, coding efficiency (minor)

Each tension gets a butterfly chart with bar width encoding.
Total claim coverage: 858→3600 (from 5115 total claims).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M explorer/src/data.ts | 15 ++++++++++++---
M explorer/src/views/tensions.ts | 15 +++++++++++++++
M explorer/tests/explorer.spec.ts | 2 +-
M scripts/build-explorer-data.py | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
4 files changed, 82 insertions(+), 22 deletions(-)

diff --git a/explorer/src/data.ts b/explorer/src/data.ts @@ -123,10 +123,19 @@ export interface TensionClaim { year: number | null; } +export interface TensionSides { + positive: TensionClaim[]; + nuanced: TensionClaim[]; +} + export interface Tensions { - productivity: { positive: TensionClaim[]; nuanced: TensionClaim[] }; - benchmarks: { positive: TensionClaim[]; nuanced: TensionClaim[] }; - agents: { positive: TensionClaim[]; nuanced: TensionClaim[] }; + productivity: TensionSides; + benchmarks: TensionSides; + agents: TensionSides; + security: TensionSides; + code_quality: TensionSides; + scaling: TensionSides; + [key: string]: TensionSides; } export interface NetNode { diff --git a/explorer/src/views/tensions.ts b/explorer/src/views/tensions.ts @@ -16,6 +16,21 @@ const TENSION_META: Record<string, { title: string; positive: string; nuanced: s positive: 'Agents succeed at tasks', nuanced: 'Agents fail in deployment', }, + security: { + title: 'Security Arms Race', + positive: 'Defenses work', + nuanced: 'Attacks succeed', + }, + code_quality: { + title: 'Code Quality Paradox', + positive: 'LLMs improve code', + nuanced: 'LLMs introduce defects', + }, + scaling: { + title: 'Scaling Debate', + positive: 'Scaling is efficient', + nuanced: 'Scaling hits limits', + }, }; function meanScore(claims: TensionClaim[]): number { diff --git a/explorer/tests/explorer.spec.ts b/explorer/tests/explorer.spec.ts @@ -167,7 +167,7 @@ test.describe('Tensions', () => { test('shows three tension groups', async ({ page }) => { await page.goto('/#/tensions'); await expect(page.locator('.tension-group').first()).toBeVisible({ timeout: 10000 }); - expect(await page.locator('.tension-group').count()).toBe(3); + expect(await page.locator('.tension-group').count()).toBe(6); }); test('shows tension claims', async ({ page }) => { diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py @@ -253,6 +253,9 @@ def build(): "productivity": {"positive": [], "nuanced": []}, 
"benchmarks": {"positive": [], "nuanced": []}, "agents": {"positive": [], "nuanced": []}, + "security": {"positive": [], "nuanced": []}, + "code_quality": {"positive": [], "nuanced": []}, + "scaling": {"positive": [], "nuanced": []}, } for scan_path in sorted(PAPERS_DIR.glob("*/scan.json")): @@ -365,24 +368,57 @@ def build(): # Tension classification for claim in claims: ct = claim.get("claim", "").lower() - if any(k in ct for k in ["productivity", "developer speed", "completion time", "speedup", "faster"]): - bucket = "positive" if any(k in ct for k in ["faster", "speedup", "improves", "increases", "gain"]) else "nuanced" - tensions["productivity"][bucket].append({ - "paper_id": paper_id, "claim": claim["claim"], - "supported": claim.get("supported", ""), "score": score_pct, "year": year, - }) - if any(k in ct for k in ["benchmark", "evaluation", "leaderboard", "swe-bench"]): - bucket = "positive" if any(k in ct for k in ["state-of-the-art", "outperforms", "achieves", "best"]) else "nuanced" - tensions["benchmarks"][bucket].append({ - "paper_id": paper_id, "claim": claim["claim"], - "supported": claim.get("supported", ""), "score": score_pct, "year": year, - }) - if any(k in ct for k in ["agent", "autonomous", "multi-agent"]): - bucket = "positive" if any(k in ct for k in ["solves", "achieves", "succeeds", "capable", "outperforms"]) else "nuanced" - tensions["agents"][bucket].append({ - "paper_id": paper_id, "claim": claim["claim"], - "supported": claim.get("supported", ""), "score": score_pct, "year": year, - }) + entry = {"paper_id": paper_id, "claim": claim["claim"], + "supported": claim.get("supported", ""), "score": score_pct, "year": year} + + # Productivity + if any(k in ct for k in ["productivity", "developer speed", "completion time", "speedup", + "faster", "developer productivity", "coding efficiency", + "development time", "time savings", "code faster"]): + bucket = "positive" if any(k in ct for k in ["faster", "speedup", "improves", "increases", + 
"gain", "savings", "efficient"]) else "nuanced" + tensions["productivity"][bucket].append(entry) + + # Benchmarks (expanded) + if any(k in ct for k in ["benchmark", "evaluation", "leaderboard", "swe-bench", + "pass@", "accuracy", "f1 score", "performance on", + "state-of-the-art", "sota"]): + bucket = "positive" if any(k in ct for k in ["state-of-the-art", "outperforms", "achieves", + "best", "surpasses", "exceeds"]) else "nuanced" + tensions["benchmarks"][bucket].append(entry) + + # Agents (expanded) + if any(k in ct for k in ["agent", "autonomous", "multi-agent", "agentic", + "tool use", "planning", "chain-of-thought", + "reasoning capability"]): + bucket = "positive" if any(k in ct for k in ["solves", "achieves", "succeeds", "capable", + "outperforms", "enables", "improves"]) else "nuanced" + tensions["agents"][bucket].append(entry) + + # Security arms race (NEW) + if any(k in ct for k in ["attack", "defense", "jailbreak", "injection", "adversarial", + "vulnerability", "safety", "alignment", "harmful", "toxic", + "secure", "exploit", "bypass", "mitigat"]): + bucket = "positive" if any(k in ct for k in ["defense", "protect", "mitigat", "detect", + "prevent", "secure", "effective", "reduces", + "robust"]) else "nuanced" + tensions["security"][bucket].append(entry) + + # Code quality (NEW) + if any(k in ct for k in ["code quality", "bug", "vulnerability", "error", "defect", + "repair", "fix", "correct", "hallucin", "incorrect code", + "insecure code", "code generation"]): + bucket = "positive" if any(k in ct for k in ["repair", "fix", "correct", "improve", + "reduc", "effective", "resolve"]) else "nuanced" + tensions["code_quality"][bucket].append(entry) + + # Scaling debate (NEW) + if any(k in ct for k in ["scaling", "scale", "cost", "efficient", "latency", + "token", "compute", "inference", "smaller model", + "distill", "compress"]): + bucket = "positive" if any(k in ct for k in ["efficient", "reduc", "cheaper", "faster", + "smaller", "compet", "saving"]) else 
"nuanced" + tensions["scaling"][bucket].append(entry) cat_scores_pct = {k: round(v * 100, 1) for k, v in cat_scores.items()}

Impressum · Datenschutz