ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

commit a2c488b4b161129d19bc4aff0445e74a4c93407f
parent 59c5b1043da1db314c2da2b0d833733c9fe627f5
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon, 23 Mar 2026 11:03:05 +0100

Add reproducibility funnel, methodology treemap, two cultures scatter

Three new visualizations in findings:

Reproducibility funnel: 745 → 400 (code) → 351 (data) → 61 (env) →
49 (instructions). The cliff at environment specs is where
reproducibility collapses — 90% of code-releasing papers stop there.

Methodology landscape treemap: proportional blocks sized by paper
count, colored by mean score. Benchmark-eval dominates (561 papers),
RCTs score highest (64.3%), case studies lowest (39.5%).

Two cultures scatter: human_studies vs artifacts score for 80 papers
with human subjects. Negatively correlated (r=-0.24) — CS researchers
release code but skip IRB; psychology researchers do ethics review
but don't release data. Four quadrants labeled.

Also: 3 new v2 scans (Codex 71.7%, CoT 56.6%, ReAct 48.2%) and
Agents of Chaos rescan (47.5%).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M explorer/src/style.css | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
M explorer/src/views/findings.ts | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M explorer/tests/explorer.spec.ts | 6 +++---
M scripts/build-explorer-data.py | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 235 insertions(+), 3 deletions(-)

diff --git a/explorer/src/style.css b/explorer/src/style.css @@ -530,6 +530,55 @@ td.score { .toggle-btn:hover { border-color: var(--text-dim); } .toggle-btn.active { background: rgba(108, 140, 255, 0.08); } +/* Reproducibility funnel */ +.funnel-step { margin-bottom: 0.4rem; } +.funnel-label { + display: flex; + justify-content: space-between; + font-size: 0.82rem; + margin-bottom: 2px; +} +.funnel-track { + height: 24px; + background: var(--border); + border-radius: 3px; + overflow: hidden; +} +.funnel-fill { + height: 100%; + border-radius: 3px; + transition: width 0.3s; +} + +/* Methodology treemap */ +.treemap { + display: flex; + flex-wrap: wrap; + gap: 4px; + min-height: 80px; +} +.treemap-cell { + border-radius: 6px; + padding: 0.6rem 0.75rem; + min-height: 60px; + display: flex; + flex-direction: column; + justify-content: center; + flex-grow: 1; +} +.treemap-label { + font-size: 0.82rem; + font-weight: 600; + color: #fff; + text-shadow: 0 1px 2px rgba(0,0,0,0.4); +} +.treemap-value { + font-size: 0.72rem; + color: rgba(255,255,255,0.8); + font-family: var(--font); + text-shadow: 0 1px 2px rgba(0,0,0,0.4); +} + /* DNA strip (paper profile in table) */ .dna-strip { display: inline-flex; diff --git a/explorer/src/views/findings.ts b/explorer/src/views/findings.ts @@ -46,6 +46,9 @@ export async function renderFindings(app: HTMLElement) { ${renderBenchmarkMonoculture(f)} ${renderFundingGap(f)} ${renderReproDetail(f)} + ${renderReproFunnel(f)} + ${renderTagTreemap(f)} + ${renderTwoCultures(f)} ${renderNetworkInsights(f)} ${renderGames(f)} `; @@ -526,6 +529,118 @@ function renderReproDetail(f: Findings): string { </div>`; } +function renderReproFunnel(f: Findings): string { + const funnel = (f.repro_detail as any).funnel as { step: string; n: number }[]; + if (!funnel || !funnel.length) return ''; + const max = funnel[0].n; + + return `<div class="section"> + <h2>Reproducibility Funnel</h2> + <p 
style="font-size:0.85rem;color:var(--text-dim);margin-bottom:1rem">Each step filters papers that pass ALL previous criteria. The cliff at "Environment specified" is where reproducibility collapses.</p> + ${funnel.map((step, i) => { + const pct = (step.n / max * 100).toFixed(1); + const lost = i > 0 ? funnel[i - 1].n - step.n : 0; + const color = step.n / max > 0.5 ? 'var(--green)' : step.n / max > 0.1 ? 'var(--yellow)' : 'var(--red)'; + return `<div class="funnel-step"> + <div class="funnel-label"> + <span>${step.step}</span> + <span style="font-family:var(--font)">${step.n} <span style="color:var(--text-dim)">(${pct}%)</span>${i > 0 ? ` <span style="color:var(--red);font-size:0.75rem">\u2212${lost}</span>` : ''}</span> + </div> + <div class="funnel-track"><div class="funnel-fill" style="width:${pct}%;background:${color}"></div></div> + </div>`; + }).join('')} + </div>`; +} + +function renderTagTreemap(f: Findings): string { + const tags = (f as any).tag_treemap as { tag: string; n: number; mean: number }[]; + if (!tags || !tags.length) return ''; + const totalPapers = tags.reduce((s, t) => s + t.n, 0); + + // Render as proportional blocks + return `<div class="section"> + <h2>Methodology Landscape</h2> + <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:1rem">Paper corpus by methodology type. Size = paper count. Color = mean score.</p> + <div class="treemap"> + ${tags.map(t => { + const pct = (t.n / totalPapers * 100); + const color = t.mean < 42 ? 'var(--red)' : t.mean < 52 ? 'var(--yellow)' : t.mean < 58 ? 
'var(--accent)' : 'var(--green)'; + // Min width for small tags + const width = Math.max(pct, 6); + return `<div class="treemap-cell" style="flex-basis:${width}%;background:${color}" title="${formatName(t.tag)}: ${t.n} papers, mean ${t.mean}%"> + <div class="treemap-label">${formatName(t.tag)}</div> + <div class="treemap-value">${t.n} (${t.mean}%)</div> + </div>`; + }).join('')} + </div> + </div>`; +} + +function renderTwoCultures(f: Findings): string { + const papers = (f as any).two_cultures as { human_studies: number; artifacts: number; id: string; score: number }[]; + if (!papers || papers.length < 10) return ''; + + // Compute quadrant counts + const q = { hh: 0, hl: 0, lh: 0, ll: 0 }; + for (const p of papers) { + const hs = p.human_studies >= 50; + const ar = p.artifacts >= 50; + if (hs && ar) q.hh++; + else if (hs && !ar) q.hl++; + else if (!hs && ar) q.lh++; + else q.ll++; + } + + // SVG scatter + const w = 500, h = 400; + const pad = { l: 60, r: 20, t: 20, b: 50 }; + const cw = w - pad.l - pad.r, ch = h - pad.t - pad.b; + const xScale = (v: number) => pad.l + (v / 100) * cw; + const yScale = (v: number) => pad.t + ch - (v / 100) * ch; + + let dots = ''; + for (const p of papers) { + const cx = xScale(p.artifacts); + const cy = yScale(p.human_studies); + const color = p.score < 40 ? '#f06565' : p.score < 55 ? 
'#f0c050' : '#3dd68c'; + dots += `<circle cx="${cx}" cy="${cy}" r="5" fill="${color}" opacity="0.6"> + <title>${p.id}: artifacts ${p.artifacts}%, human_studies ${p.human_studies}%, score ${p.score}%</title> + </circle>`; + } + + // Quadrant labels + const midX = xScale(50), midY = yScale(50); + const quadrants = ` + <line x1="${midX}" y1="${pad.t}" x2="${midX}" y2="${h - pad.b}" stroke="var(--border)" stroke-dasharray="4"/> + <line x1="${pad.l}" y1="${midY}" x2="${w - pad.r}" y2="${midY}" stroke="var(--border)" stroke-dasharray="4"/> + <text x="${xScale(25)}" y="${yScale(80)}" text-anchor="middle" font-size="10" fill="var(--text-dim)">CS tradition only (${q.hl})</text> + <text x="${xScale(75)}" y="${yScale(80)}" text-anchor="middle" font-size="10" fill="var(--green)">Both traditions (${q.hh})</text> + <text x="${xScale(25)}" y="${yScale(20)}" text-anchor="middle" font-size="10" fill="var(--red)">Neither (${q.ll})</text> + <text x="${xScale(75)}" y="${yScale(20)}" text-anchor="middle" font-size="10" fill="var(--text-dim)">Psych tradition only (${q.lh})</text> + `; + + // Axes + const axes = ` + <text x="${w / 2}" y="${h - 5}" text-anchor="middle" fill="var(--text-dim)" font-size="11">Artifacts Score \u2192</text> + <text x="12" y="${h / 2}" text-anchor="middle" fill="var(--text-dim)" font-size="11" transform="rotate(-90, 12, ${h / 2})">Human Studies Score \u2192</text> + `; + + // Grid + let grid = ''; + for (let v = 0; v <= 100; v += 25) { + grid += `<text x="${pad.l - 8}" y="${yScale(v) + 4}" text-anchor="end" font-size="10">${v}%</text>`; + grid += `<text x="${xScale(v)}" y="${h - pad.b + 15}" text-anchor="middle" font-size="10">${v}%</text>`; + } + + return `<div class="section"> + <h2>Two Cultures</h2> + <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:0.5rem">Papers with human subjects (n=${papers.length}): human_studies score vs artifacts score. These two dimensions are <strong>negatively correlated</strong> (r=\u22120.24). 
CS-trained researchers release code but skip IRB; psychology-trained researchers do ethics review but don't release data.</p> + <svg viewBox="0 0 ${w} ${h}" style="width:100%;max-width:${w}px"> + ${grid}${quadrants}${dots}${axes} + </svg> + </div>`; +} + function renderNetworkInsights(f: Findings): string { const ni = (f as any).network_insights; if (!ni) return ''; diff --git a/explorer/tests/explorer.spec.ts b/explorer/tests/explorer.spec.ts @@ -6,7 +6,7 @@ test.describe('Dashboard', () => { await expect(page.locator('.card .value').first()).toBeVisible({ timeout: 10000 }); const cards = page.locator('.card'); await expect(cards).toHaveCount(4); - await expect(cards.nth(0).locator('.value')).toHaveText('744'); + await expect(cards.nth(0).locator('.value')).toHaveText('745'); await expect(cards.nth(1).locator('.value')).toHaveText('48.1%'); }); @@ -206,8 +206,8 @@ test.describe('Findings', () => { test('loads and shows all sections', async ({ page }) => { await page.goto('/#/findings'); await expect(page.locator('.section').first()).toBeVisible({ timeout: 10000 }); - // Should have 13 sections - expect(await page.locator('.section').count()).toBe(13); + // Should have 16 sections + expect(await page.locator('.section').count()).toBe(16); }); test('shows per-question pass rates', async ({ page }) => { diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py @@ -651,6 +651,72 @@ def build(): repro_detail["full_pass_count"] = repro_count repro_detail["full_pass_pct"] = round(repro_count / total_papers * 100, 1) if total_papers else 0 + # 9b. 
Reproducibility funnel — cascading filter + repro_funnel = [] + step_papers = set(p["id"] for p in papers_full) + repro_funnel.append({"step": "All papers", "n": len(step_papers)}) + for q_name, label in [ + ("code_released", "Code released"), + ("data_released", "Data released"), + ("environment_specified", "Environment specified"), + ("reproduction_instructions", "Reproduction instructions"), + ]: + next_set = set() + for p in papers_full: + if p["id"] not in step_papers: + continue + for item in p["checklist"]: + if item["category"] == "artifacts" and item["question"] == q_name: + if item["applies"] and item["answer"]: + next_set.add(p["id"]) + elif not item["applies"]: + next_set.add(p["id"]) # N/A doesn't filter out + break + step_papers = next_set + repro_funnel.append({"step": label, "n": len(step_papers)}) + repro_detail["funnel"] = repro_funnel + + # 9c. Methodology tag treemap + tag_treemap = [] + tag_score_map = defaultdict(list) + for p in papers_full: + for t in p["tags"]: + tag_score_map[t].append(p["score"]) + for t, scores in tag_score_map.items(): + tag_treemap.append({ + "tag": t, + "n": len(scores), + "mean": safe_mean(scores), + }) + tag_treemap.sort(key=lambda x: -x["n"]) + + # 9d. 
Two cultures / three clusters + # Cluster definitions based on correlation analysis + cluster_defs = { + "Transparency & Artifacts": ["artifacts", "setup_transparency", "data_integrity"], + "Statistical & Experimental Rigor": ["statistical_methodology", "experimental_rigor", "claims_and_evidence"], + "Contamination Awareness": ["contamination", "data_leakage"], + } + # Compute mean score per cluster per paper, then inter-cluster correlations + cluster_vectors = defaultdict(list) # cluster_name -> [mean_scores] + for p in papers_full: + cs = p["category_scores"] + for cname, cats in cluster_defs.items(): + vals = [cs[c] for c in cats if c in cs] + if vals: + cluster_vectors[cname].append(sum(vals) / len(vals)) + else: + cluster_vectors[cname].append(None) + + # Two cultures: compute overlap between human_studies and artifacts/setup + two_cultures_papers = [] + for p in papers_full: + cs = p["category_scores"] + hs = cs.get("human_studies") + art = cs.get("artifacts") + if hs is not None and art is not None: + two_cultures_papers.append({"human_studies": hs, "artifacts": art, "id": p["id"], "score": p["score"]}) + # 10. Category correlation matrix # Collect per-paper category score vectors paper_cat_vectors = [] @@ -802,6 +868,8 @@ def build(): "game_pcts": game_pcts, "correlation": correlation, "pca": pca_result, + "tag_treemap": tag_treemap, + "two_cultures": two_cultures_papers, } # --- Citation network (built from cited_papers in scan.json) ---

Impressum · Datenschutz