Add reproducibility funnel, methodology treemap, two cultures scatter - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

commit a2c488b4b161129d19bc4aff0445e74a4c93407f
parent 59c5b1043da1db314c2da2b0d833733c9fe627f5
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon, 23 Mar 2026 11:03:05 +0100

Add reproducibility funnel, methodology treemap, two cultures scatter

Three new visualizations in findings:

Reproducibility funnel: 745 → 400 (code) → 351 (data) → 61 (env) →
49 (instructions). The cliff at environment specs is where
reproducibility collapses — about 85% of code-releasing papers
(339 of 400) never get past it.

Methodology landscape treemap: proportional blocks sized by paper
count, colored by mean score. Benchmark-eval dominates (561 papers),
RCTs score highest (64.3%), case studies lowest (39.5%).

Two cultures scatter: human_studies vs artifacts score for 80 papers
with human subjects. Negatively correlated (r=-0.24) — CS researchers
release code but skip IRB; psychology researchers do ethics review
but don't release data. Four quadrants labeled.

Also: 3 new v2 scans (Codex 71.7%, CoT 56.6%, ReAct 48.2%) and
Agents of Chaos rescan (47.5%).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M explorer/src/style.css  | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
M explorer/src/views/findings.ts  | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M explorer/tests/explorer.spec.ts  | 6 +++---
M scripts/build-explorer-data.py  | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

4 files changed, 235 insertions(+), 3 deletions(-)
diff --git a/explorer/src/style.css b/explorer/src/style.css
@@ -530,6 +530,55 @@ td.score {
 .toggle-btn:hover { border-color: var(--text-dim); }
 .toggle-btn.active { background: rgba(108, 140, 255, 0.08); }
 
+/* Reproducibility funnel: a label row above a fixed-height track; the fill's width and colour are set inline per step */
+.funnel-step { margin-bottom: 0.4rem; }
+.funnel-label {
+  display: flex;
+  justify-content: space-between; /* step name on the left, count and percentage on the right */
+  font-size: 0.82rem;
+  margin-bottom: 2px;
+}
+.funnel-track {
+  height: 24px;
+  background: var(--border);
+  border-radius: 3px;
+  overflow: hidden; /* clips the fill to the track's rounded corners */
+}
+.funnel-fill {
+  height: 100%;
+  border-radius: 3px;
+  transition: width 0.3s; /* animates any change to the inline width */
+}
+
+/* Methodology treemap: a wrapping flex row of cells; each cell's flex-basis and background are set inline */
+.treemap {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 4px;
+  min-height: 80px;
+}
+.treemap-cell {
+  border-radius: 6px;
+  padding: 0.6rem 0.75rem;
+  min-height: 60px;
+  display: flex;
+  flex-direction: column;
+  justify-content: center;
+  flex-grow: 1; /* cells stretch to absorb leftover space in their row, so rendered widths are only approximately proportional */
+}
+.treemap-label {
+  font-size: 0.82rem;
+  font-weight: 600;
+  color: #fff;
+  text-shadow: 0 1px 2px rgba(0,0,0,0.4); /* presumably for legibility of white text over the coloured cell backgrounds */
+}
+.treemap-value {
+  font-size: 0.72rem;
+  color: rgba(255,255,255,0.8);
+  font-family: var(--font);
+  text-shadow: 0 1px 2px rgba(0,0,0,0.4);
+}
+
 /* DNA strip (paper profile in table) */
 .dna-strip {
   display: inline-flex;
diff --git a/explorer/src/views/findings.ts b/explorer/src/views/findings.ts
@@ -46,6 +46,9 @@ export async function renderFindings(app: HTMLElement) {
     ${renderBenchmarkMonoculture(f)}
     ${renderFundingGap(f)}
     ${renderReproDetail(f)}
+    ${renderReproFunnel(f)}
+    ${renderTagTreemap(f)}
+    ${renderTwoCultures(f)}
     ${renderNetworkInsights(f)}
     ${renderGames(f)}
   `;
@@ -526,6 +529,118 @@ function renderReproDetail(f: Findings): string {
   </div>`;
 }
 
+/**
+ * Renders the "Reproducibility Funnel" section: one labelled bar per entry of
+ * `f.repro_detail.funnel`, each scaled against the first entry's count.
+ *
+ * @param f Findings payload; only `repro_detail.funnel` is read here.
+ * @returns Section HTML, or '' when the funnel is missing or empty.
+ */
+function renderReproFunnel(f: Findings): string {
+  const funnel = (f.repro_detail as any).funnel as { step: string; n: number }[];
+  if (!funnel || !funnel.length) return '';
+  const max = funnel[0].n;
+
+  return `<div class="section">
+    <h2>Reproducibility Funnel</h2>
+    <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:1rem">Each step filters papers that pass ALL previous criteria. The cliff at "Environment specified" is where reproducibility collapses.</p>
+    ${funnel.map((step, i) => {
+      // Share of the first step. With an empty corpus max is 0; fall back to 0
+      // rather than letting 0/0 put "NaN%" into the label and the bar width.
+      const share = max > 0 ? step.n / max : 0;
+      const pct = (share * 100).toFixed(1);
+      // Papers dropped relative to the previous step (shown from the second row on).
+      const lost = i > 0 ? funnel[i - 1].n - step.n : 0;
+      const color = share > 0.5 ? 'var(--green)' : share > 0.1 ? 'var(--yellow)' : 'var(--red)';
+      return `<div class="funnel-step">
+        <div class="funnel-label">
+          <span>${step.step}</span>
+          <span style="font-family:var(--font)">${step.n} <span style="color:var(--text-dim)">(${pct}%)</span>${i > 0 ? ` <span style="color:var(--red);font-size:0.75rem">\u2212${lost}</span>` : ''}</span>
+        </div>
+        <div class="funnel-track"><div class="funnel-fill" style="width:${pct}%;background:${color}"></div></div>
+      </div>`;
+    }).join('')}
+  </div>`;
+}
+
+/**
+ * Renders the "Methodology Landscape" section: one coloured flex cell per entry
+ * of `f.tag_treemap`. A cell's flex-basis is its share of the summed tag counts
+ * (never below 6%) and its background is picked from the entry's mean score.
+ * Returns '' when the list is missing or empty.
+ */
+function renderTagTreemap(f: Findings): string {
+  const entries = (f as any).tag_treemap as { tag: string; n: number; mean: number }[];
+  if (!entries || !entries.length) return '';
+
+  // Denominator: sum of the per-tag counts.
+  let countSum = 0;
+  for (const entry of entries) countSum += entry.n;
+
+  // Mean-score bands: below 42 red, below 52 yellow, below 58 accent, else green.
+  const bandColor = (mean: number): string => {
+    if (mean < 42) return 'var(--red)';
+    if (mean < 52) return 'var(--yellow)';
+    if (mean < 58) return 'var(--accent)';
+    return 'var(--green)';
+  };
+
+  const cells = entries.map(entry => {
+    const share = entry.n / countSum * 100;
+    // Floor the basis at 6% so tags with few papers still get a visible cell.
+    const basis = Math.max(share, 6);
+    return `<div class="treemap-cell" style="flex-basis:${basis}%;background:${bandColor(entry.mean)}" title="${formatName(entry.tag)}: ${entry.n} papers, mean ${entry.mean}%">
+          <div class="treemap-label">${formatName(entry.tag)}</div>
+          <div class="treemap-value">${entry.n} (${entry.mean}%)</div>
+        </div>`;
+  });
+
+  return `<div class="section">
+    <h2>Methodology Landscape</h2>
+    <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:1rem">Paper corpus by methodology type. Size = paper count. Color = mean score.</p>
+    <div class="treemap">
+      ${cells.join('')}
+    </div>
+  </div>`;
+}
+
+/**
+ * Renders the "Two Cultures" section: an SVG scatter of `f.two_cultures` with
+ * the artifacts score on the x axis and the human_studies score on the y axis,
+ * split into four labelled quadrants at the 50% lines.
+ *
+ * @param f Findings payload; only `two_cultures` is read here.
+ * @returns Section HTML, or '' when fewer than 10 papers are available.
+ */
+function renderTwoCultures(f: Findings): string {
+  const papers = (f as any).two_cultures as { human_studies: number; artifacts: number; id: string; score: number }[];
+  if (!papers || papers.length < 10) return '';
+
+  // Quadrant counts. Key is <human_studies><artifacts>: h = score >= 50, l = below 50.
+  const q = { hh: 0, hl: 0, lh: 0, ll: 0 };
+  for (const p of papers) {
+    const hs = p.human_studies >= 50;
+    const ar = p.artifacts >= 50;
+    if (hs && ar) q.hh++;
+    else if (hs && !ar) q.hl++;
+    else if (!hs && ar) q.lh++;
+    else q.ll++;
+  }
+
+  // Plot geometry: both scales map the 0..100 range onto the padded plot area;
+  // y is inverted so larger human_studies values sit higher up.
+  const w = 500, h = 400;
+  const pad = { l: 60, r: 20, t: 20, b: 50 };
+  const cw = w - pad.l - pad.r, ch = h - pad.t - pad.b;
+  const xScale = (v: number) => pad.l + (v / 100) * cw;
+  const yScale = (v: number) => pad.t + ch - (v / 100) * ch;
+
+  // One dot per paper, coloured by overall score, with a hover tooltip.
+  let dots = '';
+  for (const p of papers) {
+    const cx = xScale(p.artifacts);
+    const cy = yScale(p.human_studies);
+    const color = p.score < 40 ? '#f06565' : p.score < 55 ? '#f0c050' : '#3dd68c';
+    dots += `<circle cx="${cx}" cy="${cy}" r="5" fill="${color}" opacity="0.6">
+      <title>${p.id}: artifacts ${p.artifacts}%, human_studies ${p.human_studies}%, score ${p.score}%</title>
+    </circle>`;
+  }
+
+  // Quadrant guides and captions. x grows with artifacts and y with human_studies,
+  // so the top-left holds high human_studies / low artifacts (q.hl) and the
+  // bottom-right holds high artifacts / low human_studies (q.lh). Per the section
+  // blurb below, the former is the psychology pattern (ethics review, no released
+  // data) and the latter the CS pattern (released code, no IRB).
+  const midX = xScale(50), midY = yScale(50);
+  const quadrants = `
+    <line x1="${midX}" y1="${pad.t}" x2="${midX}" y2="${h - pad.b}" stroke="var(--border)" stroke-dasharray="4"/>
+    <line x1="${pad.l}" y1="${midY}" x2="${w - pad.r}" y2="${midY}" stroke="var(--border)" stroke-dasharray="4"/>
+    <text x="${xScale(25)}" y="${yScale(80)}" text-anchor="middle" font-size="10" fill="var(--text-dim)">Psych tradition only (${q.hl})</text>
+    <text x="${xScale(75)}" y="${yScale(80)}" text-anchor="middle" font-size="10" fill="var(--green)">Both traditions (${q.hh})</text>
+    <text x="${xScale(25)}" y="${yScale(20)}" text-anchor="middle" font-size="10" fill="var(--red)">Neither (${q.ll})</text>
+    <text x="${xScale(75)}" y="${yScale(20)}" text-anchor="middle" font-size="10" fill="var(--text-dim)">CS tradition only (${q.lh})</text>
+  `;
+
+  // Axis titles.
+  const axes = `
+    <text x="${w / 2}" y="${h - 5}" text-anchor="middle" fill="var(--text-dim)" font-size="11">Artifacts Score \u2192</text>
+    <text x="12" y="${h / 2}" text-anchor="middle" fill="var(--text-dim)" font-size="11" transform="rotate(-90, 12, ${h / 2})">Human Studies Score \u2192</text>
+  `;
+
+  // Tick labels every 25%. The fill is explicit because SVG text otherwise
+  // defaults to black; this matches the colour used for the axis titles.
+  let grid = '';
+  for (let v = 0; v <= 100; v += 25) {
+    grid += `<text x="${pad.l - 8}" y="${yScale(v) + 4}" text-anchor="end" font-size="10" fill="var(--text-dim)">${v}%</text>`;
+    grid += `<text x="${xScale(v)}" y="${h - pad.b + 15}" text-anchor="middle" font-size="10" fill="var(--text-dim)">${v}%</text>`;
+  }
+
+  return `<div class="section">
+    <h2>Two Cultures</h2>
+    <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:0.5rem">Papers with human subjects (n=${papers.length}): human_studies score vs artifacts score. These two dimensions are <strong>negatively correlated</strong> (r=\u22120.24). CS-trained researchers release code but skip IRB; psychology-trained researchers do ethics review but don't release data.</p>
+    <svg viewBox="0 0 ${w} ${h}" style="width:100%;max-width:${w}px">
+      ${grid}${quadrants}${dots}${axes}
+    </svg>
+  </div>`;
+}
+
 function renderNetworkInsights(f: Findings): string {
   const ni = (f as any).network_insights;
   if (!ni) return '';
diff --git a/explorer/tests/explorer.spec.ts b/explorer/tests/explorer.spec.ts
@@ -6,7 +6,7 @@ test.describe('Dashboard', () => {
     await expect(page.locator('.card .value').first()).toBeVisible({ timeout: 10000 });
     const cards = page.locator('.card');
     await expect(cards).toHaveCount(4);
-    await expect(cards.nth(0).locator('.value')).toHaveText('744');
+    await expect(cards.nth(0).locator('.value')).toHaveText('745');
     await expect(cards.nth(1).locator('.value')).toHaveText('48.1%');
   });
 
@@ -206,8 +206,8 @@ test.describe('Findings', () => {
   test('loads and shows all sections', async ({ page }) => {
     await page.goto('/#/findings');
     await expect(page.locator('.section').first()).toBeVisible({ timeout: 10000 });
-    // Should have 13 sections
-    expect(await page.locator('.section').count()).toBe(13);
+    // Should have 16 sections
+    expect(await page.locator('.section').count()).toBe(16);
   });
 
   test('shows per-question pass rates', async ({ page }) => {
diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py
@@ -651,6 +651,72 @@ def build():
     repro_detail["full_pass_count"] = repro_count
     repro_detail["full_pass_pct"] = round(repro_count / total_papers * 100, 1) if total_papers else 0
 
+    # 9b. Reproducibility funnel — cascading filter
+    repro_funnel = []
+    step_papers = set(p["id"] for p in papers_full)
+    repro_funnel.append({"step": "All papers", "n": len(step_papers)})
+    for q_name, label in [
+        ("code_released", "Code released"),
+        ("data_released", "Data released"),
+        ("environment_specified", "Environment specified"),
+        ("reproduction_instructions", "Reproduction instructions"),
+    ]:
+        next_set = set()
+        for p in papers_full:
+            if p["id"] not in step_papers:
+                continue
+            for item in p["checklist"]:
+                if item["category"] == "artifacts" and item["question"] == q_name:
+                    if item["applies"] and item["answer"]:
+                        next_set.add(p["id"])
+                    elif not item["applies"]:
+                        next_set.add(p["id"])  # N/A doesn't filter out
+                    break
+        step_papers = next_set
+        repro_funnel.append({"step": label, "n": len(step_papers)})
+    repro_detail["funnel"] = repro_funnel
+
+    # 9c. Methodology tag treemap
+    tag_treemap = []
+    tag_score_map = defaultdict(list)
+    for p in papers_full:
+        for t in p["tags"]:
+            tag_score_map[t].append(p["score"])
+    for t, scores in tag_score_map.items():
+        tag_treemap.append({
+            "tag": t,
+            "n": len(scores),
+            "mean": safe_mean(scores),
+        })
+    tag_treemap.sort(key=lambda x: -x["n"])
+
+    # 9d. Two cultures / three clusters
+    # Cluster definitions based on correlation analysis
+    cluster_defs = {
+        "Transparency & Artifacts": ["artifacts", "setup_transparency", "data_integrity"],
+        "Statistical & Experimental Rigor": ["statistical_methodology", "experimental_rigor", "claims_and_evidence"],
+        "Contamination Awareness": ["contamination", "data_leakage"],
+    }
+    # Compute mean score per cluster per paper, then inter-cluster correlations
+    cluster_vectors = defaultdict(list)  # cluster_name -> [mean_scores]
+    for p in papers_full:
+        cs = p["category_scores"]
+        for cname, cats in cluster_defs.items():
+            vals = [cs[c] for c in cats if c in cs]
+            if vals:
+                cluster_vectors[cname].append(sum(vals) / len(vals))
+            else:
+                cluster_vectors[cname].append(None)
+
+    # Two cultures: compute overlap between human_studies and artifacts/setup
+    two_cultures_papers = []
+    for p in papers_full:
+        cs = p["category_scores"]
+        hs = cs.get("human_studies")
+        art = cs.get("artifacts")
+        if hs is not None and art is not None:
+            two_cultures_papers.append({"human_studies": hs, "artifacts": art, "id": p["id"], "score": p["score"]})
+
     # 10. Category correlation matrix
     # Collect per-paper category score vectors
     paper_cat_vectors = []
@@ -802,6 +868,8 @@ def build():
         "game_pcts": game_pcts,
         "correlation": correlation,
         "pca": pca_result,
+        "tag_treemap": tag_treemap,
+        "two_cultures": two_cultures_papers,
     }
 
     # --- Citation network (built from cited_papers in scan.json) ---

	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs

M	explorer/src/style.css	\|	49	+++++++++++++++++++++++++++++++++++++++++++++++++
M	explorer/src/views/findings.ts	\|	115	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	explorer/tests/explorer.spec.ts	\|	6	+++---
M	scripts/build-explorer-data.py	\|	68	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++