commit a2c488b4b161129d19bc4aff0445e74a4c93407f
parent 59c5b1043da1db314c2da2b0d833733c9fe627f5
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Mon, 23 Mar 2026 11:03:05 +0100
Add reproducibility funnel, methodology treemap, two cultures scatter
Three new visualizations in findings:
Reproducibility funnel: 745 → 400 (code) → 351 (data) → 61 (env) →
49 (instructions). The cliff at environment specs is where
reproducibility collapses — only 61 of the 400 code-releasing papers
(~15%) make it past that step.
Methodology landscape treemap: proportional blocks sized by paper
count, colored by mean score. Benchmark-eval dominates (561 papers),
RCTs score highest (64.3%), case studies lowest (39.5%).
Two cultures scatter: human_studies vs artifacts score for 80 papers
with human subjects. Negatively correlated (r=-0.24) — CS researchers
release code but skip IRB; psychology researchers do ethics review
but don't release data. Four quadrants labeled.
Also: 3 new v2 scans (Codex 71.7%, CoT 56.6%, ReAct 48.2%) and
Agents of Chaos rescan (47.5%).
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
4 files changed, 235 insertions(+), 3 deletions(-)
diff --git a/explorer/src/style.css b/explorer/src/style.css
@@ -530,6 +530,55 @@ td.score {
.toggle-btn:hover { border-color: var(--text-dim); }
.toggle-btn.active { background: rgba(108, 140, 255, 0.08); }
+/* Reproducibility funnel: each .funnel-step stacks a label row above a fixed-height bar track */
+.funnel-step { margin-bottom: 0.4rem; }
+.funnel-label {
+ display: flex;
+ justify-content: space-between; /* pushes the row's first and last child to opposite ends */
+ font-size: 0.82rem;
+ margin-bottom: 2px;
+}
+.funnel-track {
+ height: 24px;
+ background: var(--border);
+ border-radius: 3px;
+ overflow: hidden; /* clips the fill to the track's rounded corners */
+}
+.funnel-fill {
+ height: 100%;
+ border-radius: 3px;
+ transition: width 0.3s; /* animates whenever the inline width changes */
+}
+
+/* Methodology treemap: a wrapping flex row of cells; each cell's flex-basis and background are supplied inline */
+.treemap {
+ display: flex;
+ flex-wrap: wrap;
+ gap: 4px;
+ min-height: 80px;
+}
+.treemap-cell {
+ border-radius: 6px;
+ padding: 0.6rem 0.75rem;
+ min-height: 60px;
+ display: flex;
+ flex-direction: column;
+ justify-content: center; /* vertically centres the label/value pair */
+ flex-grow: 1; /* cells stretch to absorb leftover row space, so widths are only approximately proportional */
+}
+.treemap-label {
+ font-size: 0.82rem;
+ font-weight: 600;
+ color: #fff;
+ text-shadow: 0 1px 2px rgba(0,0,0,0.4); /* presumably for legibility of white text on lighter fills */
+}
+.treemap-value {
+ font-size: 0.72rem;
+ color: rgba(255,255,255,0.8);
+ font-family: var(--font);
+ text-shadow: 0 1px 2px rgba(0,0,0,0.4);
+}
+
+
/* DNA strip (paper profile in table) */
.dna-strip {
display: inline-flex;
diff --git a/explorer/src/views/findings.ts b/explorer/src/views/findings.ts
@@ -46,6 +46,9 @@ export async function renderFindings(app: HTMLElement) {
${renderBenchmarkMonoculture(f)}
${renderFundingGap(f)}
${renderReproDetail(f)}
+ ${renderReproFunnel(f)}
+ ${renderTagTreemap(f)}
+ ${renderTwoCultures(f)}
${renderNetworkInsights(f)}
${renderGames(f)}
`;
@@ -526,6 +529,118 @@ function renderReproDetail(f: Findings): string {
</div>`;
}
+/**
+ * Renders the "Reproducibility Funnel" section: one horizontal bar per funnel
+ * step, sized relative to the first step's paper count.
+ *
+ * Reads `f.repro_detail.funnel`, an ordered list of `{ step, n }` entries.
+ * Every step after the first also shows how many papers were lost compared
+ * with the step before it.
+ *
+ * @param f - Findings payload.
+ * @returns The section HTML, or '' when the funnel is missing or empty.
+ */
+function renderReproFunnel(f: Findings): string {
+  const funnel = (f.repro_detail as any).funnel as { step: string; n: number }[];
+  if (!funnel || !funnel.length) return '';
+  // The first step is the 100% baseline. Fall back to 1 when it is 0 so an
+  // empty corpus renders "0.0%" bars instead of "NaN%" labels and widths.
+  const max = funnel[0].n || 1;
+
+  return `<div class="section">
+    <h2>Reproducibility Funnel</h2>
+    <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:1rem">Each step filters papers that pass ALL previous criteria. The cliff at "Environment specified" is where reproducibility collapses.</p>
+    ${funnel.map((step, i) => {
+      const share = step.n / max;
+      const pct = (share * 100).toFixed(1);
+      // Papers dropped relative to the previous step (not shown for the first step).
+      const lost = i > 0 ? funnel[i - 1].n - step.n : 0;
+      // Bar colour by remaining share: above 50% green, above 10% yellow, otherwise red.
+      const color = share > 0.5 ? 'var(--green)' : share > 0.1 ? 'var(--yellow)' : 'var(--red)';
+      return `<div class="funnel-step">
+        <div class="funnel-label">
+          <span>${step.step}</span>
+          <span style="font-family:var(--font)">${step.n} <span style="color:var(--text-dim)">(${pct}%)</span>${i > 0 ? ` <span style="color:var(--red);font-size:0.75rem">\u2212${lost}</span>` : ''}</span>
+        </div>
+        <div class="funnel-track"><div class="funnel-fill" style="width:${pct}%;background:${color}"></div></div>
+      </div>`;
+    }).join('')}
+  </div>`;
+}
+
+/**
+ * Renders the "Methodology Landscape" section as a row of flex cells, one per
+ * entry of `tag_treemap`.
+ *
+ * A cell's flex-basis is the tag's share of the summed tag counts, floored at
+ * 6%. Its background is chosen from the tag's mean score.
+ *
+ * @param f - Findings payload; `tag_treemap` is read through an untyped cast.
+ * @returns The section HTML, or '' when there are no tags.
+ */
+function renderTagTreemap(f: Findings): string {
+  const entries = (f as any).tag_treemap as { tag: string; n: number; mean: number }[];
+  if (!entries || !entries.length) return '';
+
+  // Denominator for each cell's share: the sum of all per-tag counts.
+  let tagCountSum = 0;
+  for (const entry of entries) {
+    tagCountSum = tagCountSum + entry.n;
+  }
+
+  // Build the proportional blocks, one per tag.
+  const cells: string[] = [];
+  for (const entry of entries) {
+    // Colour bands by mean score: <42 red, <52 yellow, <58 accent, else green.
+    let fill: string;
+    if (entry.mean < 42) {
+      fill = 'var(--red)';
+    } else if (entry.mean < 52) {
+      fill = 'var(--yellow)';
+    } else if (entry.mean < 58) {
+      fill = 'var(--accent)';
+    } else {
+      fill = 'var(--green)';
+    }
+
+    // Floor the basis at 6% so the smallest tags still get a usable cell.
+    const share = (entry.n / tagCountSum * 100);
+    const basis = Math.max(share, 6);
+
+    cells.push(`<div class="treemap-cell" style="flex-basis:${basis}%;background:${fill}" title="${formatName(entry.tag)}: ${entry.n} papers, mean ${entry.mean}%">
+        <div class="treemap-label">${formatName(entry.tag)}</div>
+        <div class="treemap-value">${entry.n} (${entry.mean}%)</div>
+      </div>`);
+  }
+
+  return `<div class="section">
+    <h2>Methodology Landscape</h2>
+    <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:1rem">Paper corpus by methodology type. Size = paper count. Color = mean score.</p>
+    <div class="treemap">
+      ${cells.join('')}
+    </div>
+  </div>`;
+}
+
+/**
+ * Renders the "Two Cultures" section: an SVG scatter plot with one dot per
+ * entry of `two_cultures`, artifacts score on the x-axis and human_studies
+ * score on the y-axis, with dashed dividers at 50% and a count per quadrant.
+ *
+ * Dot colour depends on the paper's overall score (below 40, below 55, or
+ * 55 and above).
+ *
+ * @param f - Findings payload; `two_cultures` is read through an untyped cast.
+ * @returns The section HTML, or '' when fewer than 10 papers are available.
+ */
+function renderTwoCultures(f: Findings): string {
+  const papers = (f as any).two_cultures as { human_studies: number; artifacts: number; id: string; score: number }[];
+  if (!papers || papers.length < 10) return '';
+
+  // Quadrant counts. Key = <human_studies><artifacts>; h means score >= 50, l means below.
+  const q = { hh: 0, hl: 0, lh: 0, ll: 0 };
+  for (const p of papers) {
+    const hs = p.human_studies >= 50;
+    const ar = p.artifacts >= 50;
+    if (hs && ar) q.hh++;
+    else if (hs && !ar) q.hl++;
+    else if (!hs && ar) q.lh++;
+    else q.ll++;
+  }
+
+  // SVG scatter geometry: scores in 0..100 map onto the padded plot area.
+  const w = 500, h = 400;
+  const pad = { l: 60, r: 20, t: 20, b: 50 };
+  const cw = w - pad.l - pad.r, ch = h - pad.t - pad.b;
+  const xScale = (v: number) => pad.l + (v / 100) * cw;
+  // SVG y grows downwards, so higher scores are placed nearer the top.
+  const yScale = (v: number) => pad.t + ch - (v / 100) * ch;
+
+  let dots = '';
+  for (const p of papers) {
+    const cx = xScale(p.artifacts);
+    const cy = yScale(p.human_studies);
+    const color = p.score < 40 ? '#f06565' : p.score < 55 ? '#f0c050' : '#3dd68c';
+    dots += `<circle cx="${cx}" cy="${cy}" r="5" fill="${color}" opacity="0.6">
+      <title>${p.id}: artifacts ${p.artifacts}%, human_studies ${p.human_studies}%, score ${p.score}%</title>
+    </circle>`;
+  }
+
+  // Quadrant dividers and labels. x is artifacts and y is human_studies, so:
+  //   top-left     = ethics review but no artifact release -> "Psych tradition only" (q.hl)
+  //   bottom-right = artifact release but no ethics review -> "CS tradition only"    (q.lh)
+  // This matches the section's explanatory paragraph below.
+  const midX = xScale(50), midY = yScale(50);
+  const quadrants = `
+    <line x1="${midX}" y1="${pad.t}" x2="${midX}" y2="${h - pad.b}" stroke="var(--border)" stroke-dasharray="4"/>
+    <line x1="${pad.l}" y1="${midY}" x2="${w - pad.r}" y2="${midY}" stroke="var(--border)" stroke-dasharray="4"/>
+    <text x="${xScale(25)}" y="${yScale(80)}" text-anchor="middle" font-size="10" fill="var(--text-dim)">Psych tradition only (${q.hl})</text>
+    <text x="${xScale(75)}" y="${yScale(80)}" text-anchor="middle" font-size="10" fill="var(--green)">Both traditions (${q.hh})</text>
+    <text x="${xScale(25)}" y="${yScale(20)}" text-anchor="middle" font-size="10" fill="var(--red)">Neither (${q.ll})</text>
+    <text x="${xScale(75)}" y="${yScale(20)}" text-anchor="middle" font-size="10" fill="var(--text-dim)">CS tradition only (${q.lh})</text>
+  `;
+
+  // Axis titles
+  const axes = `
+    <text x="${w / 2}" y="${h - 5}" text-anchor="middle" fill="var(--text-dim)" font-size="11">Artifacts Score \u2192</text>
+    <text x="12" y="${h / 2}" text-anchor="middle" fill="var(--text-dim)" font-size="11" transform="rotate(-90, 12, ${h / 2})">Human Studies Score \u2192</text>
+  `;
+
+  // Tick labels every 25%. The fill is set explicitly, like the axis titles,
+  // so the ticks do not fall back to the SVG default fill.
+  let grid = '';
+  for (let v = 0; v <= 100; v += 25) {
+    grid += `<text x="${pad.l - 8}" y="${yScale(v) + 4}" text-anchor="end" font-size="10" fill="var(--text-dim)">${v}%</text>`;
+    grid += `<text x="${xScale(v)}" y="${h - pad.b + 15}" text-anchor="middle" font-size="10" fill="var(--text-dim)">${v}%</text>`;
+  }
+
+  // NOTE(review): the correlation quoted below (r=−0.24) is hard-coded text,
+  // not derived from `papers`; it needs a manual update when the data changes.
+  return `<div class="section">
+    <h2>Two Cultures</h2>
+    <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:0.5rem">Papers with human subjects (n=${papers.length}): human_studies score vs artifacts score. These two dimensions are <strong>negatively correlated</strong> (r=\u22120.24). CS-trained researchers release code but skip IRB; psychology-trained researchers do ethics review but don't release data.</p>
+    <svg viewBox="0 0 ${w} ${h}" style="width:100%;max-width:${w}px">
+      ${grid}${quadrants}${dots}${axes}
+    </svg>
+  </div>`;
+}
+
function renderNetworkInsights(f: Findings): string {
const ni = (f as any).network_insights;
if (!ni) return '';
diff --git a/explorer/tests/explorer.spec.ts b/explorer/tests/explorer.spec.ts
@@ -6,7 +6,7 @@ test.describe('Dashboard', () => {
await expect(page.locator('.card .value').first()).toBeVisible({ timeout: 10000 });
const cards = page.locator('.card');
await expect(cards).toHaveCount(4);
- await expect(cards.nth(0).locator('.value')).toHaveText('744');
+ await expect(cards.nth(0).locator('.value')).toHaveText('745');
await expect(cards.nth(1).locator('.value')).toHaveText('48.1%');
});
@@ -206,8 +206,8 @@ test.describe('Findings', () => {
test('loads and shows all sections', async ({ page }) => {
await page.goto('/#/findings');
await expect(page.locator('.section').first()).toBeVisible({ timeout: 10000 });
- // Should have 13 sections
- expect(await page.locator('.section').count()).toBe(13);
+ // Should have 16 sections
+ expect(await page.locator('.section').count()).toBe(16);
});
test('shows per-question pass rates', async ({ page }) => {
diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py
@@ -651,6 +651,72 @@ def build():
repro_detail["full_pass_count"] = repro_count
repro_detail["full_pass_pct"] = round(repro_count / total_papers * 100, 1) if total_papers else 0
+ # 9b. Reproducibility funnel — cascading filter
+ repro_funnel = []
+ step_papers = set(p["id"] for p in papers_full)
+ repro_funnel.append({"step": "All papers", "n": len(step_papers)})
+ for q_name, label in [
+ ("code_released", "Code released"),
+ ("data_released", "Data released"),
+ ("environment_specified", "Environment specified"),
+ ("reproduction_instructions", "Reproduction instructions"),
+ ]:
+ next_set = set()
+ for p in papers_full:
+ if p["id"] not in step_papers:
+ continue
+ for item in p["checklist"]:
+ if item["category"] == "artifacts" and item["question"] == q_name:
+ if item["applies"] and item["answer"]:
+ next_set.add(p["id"])
+ elif not item["applies"]:
+ next_set.add(p["id"]) # N/A doesn't filter out
+ break
+ step_papers = next_set
+ repro_funnel.append({"step": label, "n": len(step_papers)})
+ repro_detail["funnel"] = repro_funnel
+
+ # 9c. Methodology tag treemap
+ tag_treemap = []
+ tag_score_map = defaultdict(list)
+ for p in papers_full:
+ for t in p["tags"]:
+ tag_score_map[t].append(p["score"])
+ for t, scores in tag_score_map.items():
+ tag_treemap.append({
+ "tag": t,
+ "n": len(scores),
+ "mean": safe_mean(scores),
+ })
+ tag_treemap.sort(key=lambda x: -x["n"])
+
+ # 9d. Two cultures / three clusters
+ # Cluster definitions based on correlation analysis
+ cluster_defs = {
+ "Transparency & Artifacts": ["artifacts", "setup_transparency", "data_integrity"],
+ "Statistical & Experimental Rigor": ["statistical_methodology", "experimental_rigor", "claims_and_evidence"],
+ "Contamination Awareness": ["contamination", "data_leakage"],
+ }
+ # Compute mean score per cluster per paper, then inter-cluster correlations
+ cluster_vectors = defaultdict(list) # cluster_name -> [mean_scores]
+ for p in papers_full:
+ cs = p["category_scores"]
+ for cname, cats in cluster_defs.items():
+ vals = [cs[c] for c in cats if c in cs]
+ if vals:
+ cluster_vectors[cname].append(sum(vals) / len(vals))
+ else:
+ cluster_vectors[cname].append(None)
+
+ # Two cultures: compute overlap between human_studies and artifacts/setup
+ two_cultures_papers = []
+ for p in papers_full:
+ cs = p["category_scores"]
+ hs = cs.get("human_studies")
+ art = cs.get("artifacts")
+ if hs is not None and art is not None:
+ two_cultures_papers.append({"human_studies": hs, "artifacts": art, "id": p["id"], "score": p["score"]})
+
# 10. Category correlation matrix
# Collect per-paper category score vectors
paper_cat_vectors = []
@@ -802,6 +868,8 @@ def build():
"game_pcts": game_pcts,
"correlation": correlation,
"pca": pca_result,
+ "tag_treemap": tag_treemap,
+ "two_cultures": two_cultures_papers,
}
# --- Citation network (built from cited_papers in scan.json) ---