commit d240203118b1d2332118fdcb2cfd94594a523da2
parent 1818e336e2cc2445cd1006f83c3fa66c7eec7259
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Sun, 22 Mar 2026 21:33:58 +0100
Add findings view with 10 analysis sections and code URL extraction
New #/findings view surfaces deep analysis that was only in docs:
- Per-question pass rates (67 questions, worst: self_comparison_bias 0.8%)
- Year trends by category with toggleable lines (contamination 29%→7%)
- Venue & citation scoring (500+ cites score below average)
- Optimism-rigor inversion (positive claims from weaker papers)
- Quality homophily (high-quality papers cite high-quality 3x more)
- Sampling effect (median drops as long tail scanned)
- Benchmark monoculture (58% pure benchmark-eval)
- Funding gap (13pp between disclosed/undisclosed)
- Reproducibility drill-down (4.2% fully reproducible)
- All 6 named games (added Cherry-picked Comparisons, All Show No Substance)
Also: extracted 282 code URLs from scan justification text, shown as
"Code" link on paper detail pages.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
9 files changed, 680 insertions(+), 26 deletions(-)
diff --git a/explorer/index.html b/explorer/index.html
@@ -14,6 +14,7 @@
<a href="#/papers">Papers</a>
<a href="#/network">Network</a>
<a href="#/tensions">Tensions</a>
+ <a href="#/findings">Findings</a>
</nav>
<button id="theme-toggle" aria-label="Toggle theme">☀</button>
</header>
diff --git a/explorer/src/components/multi-line-chart.ts b/explorer/src/components/multi-line-chart.ts
@@ -0,0 +1,48 @@
+/** One series drawn by {@link renderMultiLineChart}. */
+export interface LineData {
+  /** Text shown next to the series swatch in the legend. */
+  label: string;
+  /** CSS colour used for the stroke, the point markers and the legend swatch. */
+  color: string;
+  /** Points to plot: `x` is a position on the x-label axis (label index), `y` a value on the fixed 0–100 scale. */
+  points: { x: number; y: number }[];
+}
+
+/** Escapes the characters that are significant in HTML/SVG text and attribute values. */
+function escapeMarkup(value: string): string {
+  return value
+    .replace(/&/g, '&amp;')
+    .replace(/</g, '&lt;')
+    .replace(/>/g, '&gt;')
+    .replace(/"/g, '&quot;');
+}
+
+/**
+ * Renders an SVG line chart with a fixed 0–100% y-axis, followed by an HTML legend.
+ *
+ * @param lines   Series to draw. A series with a single point is drawn as a lone
+ *                marker; a series with no points only appears in the legend.
+ * @param xLabels Labels along the x-axis; a point's `x` is scaled against
+ *                `xLabels.length - 1`.
+ * @param opts    Optional `width` / `height` of the SVG view box (default 600 × 220).
+ *                `yLabel` is accepted but not drawn.
+ * @returns Markup string: one `<svg>` element and one `.chart-legend` `<div>`.
+ */
+export function renderMultiLineChart(
+  lines: LineData[],
+  xLabels: string[],
+  opts: { width?: number; height?: number; yLabel?: string } = {}
+): string {
+  const w = opts.width || 600;
+  const h = opts.height || 220;
+  const pad = { l: 50, r: 20, t: 15, b: 30 };
+  const chartW = w - pad.l - pad.r;
+  const chartH = h - pad.t - pad.b;
+
+  // Math.max(..., 1) keeps a single-label axis from dividing by zero.
+  const xScale = (i: number) => pad.l + (i / Math.max(xLabels.length - 1, 1)) * chartW;
+  const yScale = (v: number) => pad.t + chartH - (v / 100) * chartH;
+
+  // Grid + labels
+  let svg = '';
+  for (let v = 0; v <= 100; v += 25) {
+    svg += `<text x="${pad.l - 8}" y="${yScale(v) + 4}" text-anchor="end">${v}%</text>`;
+    svg += `<line class="grid-line" x1="${pad.l}" x2="${w - pad.r}" y1="${yScale(v)}" y2="${yScale(v)}" stroke-dasharray="3"/>`;
+  }
+  for (let i = 0; i < xLabels.length; i++) {
+    svg += `<text x="${xScale(i)}" y="${h - 5}" text-anchor="middle">${escapeMarkup(xLabels[i])}</text>`;
+  }
+
+  // Lines
+  for (const line of lines) {
+    const color = escapeMarkup(line.color);
+    // A polyline needs two points, but a lone point must still get its marker;
+    // otherwise the series shows in the legend with nothing on the chart.
+    if (line.points.length >= 2) {
+      const pts = line.points.map(p => `${xScale(p.x)},${yScale(p.y)}`).join(' ');
+      svg += `<polyline points="${pts}" fill="none" stroke="${color}" stroke-width="2"/>`;
+    }
+    for (const p of line.points) {
+      svg += `<circle cx="${xScale(p.x)}" cy="${yScale(p.y)}" r="3" fill="${color}"/>`;
+    }
+  }
+
+  // Legend
+  const legend = lines.map(l =>
+    `<span class="chart-legend-item"><span class="chart-legend-swatch" style="background:${escapeMarkup(l.color)}"></span>${escapeMarkup(l.label)}</span>`
+  ).join('');
+
+  return `<svg viewBox="0 0 ${w} ${h}" style="width:100%;max-width:${w}px">${svg}</svg>
+    <div class="chart-legend">${legend}</div>`;
+}
diff --git a/explorer/src/data.ts b/explorer/src/data.ts
@@ -16,7 +16,6 @@ export interface RedFlag {
detail: string;
}
-// Slim entry for paper table (papers-index.json)
export interface PaperIndex {
id: string;
title: string;
@@ -28,18 +27,9 @@ export interface PaperIndex {
games: string[];
arxiv_id: string;
doi: string;
+ code_url: string | null;
}
-export interface Pipeline {
- registry_total: number;
- v2_scanned: number;
- v1_needs_rescan: number;
- has_text_no_scan: number;
- no_text: number;
- excluded: number;
-}
-
-// Full detail for per-paper pages (papers/{slug}.json)
export interface PaperDetail extends PaperIndex {
category_scores: Record<string, number>;
claims: Claim[];
@@ -56,6 +46,15 @@ export interface HistBin {
count: number;
}
+/** Pipeline counters carried on `Dashboard.pipeline`. */
+export interface Pipeline {
+ registry_total: number;
+ v2_scanned: number;
+ v1_needs_rescan: number;
+ has_text_no_scan: number;
+ no_text: number;
+ excluded: number;
+}
+}
+
export interface Dashboard {
n: number;
median: number;
@@ -70,6 +69,31 @@ export interface Dashboard {
pipeline: Pipeline;
}
+/** Pass rate of one checklist question, as emitted into findings.json. */
+export interface QuestionRate {
+ /** Percentage of applicable cases that passed, rounded to one decimal. */
+ rate: number;
+ /** Number of cases in which the question applied (the denominator of `rate`). */
+ n: number;
+}
+
+/** Summary of the score percentages of one group of papers (a venue, a citation band, a funding group). */
+export interface GroupStat {
+ /** Number of scores in the group. */
+ n: number;
+ /** Mean score, rounded to one decimal. */
+ mean: number;
+ /** Median score as computed by the data build script. */
+ median: number;
+}
+
+/** Shape of `/data/findings.json`, the payload behind the `#/findings` view. */
+export interface Findings {
+ /** Pass rate per question, keyed as `category.question`. */
+ question_rates: Record<string, QuestionRate>;
+ /** Year -> category -> pass-rate percentage; a category is absent for years where nothing applied. */
+ year_category_trends: Record<string, Record<string, number>>;
+ /** Score stats per venue name (the build script only emits venues with 3+ scores). */
+ venue_stats: Record<string, GroupStat>;
+ /** Score stats per citation band; the bands are '0', '1-50', '51-500' and '500+'. */
+ citation_band_stats: Record<string, GroupStat>;
+ /** Per claim tension: sizes and mean scores of the positive and nuanced sides; `gap` is nuanced mean minus positive mean. */
+ optimism_rigor: Record<string, { positive_n: number; positive_mean: number; nuanced_n: number; nuanced_mean: number; gap: number }>;
+ /** Citation homophily: score threshold, share of papers at/above it, and share of citations from such papers that land on such papers. `high_cite_total` is displayed as the citation count (n). */
+ homophily: { threshold: number; baseline_pct: number; high_cite_high_pct: number; high_cite_total: number };
+ /** Median score at successive sample sizes `n`, in display order. */
+ sampling_effect: { checkpoints: { n: number; median: number }[] };
+ /** Per year: count of benchmark-only papers, total papers, and the percentage shown in the chart. */
+ benchmark_monoculture: Record<string, { benchmark_only: number; total: number; pct: number }>;
+ /** Score stats for the `disclosed` and `not_disclosed` funding groups. */
+ funding_gap: Record<string, GroupStat>;
+ /** Per-question rates for the reproducibility questions, plus the count/percentage of papers passing all of them. */
+ repro_detail: Record<string, QuestionRate | number> & { full_pass_count: number; full_pass_pct: number };
+ /** Named game -> percentage shown in the Named Games list. */
+ game_pcts: Record<string, number>;
+}
+
export interface TensionClaim {
paper_id: string;
claim: string;
@@ -97,7 +121,6 @@ export interface CitationNetwork {
edges: [string, string][];
}
-// Lazy-loaded per-view data
const cache: Record<string, unknown> = {};
async function fetchJson<T>(path: string): Promise<T> {
@@ -109,6 +132,7 @@ async function fetchJson<T>(path: string): Promise<T> {
}
export const loadDashboard = () => fetchJson<Dashboard>('/data/dashboard.json');
+/** Loads the findings payload from `/data/findings.json` through the shared `fetchJson` helper. */
+export const loadFindings = () => fetchJson<Findings>('/data/findings.json');
export const loadPapersIndex = () => fetchJson<PaperIndex[]>('/data/papers-index.json');
export const loadPaperDetail = (slug: string) => fetchJson<PaperDetail>(`/data/papers/${slug}.json`);
export const loadNetwork = () => fetchJson<CitationNetwork>('/data/network.json');
diff --git a/explorer/src/main.ts b/explorer/src/main.ts
@@ -4,6 +4,7 @@ import { renderPapers } from './views/papers';
import { renderPaperDetail } from './views/paper-detail';
import { renderNetwork } from './views/network';
import { renderTensions } from './views/tensions';
+import { renderFindings } from './views/findings';
import { initTheme } from './theme';
function init() {
@@ -15,6 +16,7 @@ function init() {
route('/paper/:slug', ({ slug }) => renderPaperDetail(app, slug));
route('/network', () => renderNetwork(app));
route('/tensions', () => renderTensions(app));
+ route('/findings', () => renderFindings(app));
startRouter();
}
diff --git a/explorer/src/style.css b/explorer/src/style.css
@@ -488,3 +488,23 @@ td.score {
/* Year trend chart */
.trend-chart { margin-top: 0.5rem; }
+
+/* Multi-line chart legend */
+.chart-legend { display: flex; gap: 1rem; flex-wrap: wrap; margin-top: 0.5rem; }
+.chart-legend-item { font-size: 0.75rem; color: var(--text-dim); display: flex; align-items: center; gap: 4px; }
+.chart-legend-swatch { width: 14px; height: 3px; border-radius: 1px; display: inline-block; }
+
+/* Toggle buttons for category lines */
+.toggle-group { display: flex; gap: 0.5rem; flex-wrap: wrap; margin-bottom: 0.75rem; }
+.toggle-btn {
+ font-size: 0.72rem;
+ padding: 0.2rem 0.55rem;
+ border: 1px solid var(--border);
+ border-radius: 3px;
+ background: none;
+ color: var(--text-dim);
+ cursor: pointer;
+ transition: all 0.15s;
+}
+.toggle-btn:hover { border-color: var(--text-dim); }
+.toggle-btn.active { background: rgba(108, 140, 255, 0.08); }
diff --git a/explorer/src/views/findings.ts b/explorer/src/views/findings.ts
@@ -0,0 +1,304 @@
+import { loadFindings, type Findings, type QuestionRate } from '../data';
+import { renderBarChart } from '../components/bar-chart';
+import { renderMultiLineChart } from '../components/multi-line-chart';
+
+/**
+ * Turns a snake_case identifier into a display label: underscores become
+ * spaces, then the word character after every word boundary is upper-cased.
+ */
+function formatName(name: string): string {
+  const spaced = name.split('_').join(' ');
+  return spaced.replace(/\b\w/g, (ch) => ch.toUpperCase());
+}
+
+/**
+ * Line/button colour per checklist category. The key order is also the order
+ * of the toggle buttons in the year-trend section, which are built from
+ * `Object.keys(CAT_COLORS)`.
+ */
+const CAT_COLORS: Record<string, string> = {
+ contamination: '#f06565',
+ data_leakage: '#e08050',
+ statistical_methodology: '#6c8cff',
+ experimental_rigor: '#f0c050',
+ artifacts: '#3dd68c',
+ evaluation_design: '#a080f0',
+ claims_and_evidence: '#50c0c0',
+ survey_methodology: '#ff80b0',
+ setup_transparency: '#90b060',
+ limitations_and_scope: '#c0a060',
+ cost_and_practicality: '#8090a0',
+ human_studies: '#b070b0',
+ data_integrity: '#70a0d0',
+ conflicts_of_interest: '#d07070',
+};
+
+/** Display names for the claim-tension keys; callers fall back to the raw key when one is missing. */
+const TENSION_NAMES: Record<string, string> = {
+ productivity: 'Productivity',
+ benchmarks: 'Benchmarks',
+ agents: 'Agents',
+};
+
+/**
+ * Renders the `#/findings` view into `app`.
+ *
+ * Shows a spinner while the findings payload loads, then replaces it with the
+ * ten analysis sections and wires up the category toggles. If loading fails,
+ * the spinner is replaced by an error message rather than spinning forever.
+ *
+ * @param app Container element whose content is replaced.
+ */
+export async function renderFindings(app: HTMLElement) {
+  app.innerHTML = '<div class="spinner"></div>';
+
+  let f: Findings;
+  try {
+    f = await loadFindings();
+  } catch (err: unknown) {
+    // Without this the rejection is unhandled and the spinner never goes away.
+    console.error('Failed to load findings', err);
+    app.innerHTML = '<div class="section"><p>Could not load findings data.</p></div>';
+    return;
+  }
+
+  app.innerHTML = `
+    ${renderQuestionRates(f)}
+    ${renderYearCategoryTrends(f)}
+    ${renderVenueCitation(f)}
+    ${renderOptimismRigor(f)}
+    ${renderHomophily(f)}
+    ${renderSamplingEffect(f)}
+    ${renderBenchmarkMonoculture(f)}
+    ${renderFundingGap(f)}
+    ${renderReproDetail(f)}
+    ${renderGames(f)}
+  `;
+
+  // The toggle buttons only exist once the markup above is in the DOM.
+  attachCategoryToggles(f);
+}
+
+/**
+ * Builds the "Per-Question Pass Rates" section: the 20 lowest-rated questions
+ * followed by the 10 highest-rated ones, each as a horizontal bar.
+ *
+ * @param f Findings payload; `question_rates` is keyed as `category.question`.
+ * @returns Section markup.
+ */
+function renderQuestionRates(f: Findings): string {
+  const sorted = Object.entries(f.question_rates).sort((a, b) => a[1].rate - b[1].rate);
+  const worst20 = sorted.slice(0, 20);
+  const best10 = sorted.slice(-10).reverse();
+
+  // Split on the first dot only, and tolerate a key without one, so an odd
+  // key degrades to a missing category label instead of throwing.
+  const splitKey = (key: string): [string, string] => {
+    const dot = key.indexOf('.');
+    return dot === -1 ? ['', key] : [key.slice(0, dot), key.slice(dot + 1)];
+  };
+
+  // Count categories from the data so the intro text cannot drift from it.
+  const categoryCount = new Set(sorted.map(([key]) => splitKey(key)[0])).size;
+
+  const worstColor = (rate: number): string =>
+    rate < 10 ? 'var(--red)' : rate < 30 ? 'var(--yellow)' : 'var(--accent)';
+
+  // One bar row; shared by the worst and best lists.
+  const row = (key: string, d: QuestionRate, color: string): string => {
+    const [cat, q] = splitKey(key);
+    return `<div class="hbar">
+      <div class="hbar-label"><span>${formatName(q)} <span style="color:var(--text-dim);font-size:0.7rem">${formatName(cat)}</span></span><span>${d.rate}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div>
+      <div class="hbar-track"><div class="hbar-fill" style="width:${d.rate}%;background:${color}"></div></div>
+    </div>`;
+  };
+
+  return `<div class="section">
+    <h2>Per-Question Pass Rates</h2>
+    <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:1rem">${sorted.length} questions across ${categoryCount} categories. Sorted by pass rate, worst first.</p>
+    <h3 style="font-size:0.85rem;color:var(--red);margin-bottom:0.5rem">Worst 20</h3>
+    ${worst20.map(([key, d]) => row(key, d, worstColor(d.rate))).join('')}
+    <h3 style="font-size:0.85rem;color:var(--green);margin:1rem 0 0.5rem">Best 10</h3>
+    ${best10.map(([key, d]) => row(key, d, 'var(--green)')).join('')}
+  </div>`;
+}
+
+/**
+ * Builds the "Year Trends by Category" section: one toggle button per
+ * category in `CAT_COLORS` and a line chart pre-populated with four default
+ * categories. Click handling is attached separately once the markup is in
+ * the DOM.
+ *
+ * @param f Findings payload; `year_category_trends` maps year -> category -> pass rate.
+ * @returns Section markup containing `#cat-toggles` and `#cat-chart`.
+ */
+function renderYearCategoryTrends(f: Findings): string {
+  const years = Object.keys(f.year_category_trends).sort();
+  const defaultCats = ['contamination', 'data_leakage', 'statistical_methodology', 'experimental_rigor'];
+  const allCats = Object.keys(CAT_COLORS);
+
+  const toggles = allCats.map(cat => {
+    const active = defaultCats.includes(cat) ? ' active' : '';
+    return `<button class="toggle-btn${active}" data-cat="${cat}" style="border-color:${active ? CAT_COLORS[cat] : ''};color:${active ? CAT_COLORS[cat] : ''}">${formatName(cat)}</button>`;
+  }).join('');
+
+  const lines = defaultCats.map(cat => ({
+    label: formatName(cat),
+    color: CAT_COLORS[cat],
+    // Drop only years with no value for the category. A genuine 0% pass rate
+    // is real data and must stay on the line.
+    points: years.flatMap((y, i) => {
+      const rate = f.year_category_trends[y]?.[cat];
+      return rate == null ? [] : [{ x: i, y: rate }];
+    }),
+  }));
+
+  return `<div class="section">
+    <h2>Year Trends by Category</h2>
+    <div class="toggle-group" id="cat-toggles">${toggles}</div>
+    <div id="cat-chart">${renderMultiLineChart(lines, years, { width: 700 })}</div>
+  </div>`;
+}
+
+/**
+ * Wires up the category toggle buttons: clicking a button flips its active
+ * state and colour, then redraws the chart with every active category.
+ * Does nothing when the toggle container or chart element is not in the DOM.
+ *
+ * @param f Findings payload used to rebuild the chart lines.
+ */
+function attachCategoryToggles(f: Findings) {
+  const container = document.getElementById('cat-toggles');
+  const chartEl = document.getElementById('cat-chart');
+  if (!container || !chartEl) return;
+
+  // The year axis never changes after load, so compute it once, not per click.
+  const years = Object.keys(f.year_category_trends).sort();
+
+  container.addEventListener('click', e => {
+    if (!(e.target instanceof Element)) return;
+    const btn = e.target.closest<HTMLElement>('.toggle-btn');
+    const cat = btn?.dataset.cat;
+    // Clicks on the gaps between buttons land on the container itself.
+    if (!btn || !cat) return;
+
+    const isActive = btn.classList.toggle('active');
+    const color = isActive ? (CAT_COLORS[cat] ?? '') : '';
+    btn.style.borderColor = color;
+    btn.style.color = color;
+
+    // Re-render chart with active categories
+    const activeCats = Array.from(container.querySelectorAll<HTMLElement>('.toggle-btn.active'))
+      .map(b => b.dataset.cat)
+      .filter((c): c is string => Boolean(c));
+    const lines = activeCats.map(c => ({
+      label: formatName(c),
+      color: CAT_COLORS[c],
+      // Drop only years with no value. A genuine 0% pass rate stays on the line.
+      points: years.flatMap((y, i) => {
+        const rate = f.year_category_trends[y]?.[c];
+        return rate == null ? [] : [{ x: i, y: rate }];
+      }),
+    }));
+    chartEl.innerHTML = renderMultiLineChart(lines, years, { width: 700 });
+  });
+}
+
+/**
+ * Builds the "Venue & Citation Scoring" section: mean score per venue
+ * (highest first) next to mean score per citation band.
+ *
+ * NOTE(review): venue names are interpolated into the markup unescaped —
+ * confirm the build step cannot emit `<` or `&` in them.
+ * NOTE(review): the closing sentence about 500+ papers scoring below average
+ * is static text and is shown whatever the band data says.
+ */
+function renderVenueCitation(f: Findings): string {
+ // Highest mean first.
+ const venues = Object.entries(f.venue_stats).sort((a, b) => b[1].mean - a[1].mean);
+ // Fixed display order; bands missing from the data are skipped below.
+ const bands = ['0', '1-50', '51-500', '500+'];
+
+ return `<div class="section">
+ <h2>Venue & Citation Scoring</h2>
+ <div class="detail-grid">
+ <div>
+ <h3 style="font-size:0.85rem;color:var(--text-dim);margin-bottom:0.5rem">Score by Venue (3+ papers)</h3>
+ ${venues.map(([v, d]) => {
+ const color = d.mean < 40 ? 'var(--red)' : d.mean < 55 ? 'var(--yellow)' : 'var(--green)';
+ return `<div class="hbar">
+ <div class="hbar-label"><span>${v}</span><span>${d.mean}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div>
+ <div class="hbar-track"><div class="hbar-fill" style="width:${d.mean}%;background:${color}"></div></div>
+ </div>`;
+ }).join('')}
+ </div>
+ <div>
+ <h3 style="font-size:0.85rem;color:var(--text-dim);margin-bottom:0.5rem">Score by Citation Count</h3>
+ ${bands.map(band => {
+ const d = f.citation_band_stats[band];
+ if (!d) return '';
+ const color = d.mean < 40 ? 'var(--red)' : d.mean < 55 ? 'var(--yellow)' : 'var(--green)';
+ return `<div class="hbar">
+ <div class="hbar-label"><span>${band} citations</span><span>${d.mean}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div>
+ <div class="hbar-track"><div class="hbar-fill" style="width:${d.mean}%;background:${color}"></div></div>
+ </div>`;
+ }).join('')}
+ <p style="font-size:0.8rem;color:var(--text-dim);margin-top:0.75rem">Most-cited papers (500+) score <strong style="color:var(--red)">below average</strong>. Citations measure influence, not rigor.</p>
+ </div>
+ </div>
+ </div>`;
+}
+
+/**
+ * Builds the "Optimism-Rigor Inversion" section: for every claim tension,
+ * the mean score of papers with positive claims against papers with nuanced
+ * findings, plus the signed gap between them.
+ *
+ * @param f Findings payload; `optimism_rigor[key].gap` is nuanced mean minus positive mean.
+ * @returns Section markup.
+ */
+function renderOptimismRigor(f: Findings): string {
+  // Add "+" only for non-negative gaps; a negative number already carries its
+  // sign, and a hard-coded "+" would print "+-2.1pp".
+  const signed = (gap: number): string => (gap >= 0 ? `+${gap}` : `${gap}`);
+
+  return `<div class="section">
+    <h2>Optimism-Rigor Inversion</h2>
+    <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:1rem">Across all three claim tensions, papers making positive/optimistic claims have <strong>lower</strong> methodology scores than papers with nuanced findings.</p>
+    ${Object.entries(f.optimism_rigor).map(([key, d]) => `
+      <div class="game-row">
+        <span class="game-name">${TENSION_NAMES[key] || key}</span>
+        <span style="font-family:var(--font);font-size:0.85rem">
+          <span style="color:var(--yellow)">Positive ${d.positive_mean}%</span>
+          <span style="color:var(--text-dim)"> vs </span>
+          <span style="color:var(--green)">Nuanced ${d.nuanced_mean}%</span>
+          <span style="color:var(--accent)"> (${signed(d.gap)}pp)</span>
+        </span>
+      </div>
+    `).join('')}
+  </div>`;
+}
+
+/**
+ * Builds the "Quality Homophily" section: expected share of high-quality
+ * papers (baseline) against the observed share of citations from
+ * high-quality papers that point at high-quality papers, plus their ratio.
+ *
+ * @param f Findings payload; reads `homophily`.
+ * @returns Section markup. The ratio is shown as "?" when it cannot be
+ *          computed (no citations counted, or a zero baseline).
+ */
+function renderHomophily(f: Findings): string {
+  const h = f.homophily;
+  // Guard the baseline as well as the citation count: a 0% baseline would
+  // otherwise render "Infinityx" or "NaNx".
+  const canCompare = h.high_cite_total > 0 && h.baseline_pct > 0;
+  const ratio = canCompare ? (h.high_cite_high_pct / h.baseline_pct).toFixed(1) : '?';
+  return `<div class="section">
+    <h2>Quality Homophily</h2>
+    <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:1rem">Do high-quality papers cite other high-quality papers more than expected? (threshold: ${h.threshold}%+ score)</p>
+    <div class="hbar">
+      <div class="hbar-label"><span>Expected (baseline)</span><span>${h.baseline_pct}%</span></div>
+      <div class="hbar-track"><div class="hbar-fill" style="width:${Math.min(h.baseline_pct, 100)}%;background:var(--text-dim)"></div></div>
+    </div>
+    <div class="hbar">
+      <div class="hbar-label"><span>Observed (high cites high)</span><span>${h.high_cite_high_pct}%</span></div>
+      <div class="hbar-track"><div class="hbar-fill" style="width:${Math.min(h.high_cite_high_pct, 100)}%;background:var(--green)"></div></div>
+    </div>
+    <p style="font-size:0.85rem;margin-top:0.5rem"><strong>${ratio}x</strong> more likely to cite high-quality work <span style="color:var(--text-dim)">(n=${h.high_cite_total} citations)</span></p>
+  </div>`;
+}
+
+/**
+ * Builds the "Sampling Effect" section: a small SVG line chart of the median
+ * score at each sampling checkpoint, labelled with the median and `n=`.
+ *
+ * The y-axis covers 40–60% by default and widens in steps of 5 when a median
+ * falls outside that range, so every point stays inside the plot area.
+ *
+ * @param f Findings payload; reads `sampling_effect.checkpoints` in order.
+ * @returns Section markup.
+ */
+function renderSamplingEffect(f: Findings): string {
+  const pts = f.sampling_effect.checkpoints;
+  const w = 400, h = 150;
+  const pad = { l: 50, r: 20, t: 15, b: 30 };
+  const chartW = w - pad.l - pad.r;
+  const chartH = h - pad.t - pad.b;
+
+  // Math.max(..., 1): with a single checkpoint the divisor would be 0 and
+  // every x coordinate would be NaN.
+  const xScale = (i: number) => pad.l + (i / Math.max(pts.length - 1, 1)) * chartW;
+
+  const step = 5;
+  const medians = pts.map(p => p.median);
+  const yMin = Math.min(40, ...medians.map(m => Math.floor(m / step) * step));
+  const yMax = Math.max(60, ...medians.map(m => Math.ceil(m / step) * step));
+  // Keep roughly five grid lines however wide the range gets (5 for 40–60).
+  const tick = Math.max(step, Math.ceil((yMax - yMin) / 4 / step) * step);
+  const yScale = (v: number) => pad.t + chartH - ((v - yMin) / (yMax - yMin)) * chartH;
+
+  let svg = '';
+  for (let v = yMin; v <= yMax; v += tick) {
+    svg += `<text x="${pad.l - 8}" y="${yScale(v) + 4}" text-anchor="end">${v}%</text>`;
+    svg += `<line class="grid-line" x1="${pad.l}" x2="${w - pad.r}" y1="${yScale(v)}" y2="${yScale(v)}" stroke-dasharray="3"/>`;
+  }
+
+  const polyline = pts.map((p, i) => `${xScale(i)},${yScale(p.median)}`).join(' ');
+  svg += `<polyline points="${polyline}" fill="none" stroke="var(--yellow)" stroke-width="2"/>`;
+  pts.forEach((p, i) => {
+    svg += `<circle cx="${xScale(i)}" cy="${yScale(p.median)}" r="4" fill="var(--yellow)"/>`;
+    svg += `<text x="${xScale(i)}" y="${yScale(p.median) - 10}" text-anchor="middle" fill="var(--yellow)" font-size="11">${p.median}%</text>`;
+    svg += `<text x="${xScale(i)}" y="${h - 5}" text-anchor="middle">n=${p.n}</text>`;
+  });
+
+  return `<div class="section">
+    <h2>Sampling Effect</h2>
+    <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:0.5rem">Median score drops as the long tail is scanned. Visibility correlates with quality.</p>
+    <svg viewBox="0 0 ${w} ${h}" style="width:100%;max-width:${w}px">${svg}</svg>
+  </div>`;
+}
+
+/**
+ * Builds the "Benchmark Monoculture" section: a one-series chart of the
+ * benchmark-only percentage per year, with a sentence quoting the figure for
+ * the last year in sorted order.
+ */
+function renderBenchmarkMonoculture(f: Findings): string {
+  const byYear = f.benchmark_monoculture;
+  const years = Object.keys(byYear).sort();
+
+  const series = [
+    {
+      label: 'Benchmark-only papers',
+      color: '#f0c050',
+      points: years.map((year, idx) => ({ x: idx, y: byYear[year].pct })),
+    },
+  ];
+
+  // Summary sentence for the most recent year; omitted when there is no data.
+  let summary = '';
+  if (years.length > 0) {
+    const latest = years[years.length - 1];
+    summary = `<p style="font-size:0.85rem;margin-top:0.5rem">${byYear[latest]?.pct ?? 0}% of ${latest} papers are pure benchmark-eval.</p>`;
+  }
+
+  return `<div class="section">
+ <h2>Benchmark Monoculture</h2>
+ <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:0.5rem">Share of papers using only benchmark evaluation, no other methodology.</p>
+ ${renderMultiLineChart(series, years)}
+ ${summary}
+ </div>`;
+}
+
+/**
+ * Builds the "Funding Disclosure Gap" section: mean score of papers that
+ * disclose funding against those that do not, and the difference in
+ * percentage points. Returns an empty string when either group is missing.
+ */
+function renderFundingGap(f: Findings): string {
+  const { disclosed, not_disclosed: undisclosed } = f.funding_gap;
+  if (!disclosed || !undisclosed) {
+    return '';
+  }
+
+  const delta = disclosed.mean - undisclosed.mean;
+  const gapText = delta.toFixed(1);
+
+  return `<div class="section">
+ <h2>Funding Disclosure Gap</h2>
+ <div class="hbar">
+ <div class="hbar-label"><span>Funding disclosed</span><span>${disclosed.mean}% <span style="color:var(--text-dim)">(n=${disclosed.n})</span></span></div>
+ <div class="hbar-track"><div class="hbar-fill" style="width:${disclosed.mean}%;background:var(--green)"></div></div>
+ </div>
+ <div class="hbar">
+ <div class="hbar-label"><span>Not disclosed</span><span>${undisclosed.mean}% <span style="color:var(--text-dim)">(n=${undisclosed.n})</span></span></div>
+ <div class="hbar-track"><div class="hbar-fill" style="width:${undisclosed.mean}%;background:var(--red)"></div></div>
+ </div>
+ <p style="font-size:0.85rem;margin-top:0.5rem"><strong>${gapText}pp gap</strong> — papers that disclose funding score substantially higher.</p>
+ </div>`;
+}
+
+/**
+ * Builds the "Reproducibility Drill-Down" section: one bar per
+ * reproducibility question, followed by the share and count of papers that
+ * pass all four.
+ *
+ * @param f Findings payload; `repro_detail` mixes per-question rates with the
+ *          numeric `full_pass_*` summary fields.
+ * @returns Section markup. Questions that are missing are skipped.
+ */
+function renderReproDetail(f: Findings): string {
+  const qs = ['code_released', 'data_released', 'environment_specified', 'reproduction_instructions'];
+  return `<div class="section">
+    <h2>Reproducibility Drill-Down</h2>
+    ${qs.map(q => {
+      // Declared with the full union (not asserted) so the number check below
+      // is a real narrowing rather than a contradiction of a cast.
+      const d: QuestionRate | number | undefined = f.repro_detail[q];
+      if (!d || typeof d === 'number') return '';
+      const color = d.rate < 15 ? 'var(--red)' : d.rate < 50 ? 'var(--yellow)' : 'var(--green)';
+      return `<div class="hbar">
+        <div class="hbar-label"><span>${formatName(q)}</span><span>${d.rate}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div>
+        <div class="hbar-track"><div class="hbar-fill" style="width:${d.rate}%;background:${color}"></div></div>
+      </div>`;
+    }).join('')}
+    <p style="font-size:1rem;margin-top:1rem;font-weight:600"><span style="color:var(--red);font-family:var(--font);font-size:1.3rem">${f.repro_detail.full_pass_pct}%</span> of papers are fully reproducible <span style="color:var(--text-dim)">(${f.repro_detail.full_pass_count} papers pass all 4 criteria)</span></p>
+  </div>`;
+}
+
+/**
+ * Builds the "Named Games" section: one row per game, highest percentage
+ * first.
+ */
+function renderGames(f: Findings): string {
+  const ranked = Object.entries(f.game_pcts);
+  ranked.sort(([, pctA], [, pctB]) => pctB - pctA);
+
+  const rows: string[] = [];
+  for (const [name, pct] of ranked) {
+    rows.push(`<div class="game-row"><span class="game-name">${name}</span><span class="game-pct">${pct}%</span></div>`);
+  }
+
+  return `<div class="section">
+ <h2>Named Games</h2>
+ <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:0.75rem">Recurring methodological patterns detected across the corpus.</p>
+ ${rows.join('')}
+ </div>`;
+}
diff --git a/explorer/src/views/paper-detail.ts b/explorer/src/views/paper-detail.ts
@@ -50,6 +50,9 @@ export async function renderPaperDetail(app: HTMLElement, slug: string) {
if (paper.source_url && !paper.source_url.includes('arxiv.org')) {
links.push(`<a href="${paper.source_url}" target="_blank" rel="noopener">Source</a>`);
}
+ if (paper.code_url) {
+ links.push(`<a href="${paper.code_url}" target="_blank" rel="noopener">Code</a>`);
+ }
// Load network for citations (lazy, non-blocking)
let incomingHtml = '';
diff --git a/explorer/tests/explorer.spec.ts b/explorer/tests/explorer.spec.ts
@@ -202,6 +202,52 @@ test.describe('Navigation', () => {
});
});
+// End-to-end checks for the #/findings view. Counts use Playwright's
+// auto-retrying `toHaveCount` instead of a one-shot `locator.count()`, so the
+// assertions wait for rendering rather than racing it.
+test.describe('Findings', () => {
+  test('loads and shows all sections', async ({ page }) => {
+    await page.goto('/#/findings');
+    await expect(page.locator('.section').first()).toBeVisible({ timeout: 10000 });
+    // Should have 10 sections
+    await expect(page.locator('.section')).toHaveCount(10);
+  });
+
+  test('shows per-question pass rates', async ({ page }) => {
+    await page.goto('/#/findings');
+    await expect(page.locator('.section').first()).toBeVisible({ timeout: 10000 });
+    await expect(page.locator('h2', { hasText: 'Per-Question Pass Rates' })).toBeVisible();
+    // Should have horizontal bars; wait for one before taking the one-shot count.
+    const bars = page.locator('.section').first().locator('.hbar');
+    await expect(bars.first()).toBeVisible();
+    expect(await bars.count()).toBeGreaterThan(10);
+  });
+
+  test('shows year category trends with toggles', async ({ page }) => {
+    await page.goto('/#/findings');
+    await expect(page.locator('#cat-toggles')).toBeVisible({ timeout: 10000 });
+    const activeToggles = page.locator('.toggle-btn.active');
+    await expect(activeToggles).toHaveCount(4);
+    // Click a toggle to deactivate
+    await activeToggles.first().click();
+    await expect(page.locator('.toggle-btn.active')).toHaveCount(3);
+  });
+
+  test('shows funding gap', async ({ page }) => {
+    await page.goto('/#/findings');
+    await expect(page.locator('h2', { hasText: 'Funding Disclosure Gap' })).toBeVisible({ timeout: 10000 });
+    await expect(page.locator('text=pp gap')).toBeVisible();
+  });
+
+  test('shows reproducibility drill-down', async ({ page }) => {
+    await page.goto('/#/findings');
+    await expect(page.locator('h2', { hasText: 'Reproducibility Drill-Down' })).toBeVisible({ timeout: 10000 });
+    await expect(page.locator('text=fully reproducible')).toBeVisible();
+  });
+
+  test('shows 6 named games', async ({ page }) => {
+    await page.goto('/#/findings');
+    const gamesSection = page.locator('.section', { has: page.locator('h2', { hasText: 'Named Games' }) });
+    await expect(gamesSection).toBeVisible({ timeout: 10000 });
+    await expect(gamesSection.locator('.game-row')).toHaveCount(6);
+  });
+});
+
test.describe('Theme', () => {
test('toggle switches theme', async ({ page }) => {
await page.goto('/');
diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py
@@ -6,8 +6,9 @@ Reads v2 scan.json files, metadata.json, citation-graph.json, and registry.jsonl
Outputs view-specific JSON files for fast loading, plus a full explorer.json for power users.
Output files:
- explorer/public/data/dashboard.json — aggregation stats only (~0.5KB)
- explorer/public/data/papers-index.json — table data without checklists (~200KB)
+ explorer/public/data/dashboard.json — aggregation stats only
+ explorer/public/data/findings.json — deep analysis findings
+ explorer/public/data/papers-index.json — table data without checklists
explorer/public/data/papers/{slug}.json — full detail per paper
explorer/public/data/network.json — citation network
explorer/public/data/tensions.json — claim tensions
@@ -18,6 +19,7 @@ Usage:
"""
import json
+import re
from collections import Counter, defaultdict
from pathlib import Path
@@ -38,6 +40,10 @@ CONDITIONAL_CATEGORIES = [
]
ALL_CATEGORIES = BASE_CATEGORIES + CONDITIONAL_CATEGORIES
+# Matches an http(s) URL on one of the listed hosting sites (GitHub, GitLab,
+# Zenodo, Bitbucket, Hugging Face), running up to the first whitespace, comma,
+# closing parenthesis, quote or angle bracket.
+CODE_URL_RE = re.compile(
+ r'https?://(?:github\.com|gitlab\.com|zenodo\.org|bitbucket\.org|huggingface\.co)[^\s,)\"\'<>]+'
+)
+
def classify_archetype(cat_scores):
ed = cat_scores.get("evaluation_design", 0)
@@ -125,9 +131,28 @@ def detect_games(checklist, score, cat_scores):
bc = checklist.get("contamination", {}).get("benchmark_contamination_addressed", {})
if bc.get("applies") and not bc.get("answer"):
games.append("Contamination Dodge")
+ # Cherry-picked Comparisons
+ bc2 = checklist.get("evaluation_design", {}).get("baselines_contemporary", {})
+ if bc2.get("applies") and not bc2.get("answer"):
+ games.append("Cherry-picked Comparisons")
+ # All Show No Substance
+ ed = cat_scores.get("evaluation_design", 0)
+ sm = cat_scores.get("statistical_methodology", 0)
+ ar = cat_scores.get("artifacts", 0)
+ if ed >= 0.8 and sm < 0.2 and ar < 0.2:
+ games.append("All Show No Substance")
return games
+def extract_code_url(checklist):
+    """Return the first code-hosting URL quoted in the code_released justification.
+
+    Only looks at ``checklist["artifacts"]["code_released"]``, and only when that
+    question both applies and was answered positively. Trailing sentence
+    punctuation is stripped from the match.
+
+    Returns None when the question is absent, not applicable, answered
+    negatively, has no textual justification, or the text contains no
+    matching URL. Malformed (non-dict / non-string) entries are treated as
+    absent, matching the isinstance guard build() applies to category data.
+    """
+    artifacts = checklist.get("artifacts", {})
+    if not isinstance(artifacts, dict):
+        return None
+    cr = artifacts.get("code_released", {})
+    if not isinstance(cr, dict):
+        return None
+    if not (cr.get("applies") and cr.get("answer")):
+        return None
+    justification = cr.get("justification")
+    if not isinstance(justification, str):
+        # Covers a missing key as well as an explicit null in the scan JSON.
+        return None
+    match = CODE_URL_RE.search(justification)
+    if match is None:
+        return None
+    return match.group(0).rstrip(".,;:")
+
+
def load_registry():
entries = {}
with open(REGISTRY_PATH) as f:
@@ -161,14 +186,25 @@ def write_json(path, data):
json.dump(data, f, ensure_ascii=False, separators=(",", ":"))
+def safe_mean(scores):
+    """Arithmetic mean of ``scores`` rounded to one decimal; 0 for an empty input."""
+    if not scores:
+        return 0
+    total = sum(scores)
+    return round(total / len(scores), 1)
+
+
+def safe_median(scores):
+    """Median of ``scores`` rounded to one decimal; 0 for an empty input.
+
+    For an even number of values the median is the mean of the two middle
+    values, not simply the upper one. The input is not modified.
+    """
+    if not scores:
+        return 0
+    s = sorted(scores)
+    mid = len(s) // 2
+    if len(s) % 2:
+        return round(s[mid], 1)
+    return round((s[mid - 1] + s[mid]) / 2, 1)
+
+
def build():
registry = load_registry()
citation_data = load_citation_graph()
# Accumulators
- papers_full = [] # full paper objects (for explorer.json)
- papers_index = [] # slim objects (for papers-index.json)
- paper_details = {} # slug -> detail object (for papers/{slug}.json)
+ papers_full = []
+ papers_index = []
+ paper_details = {}
all_scores = []
cat_pass_counts = defaultdict(lambda: {"passed": 0, "applicable": 0})
year_scores = defaultdict(list)
@@ -177,6 +213,15 @@ def build():
game_counts = Counter()
total_papers = 0
+ # Findings accumulators
+ question_pass_counts = defaultdict(lambda: {"passed": 0, "applicable": 0})
+ year_cat_scores = defaultdict(lambda: defaultdict(lambda: {"passed": 0, "applicable": 0}))
+ venue_scores = defaultdict(list)
+ citation_band_scores = defaultdict(list)
+ benchmark_only_by_year = defaultdict(lambda: {"benchmark_only": 0, "total": 0})
+ funding_groups = {"disclosed": [], "not_disclosed": []}
+ score_map = {} # paper_id -> score_pct (built incrementally for homophily)
+
tensions = {
"productivity": {"positive": [], "nuanced": []},
"benchmarks": {"positive": [], "nuanced": []},
@@ -211,6 +256,7 @@ def build():
total_papers += 1
score_pct = round(overall * 100, 1)
all_scores.append(score_pct)
+ score_map[paper_id] = score_pct
year = paper_meta.get("year") or reg_entry.get("year")
venue = metadata.get("venue") or paper_meta.get("venue") or reg_entry.get("venue", "")
@@ -223,6 +269,9 @@ def build():
doi = paper_meta.get("doi") or reg_entry.get("doi", "")
source_url = reg_entry.get("source_url", "")
+ # Code URL extraction
+ code_url = extract_code_url(checklist)
+
year_scores[year].append(score_pct)
for t in tags:
tag_counts[t] += 1
@@ -230,7 +279,7 @@ def build():
for g in games:
game_counts[g] += 1
- # Category aggregations
+ # Category + question aggregations
for cat in ALL_CATEGORIES:
cat_data = checklist.get(cat, {})
if not isinstance(cat_data, dict):
@@ -240,8 +289,47 @@ def build():
continue
if q_data["applies"]:
cat_pass_counts[cat]["applicable"] += 1
+ question_pass_counts[f"{cat}.{q_name}"]["applicable"] += 1
if q_data.get("answer", False):
cat_pass_counts[cat]["passed"] += 1
+ question_pass_counts[f"{cat}.{q_name}"]["passed"] += 1
+ # Year × category
+ if year:
+ year_cat_scores[year][cat]["applicable"] += 1
+ if q_data.get("answer", False):
+ year_cat_scores[year][cat]["passed"] += 1
+
+ # Venue scoring (skip arXiv — it's a preprint server, not a venue)
+ venue_clean = venue.strip()
+ if venue_clean and venue_clean.lower() not in ("arxiv", "arxiv.org", ""):
+ venue_scores[venue_clean].append(score_pct)
+
+ # Citation band scoring
+ cit = metadata.get("citation_count")
+ if cit is not None:
+ if cit == 0:
+ band = "0"
+ elif cit <= 50:
+ band = "1-50"
+ elif cit <= 500:
+ band = "51-500"
+ else:
+ band = "500+"
+ citation_band_scores[band].append(score_pct)
+
+ # Benchmark monoculture
+ if year:
+ benchmark_only_by_year[year]["total"] += 1
+ if tags == ["benchmark-eval"]:
+ benchmark_only_by_year[year]["benchmark_only"] += 1
+
+ # Funding gap
+ fd = checklist.get("conflicts_of_interest", {}).get("funding_disclosed", {})
+ if fd.get("applies"):
+ if fd.get("answer"):
+ funding_groups["disclosed"].append(score_pct)
+ else:
+ funding_groups["not_disclosed"].append(score_pct)
claims = scan.get("claims", [])
red_flags = scan.get("red_flags", [])
@@ -270,7 +358,7 @@ def build():
cat_scores_pct = {k: round(v * 100, 1) for k, v in cat_scores.items()}
- # Slim index entry (for table)
+ # Slim index entry
index_entry = {
"id": paper_id,
"title": paper_meta.get("title", reg_entry.get("title", paper_id)),
@@ -282,10 +370,11 @@ def build():
"games": games,
"arxiv_id": arxiv_id,
"doi": doi,
+ "code_url": code_url,
}
papers_index.append(index_entry)
- # Full detail (for per-paper file)
+ # Full detail
detail = {
**index_entry,
"category_scores": cat_scores_pct,
@@ -297,11 +386,9 @@ def build():
"source_url": source_url,
}
paper_details[paper_id] = detail
-
- # Full entry for explorer.json
papers_full.append(detail)
- # --- Aggregations ---
+ # --- Dashboard aggregations ---
all_scores.sort()
n = len(all_scores)
median = all_scores[n // 2] if n else 0
@@ -368,9 +455,121 @@ def build():
"pipeline": pipeline,
}
+ # --- Findings aggregations ---
+
+ # 1. Per-question pass rates
+ q_rates = {}
+ for key, d in question_pass_counts.items():
+ if d["applicable"] > 0:
+ q_rates[key] = {"rate": round(d["passed"] / d["applicable"] * 100, 1), "n": d["applicable"]}
+
+ # 2. Year trends by category
+ year_cat_trends = {}
+ for y in sorted(year_cat_scores.keys()):
+ year_cat_trends[str(y)] = {}
+ for cat in ALL_CATEGORIES:
+ d = year_cat_scores[y][cat]
+ if d["applicable"] > 0:
+ year_cat_trends[str(y)][cat] = round(d["passed"] / d["applicable"] * 100, 1)
+
+ # 3. Venue & citation scoring
+ venue_stats = {}
+ for v, scores in venue_scores.items():
+ if len(scores) >= 3:
+ venue_stats[v] = {"n": len(scores), "mean": safe_mean(scores), "median": safe_median(scores)}
+
+ cit_band_stats = {}
+ for band in ["0", "1-50", "51-500", "500+"]:
+ scores = citation_band_scores.get(band, [])
+ if scores:
+ cit_band_stats[band] = {"n": len(scores), "mean": safe_mean(scores), "median": safe_median(scores)}
+
+ # 4. Optimism-rigor inversion
+ optimism_rigor = {}
+ for key, sides in tensions.items():
+ pos = [c["score"] for c in sides["positive"]]
+ nua = [c["score"] for c in sides["nuanced"]]
+ optimism_rigor[key] = {
+ "positive_n": len(pos), "positive_mean": safe_mean(pos),
+ "nuanced_n": len(nua), "nuanced_mean": safe_mean(nua),
+ "gap": round(safe_mean(nua) - safe_mean(pos), 1),
+ }
+
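+ # 5. Quality homophily: of edges from a paper scoring >= threshold to any scored paper, the share whose target also scores >= threshold (compared against the baseline share of such papers)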
+ threshold = 60
+ high_quality_ids = {pid for pid, sc in score_map.items() if sc >= threshold}
+ baseline_pct = round(len(high_quality_ids) / total_papers * 100, 1) if total_papers else 0
+
+ cited_high = 0
+ cited_total = 0
+ for edge in citation_data.get("edges", []):
+ s, t = edge["source"], edge["target"]
+ if s in high_quality_ids and t in score_map:
+ cited_total += 1
+ if score_map[t] >= threshold:
+ cited_high += 1
+
+ homophily = {
+ "threshold": threshold,
+ "baseline_pct": baseline_pct,
+ "high_cite_high_pct": round(cited_high / cited_total * 100, 1) if cited_total else 0,
+ "high_cite_total": cited_total,
+ }
+
+ # 6. Sampling effect (first three checkpoints are hard-coded historical values; the last is computed from the current run)
+ sampling_effect = {
+ "checkpoints": [
+ {"n": 135, "median": 53.3},
+ {"n": 271, "median": 50.6},
+ {"n": 467, "median": 50.0},
+ {"n": total_papers, "median": round(median, 1)},
+ ]
+ }
+
+ # 7. Benchmark monoculture
+ bench_mono = {}
+ for y in sorted(benchmark_only_by_year.keys()):
+ d = benchmark_only_by_year[y]
+ if d["total"] > 0:
+ bench_mono[str(y)] = {
+ "benchmark_only": d["benchmark_only"],
+ "total": d["total"],
+ "pct": round(d["benchmark_only"] / d["total"] * 100, 1),
+ }
+
+ # 8. Funding gap
+ funding_gap = {}
+ for group, scores in funding_groups.items():
+ if scores:
+ funding_gap[group] = {"n": len(scores), "mean": safe_mean(scores), "median": safe_median(scores)}
+
+ # 9. Reproducibility drill-down
+ artifacts_qs = ["code_released", "data_released", "environment_specified", "reproduction_instructions"]
+ repro_detail = {}
+ for q in artifacts_qs:
+ key = f"artifacts.{q}"
+ d = question_pass_counts.get(key, {"passed": 0, "applicable": 0})
+ if d["applicable"] > 0:
+ repro_detail[q] = {"rate": round(d["passed"] / d["applicable"] * 100, 1), "n": d["applicable"]}
+ repro_detail["full_pass_count"] = repro_count
+ repro_detail["full_pass_pct"] = round(repro_count / total_papers * 100, 1) if total_papers else 0
+
+ findings = {
+ "question_rates": q_rates,
+ "year_category_trends": year_cat_trends,
+ "venue_stats": venue_stats,
+ "citation_band_stats": cit_band_stats,
+ "optimism_rigor": optimism_rigor,
+ "homophily": homophily,
+ "sampling_effect": sampling_effect,
+ "benchmark_monoculture": bench_mono,
+ "funding_gap": funding_gap,
+ "repro_detail": repro_detail,
+ "game_pcts": game_pcts,
+ }
+
# --- Citation network ---
v2_ids = {p["id"] for p in papers_full}
- score_map = {p["id"]: p["score"] for p in papers_full}
year_map = {p["id"]: p["year"] for p in papers_full}
all_graph_ids = {n["id"] for n in citation_data.get("nodes", [])}
@@ -401,7 +600,7 @@ def build():
papers_detail_dir = OUTPUT_DIR / "papers"
papers_detail_dir.mkdir(parents=True, exist_ok=True)
- # Add unscanned registry entries to papers-index (score=null, no detail page)
+ # Add unscanned registry entries to papers-index
scanned_ids = {p["id"] for p in papers_index}
for entry in registry.values():
if entry["id"] in scanned_ids:
@@ -419,9 +618,11 @@ def build():
"games": [],
"arxiv_id": entry.get("arxiv_id", ""),
"doi": entry.get("doi", ""),
+ "code_url": None,
})
write_json(OUTPUT_DIR / "dashboard.json", dashboard)
+ write_json(OUTPUT_DIR / "findings.json", findings)
write_json(OUTPUT_DIR / "papers-index.json", papers_index)
write_json(OUTPUT_DIR / "network.json", network)
write_json(OUTPUT_DIR / "tensions.json", tensions)
@@ -431,9 +632,10 @@ def build():
# Full monolith
explorer = {
- "generated": "2026-03-18",
+ "generated": "2026-03-21",
"papers": papers_full,
"agg": dashboard,
+ "findings": findings,
"tensions": tensions,
"citation_network": network,
}
@@ -441,15 +643,19 @@ def build():
# Report
dash_size = (OUTPUT_DIR / "dashboard.json").stat().st_size
+ find_size = (OUTPUT_DIR / "findings.json").stat().st_size
idx_size = (OUTPUT_DIR / "papers-index.json").stat().st_size
net_size = (OUTPUT_DIR / "network.json").stat().st_size
tens_size = (OUTPUT_DIR / "tensions.json").stat().st_size
full_size = (OUTPUT_DIR / "explorer.json").stat().st_size
+ code_url_count = sum(1 for p in papers_full if p.get("code_url"))
print(f"Papers: {total_papers}")
print(f"Median score: {median:.1f}%")
+ print(f"Code URLs extracted: {code_url_count}")
print(f"Network: {len(net_nodes)} nodes, {len(net_edges)} edges")
print(f"Files:")
print(f" dashboard.json: {dash_size:>8,} bytes")
+ print(f" findings.json: {find_size:>8,} bytes")
print(f" papers-index.json: {idx_size:>8,} bytes")
print(f" papers/*.json: {len(paper_details):>5} files")
print(f" network.json: {net_size:>8,} bytes")