ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

commit d240203118b1d2332118fdcb2cfd94594a523da2
parent 1818e336e2cc2445cd1006f83c3fa66c7eec7259
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sun, 22 Mar 2026 21:33:58 +0100

Add findings view with 10 analysis sections and code URL extraction

New #/findings view surfaces deep analysis that was only in docs:
- Per-question pass rates (67 questions, worst: self_comparison_bias 0.8%)
- Year trends by category with toggleable lines (contamination 29%→7%)
- Venue & citation scoring (500+ cites score below average)
- Optimism-rigor inversion (positive claims from weaker papers)
- Quality homophily (high-quality papers cite high-quality 3x more)
- Sampling effect (median drops as the long tail is scanned)
- Benchmark monoculture (58% pure benchmark-eval)
- Funding gap (13pp between disclosed/undisclosed)
- Reproducibility drill-down (4.2% fully reproducible)
- All 6 named games (added Cherry-picked Comparisons, All Show No Substance)

Also: extracted 282 code URLs from scan justification text, shown as a
"Code" link on paper detail pages.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M explorer/index.html | 1 +
A explorer/src/components/multi-line-chart.ts | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
M explorer/src/data.ts | 48 ++++++++++++++++++++++++++++++++++++------------
M explorer/src/main.ts | 2 ++
M explorer/src/style.css | 20 ++++++++++++++++++++
A explorer/src/views/findings.ts | 304 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M explorer/src/views/paper-detail.ts | 3 +++
M explorer/tests/explorer.spec.ts | 46 ++++++++++++++++++++++++++++++++++++++++++++++
M scripts/build-explorer-data.py | 234 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
9 files changed, 680 insertions(+), 26 deletions(-)

diff --git a/explorer/index.html b/explorer/index.html @@ -14,6 +14,7 @@ <a href="#/papers">Papers</a> <a href="#/network">Network</a> <a href="#/tensions">Tensions</a> + <a href="#/findings">Findings</a> </nav> <button id="theme-toggle" aria-label="Toggle theme">☀</button> </header> diff --git a/explorer/src/components/multi-line-chart.ts b/explorer/src/components/multi-line-chart.ts @@ -0,0 +1,48 @@ +export interface LineData { + label: string; + color: string; + points: { x: number; y: number }[]; +} + +export function renderMultiLineChart( + lines: LineData[], + xLabels: string[], + opts: { width?: number; height?: number; yLabel?: string } = {} +): string { + const w = opts.width || 600; + const h = opts.height || 220; + const pad = { l: 50, r: 20, t: 15, b: 30 }; + const chartW = w - pad.l - pad.r; + const chartH = h - pad.t - pad.b; + + const xScale = (i: number) => pad.l + (i / Math.max(xLabels.length - 1, 1)) * chartW; + const yScale = (v: number) => pad.t + chartH - (v / 100) * chartH; + + // Grid + labels + let svg = ''; + for (let v = 0; v <= 100; v += 25) { + svg += `<text x="${pad.l - 8}" y="${yScale(v) + 4}" text-anchor="end">${v}%</text>`; + svg += `<line class="grid-line" x1="${pad.l}" x2="${w - pad.r}" y1="${yScale(v)}" y2="${yScale(v)}" stroke-dasharray="3"/>`; + } + for (let i = 0; i < xLabels.length; i++) { + svg += `<text x="${xScale(i)}" y="${h - 5}" text-anchor="middle">${xLabels[i]}</text>`; + } + + // Lines + for (const line of lines) { + if (line.points.length < 2) continue; + const pts = line.points.map(p => `${xScale(p.x)},${yScale(p.y)}`).join(' '); + svg += `<polyline points="${pts}" fill="none" stroke="${line.color}" stroke-width="2"/>`; + for (const p of line.points) { + svg += `<circle cx="${xScale(p.x)}" cy="${yScale(p.y)}" r="3" fill="${line.color}"/>`; + } + } + + // Legend + const legend = lines.map(l => + `<span class="chart-legend-item"><span class="chart-legend-swatch" style="background:${l.color}"></span>${l.label}</span>` 
+ ).join(''); + + return `<svg viewBox="0 0 ${w} ${h}" style="width:100%;max-width:${w}px">${svg}</svg> + <div class="chart-legend">${legend}</div>`; +} diff --git a/explorer/src/data.ts b/explorer/src/data.ts @@ -16,7 +16,6 @@ export interface RedFlag { detail: string; } -// Slim entry for paper table (papers-index.json) export interface PaperIndex { id: string; title: string; @@ -28,18 +27,9 @@ export interface PaperIndex { games: string[]; arxiv_id: string; doi: string; + code_url: string | null; } -export interface Pipeline { - registry_total: number; - v2_scanned: number; - v1_needs_rescan: number; - has_text_no_scan: number; - no_text: number; - excluded: number; -} - -// Full detail for per-paper pages (papers/{slug}.json) export interface PaperDetail extends PaperIndex { category_scores: Record<string, number>; claims: Claim[]; @@ -56,6 +46,15 @@ export interface HistBin { count: number; } +export interface Pipeline { + registry_total: number; + v2_scanned: number; + v1_needs_rescan: number; + has_text_no_scan: number; + no_text: number; + excluded: number; +} + export interface Dashboard { n: number; median: number; @@ -70,6 +69,31 @@ export interface Dashboard { pipeline: Pipeline; } +export interface QuestionRate { + rate: number; + n: number; +} + +export interface GroupStat { + n: number; + mean: number; + median: number; +} + +export interface Findings { + question_rates: Record<string, QuestionRate>; + year_category_trends: Record<string, Record<string, number>>; + venue_stats: Record<string, GroupStat>; + citation_band_stats: Record<string, GroupStat>; + optimism_rigor: Record<string, { positive_n: number; positive_mean: number; nuanced_n: number; nuanced_mean: number; gap: number }>; + homophily: { threshold: number; baseline_pct: number; high_cite_high_pct: number; high_cite_total: number }; + sampling_effect: { checkpoints: { n: number; median: number }[] }; + benchmark_monoculture: Record<string, { benchmark_only: number; total: number; pct: 
number }>; + funding_gap: Record<string, GroupStat>; + repro_detail: Record<string, QuestionRate | number> & { full_pass_count: number; full_pass_pct: number }; + game_pcts: Record<string, number>; +} + export interface TensionClaim { paper_id: string; claim: string; @@ -97,7 +121,6 @@ export interface CitationNetwork { edges: [string, string][]; } -// Lazy-loaded per-view data const cache: Record<string, unknown> = {}; async function fetchJson<T>(path: string): Promise<T> { @@ -109,6 +132,7 @@ async function fetchJson<T>(path: string): Promise<T> { } export const loadDashboard = () => fetchJson<Dashboard>('/data/dashboard.json'); +export const loadFindings = () => fetchJson<Findings>('/data/findings.json'); export const loadPapersIndex = () => fetchJson<PaperIndex[]>('/data/papers-index.json'); export const loadPaperDetail = (slug: string) => fetchJson<PaperDetail>(`/data/papers/${slug}.json`); export const loadNetwork = () => fetchJson<CitationNetwork>('/data/network.json'); diff --git a/explorer/src/main.ts b/explorer/src/main.ts @@ -4,6 +4,7 @@ import { renderPapers } from './views/papers'; import { renderPaperDetail } from './views/paper-detail'; import { renderNetwork } from './views/network'; import { renderTensions } from './views/tensions'; +import { renderFindings } from './views/findings'; import { initTheme } from './theme'; function init() { @@ -15,6 +16,7 @@ function init() { route('/paper/:slug', ({ slug }) => renderPaperDetail(app, slug)); route('/network', () => renderNetwork(app)); route('/tensions', () => renderTensions(app)); + route('/findings', () => renderFindings(app)); startRouter(); } diff --git a/explorer/src/style.css b/explorer/src/style.css @@ -488,3 +488,23 @@ td.score { /* Year trend chart */ .trend-chart { margin-top: 0.5rem; } + +/* Multi-line chart legend */ +.chart-legend { display: flex; gap: 1rem; flex-wrap: wrap; margin-top: 0.5rem; } +.chart-legend-item { font-size: 0.75rem; color: var(--text-dim); display: flex; align-items: 
center; gap: 4px; } +.chart-legend-swatch { width: 14px; height: 3px; border-radius: 1px; display: inline-block; } + +/* Toggle buttons for category lines */ +.toggle-group { display: flex; gap: 0.5rem; flex-wrap: wrap; margin-bottom: 0.75rem; } +.toggle-btn { + font-size: 0.72rem; + padding: 0.2rem 0.55rem; + border: 1px solid var(--border); + border-radius: 3px; + background: none; + color: var(--text-dim); + cursor: pointer; + transition: all 0.15s; +} +.toggle-btn:hover { border-color: var(--text-dim); } +.toggle-btn.active { background: rgba(108, 140, 255, 0.08); } diff --git a/explorer/src/views/findings.ts b/explorer/src/views/findings.ts @@ -0,0 +1,304 @@ +import { loadFindings, type Findings, type QuestionRate } from '../data'; +import { renderBarChart } from '../components/bar-chart'; +import { renderMultiLineChart } from '../components/multi-line-chart'; + +function formatName(name: string): string { + return name.replace(/_/g, ' ').replace(/\b\w/g, c => c.toUpperCase()); +} + +const CAT_COLORS: Record<string, string> = { + contamination: '#f06565', + data_leakage: '#e08050', + statistical_methodology: '#6c8cff', + experimental_rigor: '#f0c050', + artifacts: '#3dd68c', + evaluation_design: '#a080f0', + claims_and_evidence: '#50c0c0', + survey_methodology: '#ff80b0', + setup_transparency: '#90b060', + limitations_and_scope: '#c0a060', + cost_and_practicality: '#8090a0', + human_studies: '#b070b0', + data_integrity: '#70a0d0', + conflicts_of_interest: '#d07070', +}; + +const TENSION_NAMES: Record<string, string> = { + productivity: 'Productivity', + benchmarks: 'Benchmarks', + agents: 'Agents', +}; + +export async function renderFindings(app: HTMLElement) { + app.innerHTML = '<div class="spinner"></div>'; + const f = await loadFindings(); + + app.innerHTML = ` + ${renderQuestionRates(f)} + ${renderYearCategoryTrends(f)} + ${renderVenueCitation(f)} + ${renderOptimismRigor(f)} + ${renderHomophily(f)} + ${renderSamplingEffect(f)} + 
${renderBenchmarkMonoculture(f)} + ${renderFundingGap(f)} + ${renderReproDetail(f)} + ${renderGames(f)} + `; + + // Attach toggle listeners for year-category chart + attachCategoryToggles(f); +} + +function renderQuestionRates(f: Findings): string { + const sorted = Object.entries(f.question_rates) + .sort((a, b) => a[1].rate - b[1].rate); + const worst20 = sorted.slice(0, 20); + const best10 = sorted.slice(-10).reverse(); + + return `<div class="section"> + <h2>Per-Question Pass Rates</h2> + <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:1rem">${sorted.length} questions across 14 categories. Sorted by pass rate, worst first.</p> + <h3 style="font-size:0.85rem;color:var(--red);margin-bottom:0.5rem">Worst 20</h3> + ${worst20.map(([key, d]) => { + const [cat, q] = key.split('.'); + const color = d.rate < 10 ? 'var(--red)' : d.rate < 30 ? 'var(--yellow)' : 'var(--accent)'; + return `<div class="hbar"> + <div class="hbar-label"><span>${formatName(q)} <span style="color:var(--text-dim);font-size:0.7rem">${formatName(cat)}</span></span><span>${d.rate}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div> + <div class="hbar-track"><div class="hbar-fill" style="width:${d.rate}%;background:${color}"></div></div> + </div>`; + }).join('')} + <h3 style="font-size:0.85rem;color:var(--green);margin:1rem 0 0.5rem">Best 10</h3> + ${best10.map(([key, d]) => { + const [cat, q] = key.split('.'); + return `<div class="hbar"> + <div class="hbar-label"><span>${formatName(q)} <span style="color:var(--text-dim);font-size:0.7rem">${formatName(cat)}</span></span><span>${d.rate}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div> + <div class="hbar-track"><div class="hbar-fill" style="width:${d.rate}%;background:var(--green)"></div></div> + </div>`; + }).join('')} + </div>`; +} + +function renderYearCategoryTrends(f: Findings): string { + const years = Object.keys(f.year_category_trends).sort(); + const defaultCats = ['contamination', 
'data_leakage', 'statistical_methodology', 'experimental_rigor']; + const allCats = Object.keys(CAT_COLORS); + + const toggles = allCats.map(cat => { + const active = defaultCats.includes(cat) ? ' active' : ''; + return `<button class="toggle-btn${active}" data-cat="${cat}" style="border-color:${active ? CAT_COLORS[cat] : ''};color:${active ? CAT_COLORS[cat] : ''}">${formatName(cat)}</button>`; + }).join(''); + + const lines = defaultCats.map(cat => ({ + label: formatName(cat), + color: CAT_COLORS[cat], + points: years.map((y, i) => ({ x: i, y: f.year_category_trends[y]?.[cat] ?? 0 })) + .filter(p => p.y > 0), + })); + + return `<div class="section"> + <h2>Year Trends by Category</h2> + <div class="toggle-group" id="cat-toggles">${toggles}</div> + <div id="cat-chart">${renderMultiLineChart(lines, years, { width: 700 })}</div> + </div>`; +} + +function attachCategoryToggles(f: Findings) { + const container = document.getElementById('cat-toggles'); + const chartEl = document.getElementById('cat-chart'); + if (!container || !chartEl) return; + + container.addEventListener('click', e => { + const btn = (e.target as HTMLElement).closest('.toggle-btn') as HTMLElement; + if (!btn) return; + btn.classList.toggle('active'); + const cat = btn.dataset.cat!; + const color = CAT_COLORS[cat]; + if (btn.classList.contains('active')) { + btn.style.borderColor = color; + btn.style.color = color; + } else { + btn.style.borderColor = ''; + btn.style.color = ''; + } + + // Re-render chart with active categories + const activeCats = Array.from(container.querySelectorAll('.toggle-btn.active')) + .map(b => (b as HTMLElement).dataset.cat!); + const years = Object.keys(f.year_category_trends).sort(); + const lines = activeCats.map(c => ({ + label: formatName(c), + color: CAT_COLORS[c], + points: years.map((y, i) => ({ x: i, y: f.year_category_trends[y]?.[c] ?? 
0 })) + .filter(p => p.y > 0), + })); + chartEl.innerHTML = renderMultiLineChart(lines, years, { width: 700 }); + }); +} + +function renderVenueCitation(f: Findings): string { + const venues = Object.entries(f.venue_stats).sort((a, b) => b[1].mean - a[1].mean); + const bands = ['0', '1-50', '51-500', '500+']; + + return `<div class="section"> + <h2>Venue & Citation Scoring</h2> + <div class="detail-grid"> + <div> + <h3 style="font-size:0.85rem;color:var(--text-dim);margin-bottom:0.5rem">Score by Venue (3+ papers)</h3> + ${venues.map(([v, d]) => { + const color = d.mean < 40 ? 'var(--red)' : d.mean < 55 ? 'var(--yellow)' : 'var(--green)'; + return `<div class="hbar"> + <div class="hbar-label"><span>${v}</span><span>${d.mean}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div> + <div class="hbar-track"><div class="hbar-fill" style="width:${d.mean}%;background:${color}"></div></div> + </div>`; + }).join('')} + </div> + <div> + <h3 style="font-size:0.85rem;color:var(--text-dim);margin-bottom:0.5rem">Score by Citation Count</h3> + ${bands.map(band => { + const d = f.citation_band_stats[band]; + if (!d) return ''; + const color = d.mean < 40 ? 'var(--red)' : d.mean < 55 ? 'var(--yellow)' : 'var(--green)'; + return `<div class="hbar"> + <div class="hbar-label"><span>${band} citations</span><span>${d.mean}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div> + <div class="hbar-track"><div class="hbar-fill" style="width:${d.mean}%;background:${color}"></div></div> + </div>`; + }).join('')} + <p style="font-size:0.8rem;color:var(--text-dim);margin-top:0.75rem">Most-cited papers (500+) score <strong style="color:var(--red)">below average</strong>. 
Citations measure influence, not rigor.</p> + </div> + </div> + </div>`; +} + +function renderOptimismRigor(f: Findings): string { + return `<div class="section"> + <h2>Optimism-Rigor Inversion</h2> + <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:1rem">Across all three claim tensions, papers making positive/optimistic claims have <strong>lower</strong> methodology scores than papers with nuanced findings.</p> + ${Object.entries(f.optimism_rigor).map(([key, d]) => ` + <div class="game-row"> + <span class="game-name">${TENSION_NAMES[key] || key}</span> + <span style="font-family:var(--font);font-size:0.85rem"> + <span style="color:var(--yellow)">Positive ${d.positive_mean}%</span> + <span style="color:var(--text-dim)"> vs </span> + <span style="color:var(--green)">Nuanced ${d.nuanced_mean}%</span> + <span style="color:var(--accent)"> (+${d.gap}pp)</span> + </span> + </div> + `).join('')} + </div>`; +} + +function renderHomophily(f: Findings): string { + const h = f.homophily; + const ratio = h.high_cite_total > 0 ? (h.high_cite_high_pct / h.baseline_pct).toFixed(1) : '?'; + return `<div class="section"> + <h2>Quality Homophily</h2> + <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:1rem">Do high-quality papers cite other high-quality papers more than expected? 
(threshold: ${h.threshold}%+ score)</p> + <div class="hbar"> + <div class="hbar-label"><span>Expected (baseline)</span><span>${h.baseline_pct}%</span></div> + <div class="hbar-track"><div class="hbar-fill" style="width:${h.baseline_pct}%;background:var(--text-dim)"></div></div> + </div> + <div class="hbar"> + <div class="hbar-label"><span>Observed (high cites high)</span><span>${h.high_cite_high_pct}%</span></div> + <div class="hbar-track"><div class="hbar-fill" style="width:${Math.min(h.high_cite_high_pct, 100)}%;background:var(--green)"></div></div> + </div> + <p style="font-size:0.85rem;margin-top:0.5rem"><strong>${ratio}x</strong> more likely to cite high-quality work <span style="color:var(--text-dim)">(n=${h.high_cite_total} citations)</span></p> + </div>`; +} + +function renderSamplingEffect(f: Findings): string { + const pts = f.sampling_effect.checkpoints; + const w = 400, h = 150; + const pad = { l: 50, r: 20, t: 15, b: 30 }; + const chartW = w - pad.l - pad.r; + const chartH = h - pad.t - pad.b; + + const xScale = (i: number) => pad.l + (i / (pts.length - 1)) * chartW; + const yMin = 40, yMax = 60; + const yScale = (v: number) => pad.t + chartH - ((v - yMin) / (yMax - yMin)) * chartH; + + let svg = ''; + for (let v = yMin; v <= yMax; v += 5) { + svg += `<text x="${pad.l - 8}" y="${yScale(v) + 4}" text-anchor="end">${v}%</text>`; + svg += `<line class="grid-line" x1="${pad.l}" x2="${w - pad.r}" y1="${yScale(v)}" y2="${yScale(v)}" stroke-dasharray="3"/>`; + } + + const polyline = pts.map((p, i) => `${xScale(i)},${yScale(p.median)}`).join(' '); + svg += `<polyline points="${polyline}" fill="none" stroke="var(--yellow)" stroke-width="2"/>`; + pts.forEach((p, i) => { + svg += `<circle cx="${xScale(i)}" cy="${yScale(p.median)}" r="4" fill="var(--yellow)"/>`; + svg += `<text x="${xScale(i)}" y="${yScale(p.median) - 10}" text-anchor="middle" fill="var(--yellow)" font-size="11">${p.median}%</text>`; + svg += `<text x="${xScale(i)}" y="${h - 5}" 
text-anchor="middle">n=${p.n}</text>`; + }); + + return `<div class="section"> + <h2>Sampling Effect</h2> + <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:0.5rem">Median score drops as the long tail is scanned. Visibility correlates with quality.</p> + <svg viewBox="0 0 ${w} ${h}" style="width:100%;max-width:${w}px">${svg}</svg> + </div>`; +} + +function renderBenchmarkMonoculture(f: Findings): string { + const years = Object.keys(f.benchmark_monoculture).sort(); + const pts = years.map((y, i) => ({ x: i, y: f.benchmark_monoculture[y].pct })); + const lines = [{ + label: 'Benchmark-only papers', + color: '#f0c050', + points: pts, + }]; + + return `<div class="section"> + <h2>Benchmark Monoculture</h2> + <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:0.5rem">Share of papers using only benchmark evaluation, no other methodology.</p> + ${renderMultiLineChart(lines, years)} + ${years.length > 0 ? `<p style="font-size:0.85rem;margin-top:0.5rem">${f.benchmark_monoculture[years[years.length - 1]]?.pct ?? 
0}% of ${years[years.length - 1]} papers are pure benchmark-eval.</p>` : ''} + </div>`; +} + +function renderFundingGap(f: Findings): string { + const disc = f.funding_gap.disclosed; + const nodisc = f.funding_gap.not_disclosed; + if (!disc || !nodisc) return ''; + const gap = (disc.mean - nodisc.mean).toFixed(1); + + return `<div class="section"> + <h2>Funding Disclosure Gap</h2> + <div class="hbar"> + <div class="hbar-label"><span>Funding disclosed</span><span>${disc.mean}% <span style="color:var(--text-dim)">(n=${disc.n})</span></span></div> + <div class="hbar-track"><div class="hbar-fill" style="width:${disc.mean}%;background:var(--green)"></div></div> + </div> + <div class="hbar"> + <div class="hbar-label"><span>Not disclosed</span><span>${nodisc.mean}% <span style="color:var(--text-dim)">(n=${nodisc.n})</span></span></div> + <div class="hbar-track"><div class="hbar-fill" style="width:${nodisc.mean}%;background:var(--red)"></div></div> + </div> + <p style="font-size:0.85rem;margin-top:0.5rem"><strong>${gap}pp gap</strong> — papers that disclose funding score substantially higher.</p> + </div>`; +} + +function renderReproDetail(f: Findings): string { + const qs = ['code_released', 'data_released', 'environment_specified', 'reproduction_instructions']; + return `<div class="section"> + <h2>Reproducibility Drill-Down</h2> + ${qs.map(q => { + const d = f.repro_detail[q] as QuestionRate | undefined; + if (!d || typeof d === 'number') return ''; + const color = d.rate < 15 ? 'var(--red)' : d.rate < 50 ? 
'var(--yellow)' : 'var(--green)'; + return `<div class="hbar"> + <div class="hbar-label"><span>${formatName(q)}</span><span>${d.rate}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div> + <div class="hbar-track"><div class="hbar-fill" style="width:${d.rate}%;background:${color}"></div></div> + </div>`; + }).join('')} + <p style="font-size:1rem;margin-top:1rem;font-weight:600"><span style="color:var(--red);font-family:var(--font);font-size:1.3rem">${f.repro_detail.full_pass_pct}%</span> of papers are fully reproducible <span style="color:var(--text-dim)">(${f.repro_detail.full_pass_count} papers pass all 4 criteria)</span></p> + </div>`; +} + +function renderGames(f: Findings): string { + const sorted = Object.entries(f.game_pcts).sort((a, b) => b[1] - a[1]); + return `<div class="section"> + <h2>Named Games</h2> + <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:0.75rem">Recurring methodological patterns detected across the corpus.</p> + ${sorted.map(([name, pct]) => + `<div class="game-row"><span class="game-name">${name}</span><span class="game-pct">${pct}%</span></div>` + ).join('')} + </div>`; +} diff --git a/explorer/src/views/paper-detail.ts b/explorer/src/views/paper-detail.ts @@ -50,6 +50,9 @@ export async function renderPaperDetail(app: HTMLElement, slug: string) { if (paper.source_url && !paper.source_url.includes('arxiv.org')) { links.push(`<a href="${paper.source_url}" target="_blank" rel="noopener">Source</a>`); } + if (paper.code_url) { + links.push(`<a href="${paper.code_url}" target="_blank" rel="noopener">Code</a>`); + } // Load network for citations (lazy, non-blocking) let incomingHtml = ''; diff --git a/explorer/tests/explorer.spec.ts b/explorer/tests/explorer.spec.ts @@ -202,6 +202,52 @@ test.describe('Navigation', () => { }); }); +test.describe('Findings', () => { + test('loads and shows all sections', async ({ page }) => { + await page.goto('/#/findings'); + await 
expect(page.locator('.section').first()).toBeVisible({ timeout: 10000 }); + // Should have 10 sections + expect(await page.locator('.section').count()).toBe(10); + }); + + test('shows per-question pass rates', async ({ page }) => { + await page.goto('/#/findings'); + await expect(page.locator('.section').first()).toBeVisible({ timeout: 10000 }); + await expect(page.locator('h2', { hasText: 'Per-Question Pass Rates' })).toBeVisible(); + // Should have horizontal bars + expect(await page.locator('.section').first().locator('.hbar').count()).toBeGreaterThan(10); + }); + + test('shows year category trends with toggles', async ({ page }) => { + await page.goto('/#/findings'); + await expect(page.locator('#cat-toggles')).toBeVisible({ timeout: 10000 }); + const activeToggles = page.locator('.toggle-btn.active'); + expect(await activeToggles.count()).toBe(4); + // Click a toggle to deactivate + await activeToggles.first().click(); + expect(await page.locator('.toggle-btn.active').count()).toBe(3); + }); + + test('shows funding gap', async ({ page }) => { + await page.goto('/#/findings'); + await expect(page.locator('h2', { hasText: 'Funding Disclosure Gap' })).toBeVisible({ timeout: 10000 }); + await expect(page.locator('text=pp gap')).toBeVisible(); + }); + + test('shows reproducibility drill-down', async ({ page }) => { + await page.goto('/#/findings'); + await expect(page.locator('h2', { hasText: 'Reproducibility Drill-Down' })).toBeVisible({ timeout: 10000 }); + await expect(page.locator('text=fully reproducible')).toBeVisible(); + }); + + test('shows 6 named games', async ({ page }) => { + await page.goto('/#/findings'); + const gamesSection = page.locator('.section', { has: page.locator('h2', { hasText: 'Named Games' }) }); + await expect(gamesSection).toBeVisible({ timeout: 10000 }); + expect(await gamesSection.locator('.game-row').count()).toBe(6); + }); +}); + test.describe('Theme', () => { test('toggle switches theme', async ({ page }) => { await page.goto('/'); 
diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py @@ -6,8 +6,9 @@ Reads v2 scan.json files, metadata.json, citation-graph.json, and registry.jsonl Outputs view-specific JSON files for fast loading, plus a full explorer.json for power users. Output files: - explorer/public/data/dashboard.json — aggregation stats only (~0.5KB) - explorer/public/data/papers-index.json — table data without checklists (~200KB) + explorer/public/data/dashboard.json — aggregation stats only + explorer/public/data/findings.json — deep analysis findings + explorer/public/data/papers-index.json — table data without checklists explorer/public/data/papers/{slug}.json — full detail per paper explorer/public/data/network.json — citation network explorer/public/data/tensions.json — claim tensions @@ -18,6 +19,7 @@ Usage: """ import json +import re from collections import Counter, defaultdict from pathlib import Path @@ -38,6 +40,10 @@ CONDITIONAL_CATEGORIES = [ ] ALL_CATEGORIES = BASE_CATEGORIES + CONDITIONAL_CATEGORIES +CODE_URL_RE = re.compile( + r'https?://(?:github\.com|gitlab\.com|zenodo\.org|bitbucket\.org|huggingface\.co)[^\s,)\"\'<>]+' +) + def classify_archetype(cat_scores): ed = cat_scores.get("evaluation_design", 0) @@ -125,9 +131,28 @@ def detect_games(checklist, score, cat_scores): bc = checklist.get("contamination", {}).get("benchmark_contamination_addressed", {}) if bc.get("applies") and not bc.get("answer"): games.append("Contamination Dodge") + # Cherry-picked Comparisons + bc2 = checklist.get("evaluation_design", {}).get("baselines_contemporary", {}) + if bc2.get("applies") and not bc2.get("answer"): + games.append("Cherry-picked Comparisons") + # All Show No Substance + ed = cat_scores.get("evaluation_design", 0) + sm = cat_scores.get("statistical_methodology", 0) + ar = cat_scores.get("artifacts", 0) + if ed >= 0.8 and sm < 0.2 and ar < 0.2: + games.append("All Show No Substance") return games +def extract_code_url(checklist): + cr = 
checklist.get("artifacts", {}).get("code_released", {}) + if cr.get("applies") and cr.get("answer"): + urls = CODE_URL_RE.findall(cr.get("justification", "")) + if urls: + return urls[0].rstrip(".,;:") + return None + + def load_registry(): entries = {} with open(REGISTRY_PATH) as f: @@ -161,14 +186,25 @@ def write_json(path, data): json.dump(data, f, ensure_ascii=False, separators=(",", ":")) +def safe_mean(scores): + return round(sum(scores) / len(scores), 1) if scores else 0 + + +def safe_median(scores): + if not scores: + return 0 + s = sorted(scores) + return round(s[len(s) // 2], 1) + + def build(): registry = load_registry() citation_data = load_citation_graph() # Accumulators - papers_full = [] # full paper objects (for explorer.json) - papers_index = [] # slim objects (for papers-index.json) - paper_details = {} # slug -> detail object (for papers/{slug}.json) + papers_full = [] + papers_index = [] + paper_details = {} all_scores = [] cat_pass_counts = defaultdict(lambda: {"passed": 0, "applicable": 0}) year_scores = defaultdict(list) @@ -177,6 +213,15 @@ def build(): game_counts = Counter() total_papers = 0 + # Findings accumulators + question_pass_counts = defaultdict(lambda: {"passed": 0, "applicable": 0}) + year_cat_scores = defaultdict(lambda: defaultdict(lambda: {"passed": 0, "applicable": 0})) + venue_scores = defaultdict(list) + citation_band_scores = defaultdict(list) + benchmark_only_by_year = defaultdict(lambda: {"benchmark_only": 0, "total": 0}) + funding_groups = {"disclosed": [], "not_disclosed": []} + score_map = {} # paper_id -> score_pct (built incrementally for homophily) + tensions = { "productivity": {"positive": [], "nuanced": []}, "benchmarks": {"positive": [], "nuanced": []}, @@ -211,6 +256,7 @@ def build(): total_papers += 1 score_pct = round(overall * 100, 1) all_scores.append(score_pct) + score_map[paper_id] = score_pct year = paper_meta.get("year") or reg_entry.get("year") venue = metadata.get("venue") or paper_meta.get("venue") 
or reg_entry.get("venue", "") @@ -223,6 +269,9 @@ def build(): doi = paper_meta.get("doi") or reg_entry.get("doi", "") source_url = reg_entry.get("source_url", "") + # Code URL extraction + code_url = extract_code_url(checklist) + year_scores[year].append(score_pct) for t in tags: tag_counts[t] += 1 @@ -230,7 +279,7 @@ def build(): for g in games: game_counts[g] += 1 - # Category aggregations + # Category + question aggregations for cat in ALL_CATEGORIES: cat_data = checklist.get(cat, {}) if not isinstance(cat_data, dict): @@ -240,8 +289,47 @@ def build(): continue if q_data["applies"]: cat_pass_counts[cat]["applicable"] += 1 + question_pass_counts[f"{cat}.{q_name}"]["applicable"] += 1 if q_data.get("answer", False): cat_pass_counts[cat]["passed"] += 1 + question_pass_counts[f"{cat}.{q_name}"]["passed"] += 1 + # Year × category + if year: + year_cat_scores[year][cat]["applicable"] += 1 + if q_data.get("answer", False): + year_cat_scores[year][cat]["passed"] += 1 + + # Venue scoring (skip arXiv — it's a preprint server, not a venue) + venue_clean = venue.strip() + if venue_clean and venue_clean.lower() not in ("arxiv", "arxiv.org", ""): + venue_scores[venue_clean].append(score_pct) + + # Citation band scoring + cit = metadata.get("citation_count") + if cit is not None: + if cit == 0: + band = "0" + elif cit <= 50: + band = "1-50" + elif cit <= 500: + band = "51-500" + else: + band = "500+" + citation_band_scores[band].append(score_pct) + + # Benchmark monoculture + if year: + benchmark_only_by_year[year]["total"] += 1 + if tags == ["benchmark-eval"]: + benchmark_only_by_year[year]["benchmark_only"] += 1 + + # Funding gap + fd = checklist.get("conflicts_of_interest", {}).get("funding_disclosed", {}) + if fd.get("applies"): + if fd.get("answer"): + funding_groups["disclosed"].append(score_pct) + else: + funding_groups["not_disclosed"].append(score_pct) claims = scan.get("claims", []) red_flags = scan.get("red_flags", []) @@ -270,7 +358,7 @@ def build(): cat_scores_pct 
= {k: round(v * 100, 1) for k, v in cat_scores.items()} - # Slim index entry (for table) + # Slim index entry index_entry = { "id": paper_id, "title": paper_meta.get("title", reg_entry.get("title", paper_id)), @@ -282,10 +370,11 @@ def build(): "games": games, "arxiv_id": arxiv_id, "doi": doi, + "code_url": code_url, } papers_index.append(index_entry) - # Full detail (for per-paper file) + # Full detail detail = { **index_entry, "category_scores": cat_scores_pct, @@ -297,11 +386,9 @@ def build(): "source_url": source_url, } paper_details[paper_id] = detail - - # Full entry for explorer.json papers_full.append(detail) - # --- Aggregations --- + # --- Dashboard aggregations --- all_scores.sort() n = len(all_scores) median = all_scores[n // 2] if n else 0 @@ -368,9 +455,121 @@ def build(): "pipeline": pipeline, } + # --- Findings aggregations --- + + # 1. Per-question pass rates + q_rates = {} + for key, d in question_pass_counts.items(): + if d["applicable"] > 0: + q_rates[key] = {"rate": round(d["passed"] / d["applicable"] * 100, 1), "n": d["applicable"]} + + # 2. Year trends by category + year_cat_trends = {} + for y in sorted(year_cat_scores.keys()): + year_cat_trends[str(y)] = {} + for cat in ALL_CATEGORIES: + d = year_cat_scores[y][cat] + if d["applicable"] > 0: + year_cat_trends[str(y)][cat] = round(d["passed"] / d["applicable"] * 100, 1) + + # 3. Venue & citation scoring + venue_stats = {} + for v, scores in venue_scores.items(): + if len(scores) >= 3: + venue_stats[v] = {"n": len(scores), "mean": safe_mean(scores), "median": safe_median(scores)} + + cit_band_stats = {} + for band in ["0", "1-50", "51-500", "500+"]: + scores = citation_band_scores.get(band, []) + if scores: + cit_band_stats[band] = {"n": len(scores), "mean": safe_mean(scores), "median": safe_median(scores)} + + # 4. 
Optimism-rigor inversion + optimism_rigor = {} + for key, sides in tensions.items(): + pos = [c["score"] for c in sides["positive"]] + nua = [c["score"] for c in sides["nuanced"]] + optimism_rigor[key] = { + "positive_n": len(pos), "positive_mean": safe_mean(pos), + "nuanced_n": len(nua), "nuanced_mean": safe_mean(nua), + "gap": round(safe_mean(nua) - safe_mean(pos), 1), + } + + # 5. Quality homophily + threshold = 60 + high_quality_ids = {pid for pid, sc in score_map.items() if sc >= threshold} + baseline_pct = round(len(high_quality_ids) / total_papers * 100, 1) if total_papers else 0 + + cited_high = 0 + cited_total = 0 + for edge in citation_data.get("edges", []): + s, t = edge["source"], edge["target"] + if s in high_quality_ids and t in score_map: + cited_total += 1 + if score_map[t] >= threshold: + cited_high += 1 + + homophily = { + "threshold": threshold, + "baseline_pct": baseline_pct, + "high_cite_high_pct": round(cited_high / cited_total * 100, 1) if cited_total else 0, + "high_cite_total": cited_total, + } + + # 6. Sampling effect (historical checkpoints + current) + sampling_effect = { + "checkpoints": [ + {"n": 135, "median": 53.3}, + {"n": 271, "median": 50.6}, + {"n": 467, "median": 50.0}, + {"n": total_papers, "median": round(median, 1)}, + ] + } + + # 7. Benchmark monoculture + bench_mono = {} + for y in sorted(benchmark_only_by_year.keys()): + d = benchmark_only_by_year[y] + if d["total"] > 0: + bench_mono[str(y)] = { + "benchmark_only": d["benchmark_only"], + "total": d["total"], + "pct": round(d["benchmark_only"] / d["total"] * 100, 1), + } + + # 8. Funding gap + funding_gap = {} + for group, scores in funding_groups.items(): + if scores: + funding_gap[group] = {"n": len(scores), "mean": safe_mean(scores), "median": safe_median(scores)} + + # 9. 
Reproducibility drill-down + artifacts_qs = ["code_released", "data_released", "environment_specified", "reproduction_instructions"] + repro_detail = {} + for q in artifacts_qs: + key = f"artifacts.{q}" + d = question_pass_counts.get(key, {"passed": 0, "applicable": 0}) + if d["applicable"] > 0: + repro_detail[q] = {"rate": round(d["passed"] / d["applicable"] * 100, 1), "n": d["applicable"]} + repro_detail["full_pass_count"] = repro_count + repro_detail["full_pass_pct"] = round(repro_count / total_papers * 100, 1) if total_papers else 0 + + findings = { + "question_rates": q_rates, + "year_category_trends": year_cat_trends, + "venue_stats": venue_stats, + "citation_band_stats": cit_band_stats, + "optimism_rigor": optimism_rigor, + "homophily": homophily, + "sampling_effect": sampling_effect, + "benchmark_monoculture": bench_mono, + "funding_gap": funding_gap, + "repro_detail": repro_detail, + "game_pcts": game_pcts, + } + # --- Citation network --- v2_ids = {p["id"] for p in papers_full} - score_map = {p["id"]: p["score"] for p in papers_full} year_map = {p["id"]: p["year"] for p in papers_full} all_graph_ids = {n["id"] for n in citation_data.get("nodes", [])} @@ -401,7 +600,7 @@ def build(): papers_detail_dir = OUTPUT_DIR / "papers" papers_detail_dir.mkdir(parents=True, exist_ok=True) - # Add unscanned registry entries to papers-index (score=null, no detail page) + # Add unscanned registry entries to papers-index scanned_ids = {p["id"] for p in papers_index} for entry in registry.values(): if entry["id"] in scanned_ids: @@ -419,9 +618,11 @@ def build(): "games": [], "arxiv_id": entry.get("arxiv_id", ""), "doi": entry.get("doi", ""), + "code_url": None, }) write_json(OUTPUT_DIR / "dashboard.json", dashboard) + write_json(OUTPUT_DIR / "findings.json", findings) write_json(OUTPUT_DIR / "papers-index.json", papers_index) write_json(OUTPUT_DIR / "network.json", network) write_json(OUTPUT_DIR / "tensions.json", tensions) @@ -431,9 +632,10 @@ def build(): # Full monolith 
explorer = { - "generated": "2026-03-18", + "generated": "2026-03-21", "papers": papers_full, "agg": dashboard, + "findings": findings, "tensions": tensions, "citation_network": network, } @@ -441,15 +643,19 @@ def build(): # Report dash_size = (OUTPUT_DIR / "dashboard.json").stat().st_size + find_size = (OUTPUT_DIR / "findings.json").stat().st_size idx_size = (OUTPUT_DIR / "papers-index.json").stat().st_size net_size = (OUTPUT_DIR / "network.json").stat().st_size tens_size = (OUTPUT_DIR / "tensions.json").stat().st_size full_size = (OUTPUT_DIR / "explorer.json").stat().st_size + code_url_count = sum(1 for p in papers_full if p.get("code_url")) print(f"Papers: {total_papers}") print(f"Median score: {median:.1f}%") + print(f"Code URLs extracted: {code_url_count}") print(f"Network: {len(net_nodes)} nodes, {len(net_edges)} edges") print(f"Files:") print(f" dashboard.json: {dash_size:>8,} bytes") + print(f" findings.json: {find_size:>8,} bytes") print(f" papers-index.json: {idx_size:>8,} bytes") print(f" papers/*.json: {len(paper_details):>5} files") print(f" network.json: {net_size:>8,} bytes")

Impressum · Datenschutz