commit 96e47d0e1acc7b77cf730518cd97e6b0cb6f419b
parent 6d3758b3b52628e5f9c0bb8cb38aae235f766dde
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Mon, 23 Mar 2026 09:59:27 +0100
Rebuild citation network, add ego mode, quality flow, and network findings
Network rebuilt from cited_papers (960 nodes, 2952 edges vs old 572/715).
Scanned 3 foundational papers: Codex 71.7%, CoT 56.6%, ReAct 48.2%.
Network view:
- Directed arrows on edges (visible when zoomed in or in ego mode)
- Click node = ego mode: shows 1-hop neighborhood with in/out distinction,
color-coded edges (blue=cited-by, orange=cites), info panel with stats
- Double-click = navigate to paper detail
- Edge color toggle: default or quality flow (green=good→good, red=weak→good, yellow=cites weak, gray=unscored endpoint)
- Escape or "Show all" exits ego mode
- Hover shows "Cites N / Cited by M" with directional counts
Findings:
- Citation Network Insights section with foundational paper leaderboard,
quality contagion gradient (43.1% → 52.3%), and rigor diffusion table
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
10 files changed, 1381 insertions(+), 307 deletions(-)
diff --git a/explorer/src/data.ts b/explorer/src/data.ts
@@ -93,6 +93,11 @@ export interface Findings {
funding_gap: Record<string, GroupStat>;
repro_detail: Record<string, QuestionRate | number> & { full_pass_count: number; full_pass_pct: number };
game_pcts: Record<string, number>;
+ network_insights: {
+ foundational: { id: string; title: string; in_degree: number; score: number | null }[];
+ quality_contagion: Record<string, { n: number; mean: number }>;
+ rigor_diffusion: { id: string; title: string; score: number | null; in_degree: number; citer_mean: number | null; citer_n: number }[];
+ };
correlation: {
categories: string[];
matrix: { r: number | null; n: number }[][];
@@ -126,6 +131,7 @@ export interface NetNode {
score: number | null;
year: number | null;
in_degree: number;
+ out_degree: number;
has_scan: boolean;
}
diff --git a/explorer/src/style.css b/explorer/src/style.css
@@ -362,6 +362,26 @@ td.score {
cursor: grab;
}
#network-canvas:active { cursor: grabbing; }
+.ego-panel { /* Ego-mode info panel (#ego-panel in the network view); capped at 400px and scrolls when its lists are longer */
+ background: var(--surface);
+ border: 1px solid var(--border);
+ border-radius: 8px;
+ padding: 1rem;
+ margin-top: 1rem;
+ max-height: 400px;
+ overflow-y: auto;
+}
+.ego-header { margin-bottom: 0.5rem; font-size: 0.9rem; } /* selected paper title, score and detail link */
+.ego-stats { font-size: 0.8rem; margin-bottom: 0.75rem; } /* "Cited by N / Cites M" summary line */
+.ego-list { margin-bottom: 0.75rem; } /* one block per direction (cited-by, cites) */
+.ego-list-label { font-size: 0.75rem; font-weight: 600; margin-bottom: 0.25rem; }
+.ego-list-item { /* one neighbor row; the network view attaches a click handler that re-focuses ego mode on it */
+ font-size: 0.8rem;
+ padding: 0.15rem 0;
+ cursor: pointer;
+ border-bottom: 1px solid var(--border);
+}
+.ego-list-item:hover { color: var(--accent); }
.network-tooltip {
position: fixed;
background: var(--surface);
diff --git a/explorer/src/views/findings.ts b/explorer/src/views/findings.ts
@@ -46,6 +46,7 @@ export async function renderFindings(app: HTMLElement) {
${renderBenchmarkMonoculture(f)}
${renderFundingGap(f)}
${renderReproDetail(f)}
+ ${renderNetworkInsights(f)}
${renderGames(f)}
`;
@@ -523,6 +524,46 @@ function renderReproDetail(f: Findings): string {
</div>`;
}
+/**
+ * Render the "Citation Network Insights" section of the findings page.
+ *
+ * Output is a table built from the first 15 `rigor_diffusion` entries
+ * (paper, in-degree, own score, mean score of its citers) followed by one
+ * horizontal bar per quality-contagion band. `network_insights.foundational`
+ * is not rendered here; the table is driven by `rigor_diffusion` because
+ * only those entries carry `citer_mean`.
+ *
+ * @param f - Loaded findings payload.
+ * @returns Section HTML, or '' when the payload carries no `network_insights`.
+ */
+function renderNetworkInsights(f: Findings): string {
+  // Typed access: `Findings` declares `network_insights`, so no `any` cast is needed.
+  // The runtime guard stays because a findings file may still lack the key.
+  const ni = f.network_insights;
+  if (!ni) return '';
+
+  // Titles and ids come from the data file and are spliced into HTML below;
+  // encode markup characters so e.g. "<" or "&" in a title cannot break the page.
+  const esc = (s: string): string =>
+    s.replace(/[&<>"']/g, ch => `&#${ch.charCodeAt(0)};`);
+  const shortTitle = (t: string): string =>
+    esc(t.length > 55 ? t.slice(0, 52) + '...' : t);
+
+  const diffusion = ni.rigor_diffusion ?? [];
+  const contagion = ni.quality_contagion ?? {};
+
+  const tableRows = diffusion.slice(0, 15).map(p => {
+    const sc = p.score != null ? `<span style="color:${p.score < 40 ? 'var(--red)' : p.score < 55 ? 'var(--yellow)' : 'var(--green)'}">${p.score}%</span>` : '--';
+    const cm = p.citer_mean != null ? `${p.citer_mean}%` : '--';
+    const title = shortTitle(p.title);
+    // Only scored papers link to a detail page.
+    const titleCell = p.score != null
+      ? `<a href="#/paper/${esc(p.id)}" style="color:var(--accent);text-decoration:none">${title}</a>`
+      : title;
+    return `<tr>
+      <td>${titleCell}</td>
+      <td style="font-family:var(--font)">${p.in_degree}</td>
+      <td class="score">${sc}</td>
+      <td style="font-family:var(--font)">${cm}</td>
+    </tr>`;
+  }).join('');
+
+  // Bands are listed explicitly to fix their display order; missing bands are skipped.
+  const contagionBars = ['0%', '1-33%', '34-66%', '67-100%'].map(band => {
+    const d = contagion[band];
+    if (!d) return '';
+    const color = d.mean < 45 ? 'var(--red)' : d.mean < 50 ? 'var(--yellow)' : 'var(--green)';
+    return `<div class="hbar">
+      <div class="hbar-label"><span>${band} high-quality refs</span><span>${d.mean}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div>
+      <div class="hbar-track"><div class="hbar-fill" style="width:${d.mean}%;background:${color}"></div></div>
+    </div>`;
+  }).join('');
+
+  return `<div class="section">
+    <h2>Citation Network Insights</h2>
+
+    <h3 style="font-size:0.85rem;color:var(--text-dim);margin-bottom:0.5rem">Foundational Papers (most internally cited)</h3>
+    <div class="table-wrap"><table style="font-size:0.82rem">
+      <thead><tr><th>Paper</th><th>Citations</th><th>Score</th><th>Citer Mean</th></tr></thead>
+      <tbody>${tableRows}</tbody>
+    </table></div>
+
+    <h3 style="font-size:0.85rem;color:var(--text-dim);margin:1.5rem 0 0.5rem">Quality Contagion — You Are Who You Cite</h3>
+    <p style="font-size:0.82rem;color:var(--text-dim);margin-bottom:0.5rem">Mean methodology score by the proportion of high-quality (\u226550%) papers in a paper's reference list.</p>
+    ${contagionBars}
+  </div>`;
+}
+
function renderGames(f: Findings): string {
const sorted = Object.entries(f.game_pcts).sort((a, b) => b[1] - a[1]);
return `<div class="section">
diff --git a/explorer/src/views/network.ts b/explorer/src/views/network.ts
@@ -8,6 +8,8 @@ interface SimNode extends NetNode {
vy: number;
}
+type EdgeMode = 'none' | 'quality';
+
export async function renderNetwork(app: HTMLElement) {
app.innerHTML = '<div class="spinner"></div>';
const { nodes, edges } = await loadNetwork();
@@ -16,15 +18,25 @@ export async function renderNetwork(app: HTMLElement) {
<div class="filters" style="margin-bottom:1rem">
<label style="font-size:0.8rem;color:var(--text-dim)">Min connections: <input type="number" id="net-min-conn" value="1" min="0" max="50" style="width:50px"></label>
<label style="font-size:0.8rem;color:var(--text-dim)">Scanned only: <input type="checkbox" id="net-scanned-only"></label>
+ <label style="font-size:0.8rem;color:var(--text-dim)">Edge color:
+ <select id="net-edge-mode">
+ <option value="none">Default</option>
+ <option value="quality">Quality flow</option>
+ </select>
+ </label>
<span class="filter-count" id="net-count"></span>
+ <button id="net-reset-ego" style="display:none;font-size:0.75rem;padding:0.2rem 0.6rem;border:1px solid var(--border);border-radius:3px;background:none;color:var(--accent);cursor:pointer">Show all (Esc)</button>
</div>
<canvas id="network-canvas" width="1200" height="700"></canvas>
<div class="network-tooltip" id="net-tooltip" style="display:none"></div>
+ <div id="ego-panel" class="ego-panel" style="display:none"></div>
`;
const canvas = document.getElementById('network-canvas') as HTMLCanvasElement;
const ctx = canvas.getContext('2d')!;
const tooltip = document.getElementById('net-tooltip')!;
+ const egoPanel = document.getElementById('ego-panel')!;
+ const resetBtn = document.getElementById('net-reset-ego')!;
const adjCount = new Map<string, number>();
for (const [s, t] of edges) {
@@ -34,6 +46,8 @@ export async function renderNetwork(app: HTMLElement) {
let minConn = 1;
let scannedOnly = false;
+ let edgeMode: EdgeMode = 'none';
+ let selectedNode: string | null = null;
function getFilteredGraph() {
const filteredNodes = nodes.filter(n => {
@@ -51,6 +65,12 @@ export async function renderNetwork(app: HTMLElement) {
let simEdges: [number, number][] = [];
let transform = { x: 0, y: 0, k: 1 };
let animId = 0;
+ let hoveredNode: SimNode | null = null;
+
+ // Adjacency for ego mode
+ let simIncoming = new Map<number, number[]>(); // target_idx -> [source_idxs]
+ let simOutgoing = new Map<number, number[]>(); // source_idx -> [target_idxs]
+ let nodeIdxMap = new Map<string, number>(); // id -> simNodes index
function initSim() {
const { nodes: fNodes, edges: fEdges } = getFilteredGraph();
@@ -65,13 +85,26 @@ export async function renderNetwork(app: HTMLElement) {
vx: 0, vy: 0,
}));
- const nodeIdx = new Map(simNodes.map((n, i) => [n.id, i]));
+ nodeIdxMap = new Map(simNodes.map((n, i) => [n.id, i]));
simEdges = [];
+ simIncoming = new Map();
+ simOutgoing = new Map();
+
for (const [s, t] of fEdges) {
- const si = nodeIdx.get(s), ti = nodeIdx.get(t);
- if (si !== undefined && ti !== undefined) simEdges.push([si, ti]);
+ const si = nodeIdxMap.get(s), ti = nodeIdxMap.get(t);
+ if (si !== undefined && ti !== undefined) {
+ simEdges.push([si, ti]);
+ if (!simIncoming.has(ti)) simIncoming.set(ti, []);
+ simIncoming.get(ti)!.push(si);
+ if (!simOutgoing.has(si)) simOutgoing.set(si, []);
+ simOutgoing.get(si)!.push(ti);
+ }
}
+ selectedNode = null;
+ egoPanel.style.display = 'none';
+ resetBtn.style.display = 'none';
+
let alpha = 1;
cancelAnimationFrame(animId);
@@ -92,7 +125,6 @@ export async function renderNetwork(app: HTMLElement) {
}
fx += (600 - simNodes[i].x) * 0.01;
fy += (350 - simNodes[i].y) * 0.01;
-
simNodes[i].vx = (simNodes[i].vx + fx * alpha) * 0.6;
simNodes[i].vy = (simNodes[i].vy + fy * alpha) * 0.6;
}
@@ -130,16 +162,57 @@ export async function renderNetwork(app: HTMLElement) {
return '#3dd68c';
}
- // Read theme-aware edge color from CSS variable
function getEdgeColor(): string {
return getComputedStyle(document.documentElement).getPropertyValue('--net-edge').trim();
}
+ /**
+  * Stroke colour for the edge `si` -> `ti` (source cites target) in
+  * "quality flow" mode, comparing both scores against a 50% threshold.
+  *
+  * green  = good paper cites good paper
+  * red    = weak paper cites good paper
+  * yellow = the cited paper is weak
+  * gray   = either endpoint has no score (or nothing else matched)
+  */
+ function qualityEdgeColor(si: number, ti: number): string {
+   const neutral = 'rgba(100,100,100,0.2)';
+   const citingScore = simNodes[si].score;
+   const citedScore = simNodes[ti].score;
+
+   if (citingScore === null || citedScore === null) return neutral;
+
+   if (citedScore >= 50) {
+     if (citingScore >= 50) return 'rgba(61, 214, 140, 0.5)';
+     if (citingScore < 50) return 'rgba(240, 101, 101, 0.5)';
+     return neutral;
+   }
+   return citedScore < 50 ? 'rgba(240, 192, 80, 0.3)' : neutral;
+ }
+
+ /**
+  * Fill a small triangular arrowhead pointing from (x1, y1) towards (x2, y2).
+  * The tip is pulled back by `nodeR` along the edge so it sits on the rim of
+  * the target node rather than at its centre. Uses the current `ctx.fillStyle`.
+  */
+ function drawArrow(x1: number, y1: number, x2: number, y2: number, nodeR: number) {
+   const size = 4;
+   const dx = x2 - x1;
+   const dy = y2 - y1;
+   // `|| 1` avoids dividing by zero when both endpoints coincide.
+   const len = Math.sqrt(dx * dx + dy * dy) || 1;
+   const ux = dx / len;
+   const uy = dy / len;
+
+   const tipX = x2 - ux * nodeR;
+   const tipY = y2 - uy * nodeR;
+   // Centre of the arrow's base, `size` units behind the tip.
+   const baseX = tipX - ux * size;
+   const baseY = tipY - uy * size;
+   // Half-width offset perpendicular to the edge direction.
+   const perpX = uy * size * 0.5;
+   const perpY = ux * size * 0.5;
+
+   ctx.beginPath();
+   ctx.moveTo(tipX, tipY);
+   ctx.lineTo(baseX - perpX, baseY + perpY);
+   ctx.lineTo(baseX + perpX, baseY - perpY);
+   ctx.closePath();
+   ctx.fill();
+ }
+
+ const selectedIdx = () => selectedNode ? nodeIdxMap.get(selectedNode) : undefined; // simNodes index of the ego-selected paper; undefined when nothing is selected or the id is not in nodeIdxMap
+
+ /**
+  * Whether node `ni` should be drawn at full strength.
+  * True when no paper is selected, when `ni` is the selected paper, or when
+  * `ni` is a direct neighbour of it in either citation direction.
+  */
+ function isEgoVisible(ni: number): boolean {
+   const focus = selectedIdx();
+   if (focus === undefined || focus === ni) return true;
+
+   const citers = simIncoming.get(focus) ?? [];
+   if (citers.includes(ni)) return true;
+
+   const cited = simOutgoing.get(focus) ?? [];
+   return cited.includes(ni);
+ }
+
+ /**
+  * Whether the edge `si` -> `ti` belongs to the current ego view: every edge
+  * when nothing is selected, otherwise only edges touching the selected node.
+  */
+ function isEgoEdge(si: number, ti: number): boolean {
+   const focus = selectedIdx();
+   return focus === undefined || focus === si || focus === ti;
+ }
+
function draw() {
const w = canvas.width, h = canvas.height;
ctx.clearRect(0, 0, w, h);
- // Fill background with theme color
const bgColor = getComputedStyle(document.documentElement).getPropertyValue('--net-bg').trim();
ctx.fillStyle = bgColor;
ctx.fillRect(0, 0, w, h);
@@ -148,43 +221,88 @@ export async function renderNetwork(app: HTMLElement) {
ctx.translate(transform.x, transform.y);
ctx.scale(transform.k, transform.k);
- // Edges — visible on both themes
- ctx.strokeStyle = getEdgeColor();
- ctx.lineWidth = 1.2;
+ const sel = selectedIdx();
+ const hoverIdx = hoveredNode ? nodeIdxMap.get(hoveredNode.id) : undefined;
+ const showArrows = transform.k > 0.8;
+
+ // Edges
+ const defaultEdgeColor = getEdgeColor();
for (const [si, ti] of simEdges) {
+ const egoVis = isEgoEdge(si, ti);
+ if (sel !== undefined && !egoVis) continue; // hide non-ego edges entirely
+
+ let color: string;
+ let width = 1.0;
+
+ if (edgeMode === 'quality') {
+ color = qualityEdgeColor(si, ti);
+ width = 1.2;
+ } else if (hoverIdx !== undefined && (si === hoverIdx || ti === hoverIdx)) {
+ // Directional highlight on hover
+ color = si === hoverIdx ? 'rgba(240, 160, 50, 0.7)' : 'rgba(100, 160, 255, 0.7)';
+ width = 1.8;
+ } else {
+ color = sel !== undefined ? defaultEdgeColor.replace(/[\d.]+\)$/, '0.6)') : defaultEdgeColor;
+ }
+
+ ctx.strokeStyle = color;
+ ctx.lineWidth = width;
ctx.beginPath();
ctx.moveTo(simNodes[si].x, simNodes[si].y);
ctx.lineTo(simNodes[ti].x, simNodes[ti].y);
ctx.stroke();
+
+ // Arrowheads when zoomed in or in ego mode
+ if (showArrows || sel !== undefined) {
+ const tr = Math.max(3, Math.min(10, 3 + simNodes[ti].in_degree * 0.6));
+ ctx.fillStyle = color;
+ drawArrow(simNodes[si].x, simNodes[si].y, simNodes[ti].x, simNodes[ti].y, tr);
+ }
}
// Nodes
- for (const n of simNodes) {
+ for (let i = 0; i < simNodes.length; i++) {
+ const n = simNodes[i];
+ const visible = isEgoVisible(i);
+ if (sel !== undefined && !visible) {
+ // Ghost dimmed node
+ ctx.beginPath();
+ ctx.arc(n.x, n.y, 2, 0, Math.PI * 2);
+ ctx.fillStyle = 'rgba(100,100,100,0.1)';
+ ctx.fill();
+ continue;
+ }
+
const r = Math.max(3, Math.min(10, 3 + n.in_degree * 0.6));
+ const isSelected = sel !== undefined && i === sel;
+
ctx.beginPath();
- ctx.arc(n.x, n.y, r, 0, Math.PI * 2);
+ ctx.arc(n.x, n.y, isSelected ? r + 3 : r, 0, Math.PI * 2);
ctx.fillStyle = scoreToColor(n.score);
+ ctx.globalAlpha = (sel !== undefined && !isSelected) ? 0.85 : (visible ? 0.8 : 0.15);
ctx.fill();
- // Subtle outline for visibility
- ctx.strokeStyle = 'rgba(0,0,0,0.3)';
- ctx.lineWidth = 0.5;
- ctx.stroke();
+ ctx.globalAlpha = 1;
+
+ if (isSelected) {
+ ctx.strokeStyle = '#fff';
+ ctx.lineWidth = 2;
+ ctx.stroke();
+ // Label
+ ctx.fillStyle = getComputedStyle(document.documentElement).getPropertyValue('--text').trim();
+ ctx.font = '11px sans-serif';
+ ctx.textAlign = 'center';
+ ctx.fillText(n.title.length > 40 ? n.title.slice(0, 37) + '...' : n.title, n.x, n.y - r - 8);
+ } else {
+ ctx.strokeStyle = 'rgba(0,0,0,0.2)';
+ ctx.lineWidth = 0.5;
+ ctx.stroke();
+ }
}
ctx.restore();
}
- // Pan & zoom
- let dragging = false;
- let dragMoved = false;
- let lastX = 0, lastY = 0;
-
- canvas.addEventListener('mousedown', e => {
- dragging = true;
- dragMoved = false;
- lastX = e.clientX; lastY = e.clientY;
- });
- // Convert mouse event to canvas-space coordinates (accounts for CSS scaling)
+ // --- Mouse interaction ---
function canvasCoords(e: MouseEvent): { cx: number; cy: number } {
const rect = canvas.getBoundingClientRect();
const scaleX = canvas.width / rect.width;
@@ -195,11 +313,39 @@ export async function renderNetwork(app: HTMLElement) {
};
}
+ /**
+  * Map a mouse event to simulation coordinates by undoing the current
+  * pan (`transform.x`, `transform.y`) and zoom (`transform.k`).
+  */
+ function simCoords(e: MouseEvent): { mx: number; my: number } {
+   const point = canvasCoords(e);
+   const mx = (point.cx - transform.x) / transform.k;
+   const my = (point.cy - transform.y) / transform.k;
+   return { mx, my };
+ }
+
+ /**
+  * Find the simulated node closest to the mouse position.
+  *
+  * @param e - Mouse event to locate.
+  * @param maxDist - Search radius in simulation units; nodes at or beyond it are ignored.
+  * @returns The nearest node strictly within `maxDist`, or null if there is none.
+  */
+ function findNearest(e: MouseEvent, maxDist = 25): SimNode | null {
+   const { mx, my } = simCoords(e);
+   let best: SimNode | null = null;
+   let bestDist = maxDist;
+   for (let i = 0; i < simNodes.length; i++) {
+     const candidate = simNodes[i];
+     const dist = Math.sqrt((candidate.x - mx) ** 2 + (candidate.y - my) ** 2);
+     if (dist < bestDist) {
+       best = candidate;
+       bestDist = dist;
+     }
+   }
+   return best;
+ }
+
+ let dragging = false;
+ let dragMoved = false;
+ let lastX = 0, lastY = 0;
+
+ canvas.addEventListener('mousedown', e => {
+ dragging = true;
+ dragMoved = false;
+ lastX = e.clientX; lastY = e.clientY;
+ });
+
canvas.addEventListener('mousemove', e => {
if (dragging) {
const dx = e.clientX - lastX, dy = e.clientY - lastY;
if (Math.abs(dx) > 2 || Math.abs(dy) > 2) dragMoved = true;
- // Scale drag delta to canvas space
const rect = canvas.getBoundingClientRect();
const scaleX = canvas.width / rect.width;
const scaleY = canvas.height / rect.height;
@@ -209,33 +355,32 @@ export async function renderNetwork(app: HTMLElement) {
draw();
}
- // Tooltip — use canvas-space coordinates
- const { cx, cy } = canvasCoords(e);
- const mx = (cx - transform.x) / transform.k;
- const my = (cy - transform.y) / transform.k;
+ const nearest = findNearest(e);
+ hoveredNode = nearest;
+ draw(); // redraw for hover highlight
- let closest: SimNode | null = null;
- let closestDist = 25;
- for (const n of simNodes) {
- const d = Math.sqrt((n.x - mx) ** 2 + (n.y - my) ** 2);
- if (d < closestDist) { closest = n; closestDist = d; }
- }
-
- if (closest) {
- canvas.style.cursor = closest.has_scan ? 'pointer' : 'default';
+ if (nearest) {
+ canvas.style.cursor = 'pointer';
tooltip.style.display = 'block';
tooltip.style.left = e.clientX + 14 + 'px';
tooltip.style.top = e.clientY + 14 + 'px';
- tooltip.innerHTML = `<strong>${closest.title}</strong><br>
- ${closest.score != null ? `Score: ${closest.score}%` : 'Not scanned'}<br>
- Connections: ${closest.in_degree}${closest.has_scan ? '<br><em>Click to view</em>' : ''}`;
+ tooltip.innerHTML = `<strong>${nearest.title}</strong><br>
+ ${nearest.score != null ? `Score: ${nearest.score}%` : 'Not scanned'}<br>
+ Cites: ${nearest.out_degree} · Cited by: ${nearest.in_degree}
+ ${nearest.has_scan ? '<br><em style="color:var(--text-dim)">Click to explore · Dbl-click for detail</em>' : ''}`;
} else {
canvas.style.cursor = dragging ? 'grabbing' : 'grab';
tooltip.style.display = 'none';
}
});
+
canvas.addEventListener('mouseup', () => dragging = false);
- canvas.addEventListener('mouseleave', () => { dragging = false; tooltip.style.display = 'none'; });
+ canvas.addEventListener('mouseleave', () => {
+ dragging = false;
+ tooltip.style.display = 'none';
+ hoveredNode = null;
+ draw();
+ });
canvas.addEventListener('wheel', e => {
e.preventDefault();
@@ -247,17 +392,87 @@ export async function renderNetwork(app: HTMLElement) {
draw();
}, { passive: false });
+ // Single click = ego mode
canvas.addEventListener('click', e => {
- if (dragMoved) return; // don't navigate after a drag
- const { cx, cy } = canvasCoords(e);
- const mx = (cx - transform.x) / transform.k;
- const my = (cy - transform.y) / transform.k;
- for (const n of simNodes) {
- const d = Math.sqrt((n.x - mx) ** 2 + (n.y - my) ** 2);
- if (d < 20 && n.has_scan) { navigate(`/paper/${n.id}`); return; }
+ if (dragMoved) return;
+ const nearest = findNearest(e);
+ if (nearest) {
+ enterEgoMode(nearest.id);
+ } else {
+ exitEgoMode();
}
});
+ // Double click = navigate to detail
+ canvas.addEventListener('dblclick', e => {
+ const nearest = findNearest(e);
+ if (nearest && nearest.has_scan) {
+ navigate(`/paper/${nearest.id}`);
+ }
+ });
+
+ /**
+  * Enter ego mode for one paper: mark it selected, show the reset button,
+  * fill the info panel with its citers / cited papers (top 15 of each by
+  * score, unscored last), and redraw the canvas.
+  *
+  * @param nodeId - Id of the paper to focus. Ids that are not part of the
+  *   current filtered graph are ignored and leave the view unchanged.
+  */
+ function enterEgoMode(nodeId: string) {
+   // Resolve the node before touching any state, so an unknown id cannot
+   // leave the reset button visible with nothing actually selected.
+   const idx = nodeIdxMap.get(nodeId);
+   if (idx === undefined) return;
+
+   selectedNode = nodeId;
+   resetBtn.style.display = 'inline-block';
+
+   const node = simNodes[idx];
+   const incoming = (simIncoming.get(idx) ?? []).map(i => simNodes[i]);
+   const outgoing = (simOutgoing.get(idx) ?? []).map(i => simNodes[i]);
+
+   // Titles and ids come from the data files and are assigned via innerHTML;
+   // encode markup characters so they render as text.
+   const esc = (s: string): string =>
+     s.replace(/[&<>"']/g, ch => `&#${ch.charCodeAt(0)};`);
+   const clip = (t: string): string => esc(t.length > 50 ? t.slice(0, 47) + '...' : t);
+
+   // Mean score over scored neighbours only; '?' when none of them is scored.
+   const meanLabel = (group: SimNode[]): string => {
+     const scores: number[] = [];
+     for (const n of group) if (n.score !== null) scores.push(n.score);
+     return scores.length ? (scores.reduce((a, b) => a + b, 0) / scores.length).toFixed(1) : '?';
+   };
+   const inMean = meanLabel(incoming);
+   const outMean = meanLabel(outgoing);
+
+   // One neighbour list (shared by both directions): highest score first, capped at 15 rows.
+   const byScoreDesc = (a: SimNode, b: SimNode) => (b.score ?? -1) - (a.score ?? -1);
+   const renderList = (label: string, color: string, group: SimNode[]): string => {
+     if (!group.length) return '';
+     const rows = [...group].sort(byScoreDesc).slice(0, 15).map(n =>
+       `<div class="ego-list-item" data-id="${esc(n.id)}"><span style="color:${scoreToColor(n.score)};font-family:var(--font);font-size:0.75rem;width:35px;display:inline-block">${n.score != null ? n.score + '%' : '--'}</span> ${clip(n.title)}</div>`
+     ).join('');
+     const more = group.length > 15 ? `<div style="color:var(--text-dim);font-size:0.75rem">+ ${group.length - 15} more</div>` : '';
+     return `<div class="ego-list"><div class="ego-list-label" style="color:${color}">${label}</div>
+       ${rows}${more}
+     </div>`;
+   };
+
+   egoPanel.style.display = 'block';
+   egoPanel.innerHTML = `
+     <div class="ego-header">
+       <strong>${esc(node.title)}</strong>
+       ${node.score != null ? ` <span style="color:${scoreToColor(node.score)};font-family:var(--font)">${node.score}%</span>` : ' <span style="color:var(--gray)">not scanned</span>'}
+       ${node.has_scan ? ` <a href="#/paper/${esc(node.id)}" style="color:var(--accent);font-size:0.8rem">view detail</a>` : ''}
+     </div>
+     <div class="ego-stats">
+       <span style="color:rgba(100,160,255,0.9)">Cited by ${incoming.length} (mean ${inMean}%)</span> ·
+       <span style="color:rgba(240,160,50,0.9)">Cites ${outgoing.length} (mean ${outMean}%)</span>
+     </div>
+     ${renderList('Cited by', 'rgba(100,160,255,0.9)', incoming)}
+     ${renderList('Cites', 'rgba(240,160,50,0.9)', outgoing)}
+   `;
+
+   // Click on ego list items to switch ego focus
+   egoPanel.querySelectorAll('.ego-list-item').forEach(el => {
+     el.addEventListener('click', () => {
+       const id = (el as HTMLElement).dataset.id;
+       if (id) enterEgoMode(id);
+     });
+   });
+
+   draw();
+ }
+
+ /** Leave ego mode: hide the info panel and reset button, clear the selection, redraw everything. */
+ function exitEgoMode() {
+   for (const el of [egoPanel, resetBtn]) {
+     el.style.display = 'none';
+   }
+   selectedNode = null;
+   draw();
+ }
+
+ resetBtn.addEventListener('click', exitEgoMode);
+ // Escape exits ego mode. The listener is registered on `document`, so unlike
+ // the canvas/button listeners it is not discarded with this view's DOM; every
+ // renderNetwork() call would otherwise add another one that keeps this closure
+ // alive. It therefore unregisters itself once the canvas is no longer attached
+ // (presumably after another view has replaced the app content).
+ const onEgoKeydown = (e: KeyboardEvent) => {
+   if (!canvas.isConnected) {
+     document.removeEventListener('keydown', onEgoKeydown);
+     return;
+   }
+   if (e.key === 'Escape' && selectedNode) exitEgoMode();
+ };
+ document.addEventListener('keydown', onEgoKeydown);
+
// Filter controls
document.getElementById('net-min-conn')?.addEventListener('input', e => {
minConn = parseInt((e.target as HTMLInputElement).value) || 0;
@@ -267,6 +482,10 @@ export async function renderNetwork(app: HTMLElement) {
scannedOnly = (e.target as HTMLInputElement).checked;
initSim();
});
+ document.getElementById('net-edge-mode')?.addEventListener('change', e => {
+ edgeMode = (e.target as HTMLSelectElement).value as EdgeMode;
+ draw();
+ });
initSim();
}
diff --git a/explorer/tests/explorer.spec.ts b/explorer/tests/explorer.spec.ts
@@ -6,8 +6,8 @@ test.describe('Dashboard', () => {
await expect(page.locator('.card .value').first()).toBeVisible({ timeout: 10000 });
const cards = page.locator('.card');
await expect(cards).toHaveCount(4);
- await expect(cards.nth(0).locator('.value')).toHaveText('741');
- await expect(cards.nth(1).locator('.value')).toHaveText('48%');
+ await expect(cards.nth(0).locator('.value')).toHaveText('744');
+ await expect(cards.nth(1).locator('.value')).toHaveText('48.1%');
});
test('shows spinner then content', async ({ page }) => {
@@ -206,8 +206,8 @@ test.describe('Findings', () => {
test('loads and shows all sections', async ({ page }) => {
await page.goto('/#/findings');
await expect(page.locator('.section').first()).toBeVisible({ timeout: 10000 });
- // Should have 12 sections
- expect(await page.locator('.section').count()).toBe(12);
+ // Should have 13 sections
+ expect(await page.locator('.section').count()).toBe(13);
});
test('shows per-question pass rates', async ({ page }) => {
diff --git a/papers/chain-of-thought-prompting-2022/scan.json b/papers/chain-of-thought-prompting-2022/scan.json
@@ -1,415 +1,503 @@
{
"paper": {
"title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
- "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma", "Brian Ichter", "Fei Xia", "Ed H. Chi", "Quoc V. Le", "Denny Zhou"],
+ "authors": [
+ "Jason Wei",
+ "Xuezhi Wang",
+ "Dale Schuurmans",
+ "Maarten Bosma",
+ "Brian Ichter",
+ "Fei Xia",
+ "Ed H. Chi",
+ "Quoc V. Le",
+ "Denny Zhou"
+ ],
"year": 2022,
"venue": "NeurIPS 2022",
"arxiv_id": "2201.11903"
},
+ "scan_version": 2,
+ "active_modules": ["experimental_rigor", "data_leakage"],
"checklist": {
"artifacts": {
"code_released": {
"applies": true,
"answer": false,
- "justification": "No code repository is provided. The paper states inputs, outputs, and targets for LaMDA 137B are provided in supplementary material as a zip file, and GPT-3 results are reproducible via the public API, but no source code for running the experiments is released."
+ "justification": "No code repository URL is provided in the paper. The authors provide inputs/outputs/targets for LaMDA and GPT-3 in supplementary material but do not release evaluation code or scripts."
},
"data_released": {
"applies": true,
"answer": true,
- "justification": "The paper uses publicly available benchmarks (GSM8K, SVAMP, ASDiv, AQuA, MAWPS, CSQA, StrategyQA, BIG-bench tasks, SayCan). The two new synthetic datasets (coin flip, last letter concatenation) are stated to be given in the Supplementary Materials (Checklist item 4c). Full prompts are provided in Appendix G."
+ "justification": "The paper uses publicly available benchmarks (GSM8K, SVAMP, ASDiv, AQuA, MAWPS, CSQA, StrategyQA, BIG-bench tasks). The coinflip and last letter concatenation datasets are new and are provided in supplementary material (Appendix E.3). LaMDA inputs/outputs/targets are also provided."
},
"environment_specified": {
"applies": true,
"answer": false,
- "justification": "Appendix E.2 mentions TPU v3 (8x8) for LaMDA 137B and TPU v4 (4x4x12) for PaLM 540B, and GPT-3 via the public API, but no software environment specifications (library versions, requirements.txt, etc.) are provided."
+ "justification": "Appendix E.2 states TPU v3 (8x8) for LaMDA 137B and TPU v4 (4x4x12) for PaLM 540B inference, and GPT-3 via public API. However, no software environment details (library versions, dependencies) are provided."
},
"reproduction_instructions": {
"applies": true,
"answer": false,
- "justification": "Appendix E.1 discusses reproducibility: exact input prompts are in Appendix G, GPT-3 API model names are specified, and LaMDA inputs/targets/predictions are in supplementary material. However, there are no step-by-step reproduction instructions (no README, no scripts). Two of the three model families (LaMDA, PaLM) are not publicly available."
+ "justification": "No step-by-step reproduction instructions are provided. While full prompts are given in Appendix G and model outputs are in supplementary material, there are no scripts or instructions to replicate the experiments."
}
},
"statistical_methodology": {
"confidence_intervals_or_error_bars": {
"applies": true,
- "answer": true,
- "justification": "Standard deviations across five random seeds (different exemplar orders) are reported for LaMDA 137B in Tables 6 and 7. For example, GSM8K chain-of-thought prompting: 14.3 ± 0.4. However, these are only for LaMDA 137B, not for GPT-3 or PaLM."
+ "answer": false,
+ "justification": "No confidence intervals or error bars are reported on main results. Standard deviations are only provided for LaMDA 137B ablation/robustness experiments in Tables 6 and 7, but not for the primary results across models and benchmarks in Tables 1-5."
},
"significance_tests": {
"applies": true,
"answer": false,
- "justification": "No statistical significance tests are reported. The paper claims chain-of-thought prompting 'outperforms' standard prompting and achieves 'state-of-the-art' based solely on comparing point estimates, without any formal statistical testing."
+ "justification": "No statistical significance tests are reported. Claims of improvement are based solely on comparing point estimates (e.g., '56.9% vs 17.9%') without any formal testing."
},
"effect_sizes_reported": {
"applies": true,
"answer": true,
- "justification": "Effect sizes are reported as absolute percentage improvements with baseline context throughout. For example, Table 1 shows GSM8K: standard 17.9% vs chain-of-thought 56.9% (+39.0), providing both baseline and improvement magnitude. This pattern is consistent across all results tables."
+ "justification": "Performance improvements are reported with baseline context throughout (e.g., Table 1 shows '+39.0' for PaLM 540B on GSM8K going from 17.9% to 56.9%). Absolute and relative gains are consistently provided."
},
"sample_size_justified": {
"applies": true,
"answer": false,
- "justification": "No justification is given for the sample sizes of evaluation datasets used or the number of seeds tested. The manual error analysis uses 50 random examples (Section 3.2) without justification for why 50 was sufficient."
+ "justification": "No justification is given for the number of evaluation examples or the choice of 50 examples for error analysis (Sections 3.2, Appendix D). The evaluation set sizes are inherited from existing benchmarks without discussion of statistical power."
},
"variance_reported": {
"applies": true,
"answer": true,
- "justification": "Standard deviation across five random seeds (different exemplar orderings) is reported for LaMDA 137B in Tables 6 and 7. The paper states: 'As LaMDA experiments did not show large variance among different seeds, to save compute we report results for a single exemplar order for all other models' (Section 3.1)."
+ "justification": "Standard deviations across five random seeds (different exemplar orderings) are reported for LaMDA 137B in Tables 6 and 7. For other models, single exemplar orders are used to save compute, which is explicitly stated in Section 3.1."
}
},
"evaluation_design": {
"baselines_included": {
"applies": true,
"answer": true,
- "justification": "Standard few-shot prompting is used as the primary baseline throughout all experiments (Section 3.1). Prior supervised best results are also compared against (Table 1, Figures 4 and 7)."
+ "justification": "Standard few-shot prompting is used as the primary baseline throughout. Prior supervised state-of-the-art results are also compared against (e.g., finetuned GPT-3 with verifier for GSM8K from Cobbe et al. 2021)."
},
"baselines_contemporary": {
"applies": true,
"answer": true,
- "justification": "Baselines include contemporary prior best results from 2021-2022 papers: Cobbe et al. (2021) for GSM8K, Pi et al. (2022) for SVAMP/MAWPS, and leaderboard entries for CSQA and StrategyQA as of May 2022."
+ "justification": "Baselines include contemporary prior best results from Cobbe et al. (2021), Jie et al. (2022), Pi et al. (2022), and others. These were recent at the time of publication."
},
"ablation_study": {
"applies": true,
"answer": true,
- "justification": "Section 3.3 presents a thorough ablation study with three variations: equation only (testing whether math equations alone explain the benefit), variable compute only (testing whether extra tokens explain the benefit), and chain of thought after answer (testing whether the reasoning must precede the answer). Results in Figure 5 and Tables 6-7."
+ "justification": "Section 3.3 presents a detailed ablation study with three variations: equation only, variable compute only (dots), and chain of thought after answer. Results shown in Figure 5 and Tables 6-7."
},
"multiple_metrics": {
"applies": true,
"answer": false,
- "justification": "Only accuracy (solve rate) is used as the evaluation metric across all benchmarks. No other metrics (e.g., reasoning chain quality, token efficiency, calibration) are reported."
+ "justification": "Only solve rate (accuracy) is used as the evaluation metric across all benchmarks. No secondary metrics are reported."
},
"human_evaluation": {
"applies": true,
"answer": true,
- "justification": "Manual analysis of generated chains of thought is performed: 50 correct examples and 50 incorrect examples from LaMDA 137B on GSM8K (Section 3.2, Appendices D.1 and D.2), with error categorization. Additional manual analysis of 45 PaLM 62B errors (Appendix A.1)."
+ "justification": "Manual error analysis of 50 correct and 50 incorrect chain-of-thought outputs from LaMDA 137B on GSM8K (Section 3.2, Appendices D.1-D.2). Also manual analysis of 45 PaLM 62B errors (Appendix A.1)."
},
"held_out_test_set": {
"applies": true,
"answer": true,
- "justification": "Standard evaluation splits are used for all benchmarks. For BIG-bench tasks without training sets, the first ten examples are used as exemplars and the rest as the evaluation set (Section 4). GSM8K and other datasets have separate evaluation splits."
+ "justification": "Results are reported on standard evaluation splits of each benchmark. The few-shot exemplars were manually composed or drawn from training sets, while results are on evaluation/test splits. Section 3.1 notes 'most of the datasets only have an evaluation split.'"
},
"per_category_breakdown": {
"applies": true,
"answer": true,
- "justification": "Results are broken down by individual benchmark (Tables 1-5), by model scale (Tables 2-5), by MAWPS subsets of varying difficulty (Table 3: SingleOp, SingleEq, AddSub, MultiArith), and by in-domain vs. out-of-domain for symbolic reasoning (Table 5, Figure 8)."
+ "justification": "Results are broken down by individual benchmark (Tables 1-5), by model size (Figures 4, 7, 8), and by MAWPS subsets (Table 3: SingleOp, SingleEq, AddSub, MultiArith). In-domain vs OOD results provided for symbolic reasoning."
},
"failure_cases_discussed": {
"applies": true,
"answer": true,
- "justification": "Extensive failure analysis in Appendix D.2: 50 incorrect examples categorized into calculator errors (8%), symbol mapping errors (16%), one-step missing errors (22%), and semantic understanding/incoherent errors (54%). Concrete failure examples are shown in Tables 10-11. Additional failure examples across tasks in Tables 13-19."
+ "justification": "Appendix D.2 provides detailed error analysis categorizing 50 incorrect outputs into calculator errors (8%), symbol mapping errors (16%), one-step missing (22%), and semantic understanding/incoherence errors (54%). Specific examples are shown in Tables 10-11."
},
"negative_results_reported": {
"applies": true,
"answer": true,
- "justification": "Several negative results are reported: chain-of-thought prompting hurts performance for small models (Section 3.2, Table 2), gains are minimal for easy tasks like SingleOp (Table 3), gains do not transfer perfectly across models (Appendix A.2 notes GPT-3 does not improve on CSQA and StrategyQA), and AQuA performance decreases for LaMDA 137B with chain-of-thought (-4.9 in Table 1)."
+ "justification": "The paper reports that chain-of-thought prompting hurts performance for small models (<10B parameters), shown in Figure 4 and Table 2. Also reports minimal gains on CSQA (Section 4) and easy MAWPS subsets (Section 3.2, Table 3)."
}
},
"claims_and_evidence": {
"abstract_claims_supported": {
"applies": true,
"answer": true,
- "justification": "The abstract claims that chain-of-thought prompting 'significantly improves' reasoning ability, that it is an emergent property of scale, and that PaLM 540B achieves state-of-the-art on GSM8K surpassing finetuned GPT-3. All three claims are supported by the experimental results in Sections 3-5 and Tables 1-5."
+ "justification": "The abstract claims CoT improves reasoning, emerges in sufficiently large models, and achieves SOTA on GSM8K. All are supported: Figure 4 shows emergence, Table 1 shows GSM8K SOTA (PaLM 540B reaches 56.9% vs. the prior best of 55%)."
},
"causal_claims_justified": {
"applies": true,
"answer": true,
- "justification": "The paper makes causal claims that chain-of-thought prompting 'improves' performance. The ablation study (Section 3.3) provides controlled single-variable manipulation: equation only, variable compute only, and reasoning after answer variants isolate different potential causal mechanisms. The comparison is between standard prompting and chain-of-thought prompting with everything else held constant."
+ "justification": "The paper makes causal claims ('chain-of-thought prompting improves performance') and supports them with controlled ablation studies (Section 3.3): equation only, variable compute only, and reasoning after answer. These ablations isolate specific factors, supporting the causal claim that the content of the chain of thought matters."
},
"generalization_bounded": {
"applies": true,
"answer": true,
- "justification": "The paper is careful to bound its claims. Section 6 explicitly states conditions for when chain-of-thought helps: challenging tasks, large models, flat scaling curves. Appendix A.3 discusses when it may not help. The paper acknowledges 'chain of thought prompting can potentially be applied to any task...We leave the empirical evaluation...to future work' rather than overclaiming."
+ "justification": "The paper clearly scopes claims to specific tasks (arithmetic, commonsense, symbolic reasoning), specific models (GPT-3, LaMDA, PaLM), and specific model scales (≥100B). Section 6 explicitly discusses when CoT helps (Appendix A.3 gives three conditions) and states it only helps 'sufficiently large' models."
},
"alternative_explanations_discussed": {
"applies": true,
"answer": true,
- "justification": "The ablation study (Section 3.3) specifically tests alternative explanations: that the benefit comes from equation generation alone (equation only ablation), from extra computation tokens (variable compute ablation), or from accessing relevant knowledge (reasoning after answer ablation). Each alternative is tested and found insufficient. Section 6 also discusses whether the model is 'actually reasoning.'"
+ "justification": "The ablation study (Section 3.3) tests three alternative explanations: (1) equations alone suffice, (2) more computation is the key factor, (3) CoT just activates knowledge. All are refuted with evidence. Section 6 also acknowledges models may not be 'actually reasoning.'"
+ },
+ "proxy_outcome_distinction": {
+ "applies": true,
+ "answer": true,
+ "justification": "The paper consistently measures solve rate/accuracy on specific benchmarks and does not overclaim beyond these measurements. Claims are tied to specific datasets (e.g., 'on the GSM8K benchmark') rather than broad ungrounded claims about 'reasoning ability' in general."
}
},
"setup_transparency": {
"model_versions_specified": {
"applies": true,
"answer": true,
- "justification": "GPT-3 models are specified by API name: text-ada-001, text-babbage-001, text-curie-001, text-davinci-002 (Section 3.1). Codex is identified as code-davinci-002. LaMDA and PaLM are specified by parameter count (LaMDA 422M/2B/8B/68B/137B, PaLM 8B/62B/540B). UL2 20B is named. These are specific enough to identify the models used."
+ "justification": "Section 3.1 specifies exact model versions: GPT-3 text-ada-001, text-babbage-001, text-curie-001, text-davinci-002 with presumed parameter counts. LaMDA model sizes (422M to 137B), PaLM sizes (8B, 62B, 540B), UL2 20B, and Codex code-davinci-002 are specified."
},
"prompts_provided": {
"applies": true,
"answer": true,
- "justification": "Full prompts for all tasks are provided in Appendix G (Tables 20-27). These include the exact chain-of-thought exemplars used for math word problems, AQuA, last letter concatenation, coin flip, CSQA, StrategyQA, date understanding, and sports understanding."
+ "justification": "Full prompt text for all tasks is provided in Appendix G (Tables 20-28). This includes all few-shot exemplars with chain-of-thought annotations. Alternate annotator prompts are in Appendix H (Tables 29-30)."
},
"hyperparameters_reported": {
"applies": true,
"answer": true,
- "justification": "Section 3.1 states 'We sample from the models via greedy decoding.' For LaMDA, 'we report averaged results over five random seeds, where each seed had a different randomly shuffled order of exemplars.' The input context length is noted as 1024 tokens. Greedy decoding implies temperature=0."
+ "justification": "Section 3.1 states 'We sample from the models via greedy decoding.' For LaMDA, 'averaged results over five random seeds' with different exemplar orderings. Input context window limited to 1024 tokens (Appendix D.3)."
},
"scaffolding_described": {
"applies": false,
"answer": false,
- "justification": "No agentic scaffolding is used. The approach is standard few-shot prompting with chain-of-thought exemplars — a single model call per example with no tool use, retry logic, or multi-step agent workflow."
+ "justification": "No agentic scaffolding is used. The approach is simple few-shot prompting with no tool use, retry logic, or agent workflows."
},
"data_preprocessing_documented": {
"applies": true,
"answer": true,
- "justification": "For arithmetic reasoning, the chain-of-thought exemplars are described: 8 manually composed exemplars for most benchmarks, 4 from the AQuA training set (Section 3.1). For symbolic reasoning, the data generation process is described (random name concatenation from name census data). For commonsense, exemplar selection is described (Section 4). Footnote 2 describes filtering: 'We sample examples ≤60 tokens to fit into our input context window, and also limit the examples to ≤2 steps.'"
+ "justification": "The exemplar selection process is documented: 8 manually composed exemplars for most benchmarks (Section 3.1), 4 from AQuA training set, and specific criteria for GSM8K exemplars (≤60 tokens, ≤2 steps, Section 3.4). Symbolic reasoning data generation is described in Section 5."
}
},
"limitations_and_scope": {
"limitations_section_present": {
"applies": true,
"answer": true,
- "justification": "Section 6 (Discussion) contains a dedicated limitations paragraph discussing four specific limitations: (1) no guarantee the model is 'actually reasoning,' (2) annotation costs for finetuning, (3) no guarantee of correct reasoning paths, and (4) emergence only at large model scales making it costly to serve."
+ "justification": "Section 6 (Discussion) contains a dedicated paragraph on limitations, covering: (1) CoT doesn't prove reasoning, (2) annotation costs for finetuning, (3) no guarantee of correct reasoning paths, (4) only works at large scale making it costly to serve."
},
"threats_to_validity_specific": {
"applies": true,
"answer": true,
- "justification": "The paper discusses specific threats: sensitivity to prompt engineering (Appendix A.2 with concrete examples like coin flip variance from 99.6% to 71.4%), the fact that correct chains of thought do not guarantee correct reasoning (Appendix D.1 — 1 of 50 correct answers came from incorrect reasoning), and that gains don't transfer perfectly across models (Appendix A.2)."
+ "justification": "The paper discusses specific threats: generated chains of thought are not always factual (Section 6, Appendix D.1), incorrect reasoning can accidentally lead to correct answers especially for classification tasks (Appendix D.1), and prompt sensitivity affects results (Section 3.4, Appendix A.2)."
},
"scope_boundaries_stated": {
"applies": true,
"answer": true,
- "justification": "Appendix A.3 explicitly states when chain-of-thought prompting is expected NOT to help: (1) tasks that don't require multi-step reasoning, (2) small language models, (3) tasks where the scaling curve is already steep. The paper also states 'We leave the empirical evaluation of chain-of-thought prompting on such diverse tasks (e.g., machine translation, etc.) to future work.'"
+ "justification": "Appendix A.3 explicitly states when CoT helps and when it doesn't: three conditions must be met (challenging task, large model, flat scaling curve). Section 6 states CoT emergence 'only at large model scales makes it costly to serve.' The paper scopes to arithmetic, commonsense, and symbolic reasoning tasks."
}
},
"data_integrity": {
"raw_data_available": {
"applies": true,
"answer": true,
- "justification": "The paper states 'we make exact inputs, targets, and predictions for LaMDA 137B for each task available as a zip file in the supplementary material' (Appendix E.1). GPT-3 results are reproducible via the public API. Benchmarks are public."
+ "justification": "Appendix E.1 states: 'we make exact inputs, targets, and predictions for LaMDA 137B for each task available as a zip file in the supplementary material.' GPT-3 results are reproducible via the public API."
},
"data_collection_described": {
"applies": true,
"answer": true,
- "justification": "The data collection is well described: public benchmarks are cited with URLs and licenses (Appendix E.3), chain-of-thought exemplars are manually composed by the authors (Section 3.1), and synthetic datasets (coin flip, letter concatenation) have their generation procedures described (Section 5)."
+ "justification": "All benchmarks are publicly available with citations. The synthetic symbolic reasoning datasets are described in Section 5 with generation procedures (random concatenation from top-1000 names from namecensus.com). Chain-of-thought annotations are fully provided."
},
"recruitment_methods_described": {
"applies": false,
"answer": false,
- "justification": "No human participants were involved. The paper uses standard public benchmarks and three co-authors as annotators for chain-of-thought prompts. This is not a human subjects study."
+ "justification": "No human participants. The study evaluates language models on standard benchmarks."
},
"data_pipeline_documented": {
"applies": true,
"answer": true,
- "justification": "The pipeline is straightforward and documented: prompts are composed manually (full text in Appendix G), fed to language models via greedy decoding, and final answers are extracted. The external calculator post-processing step is described in Appendix B. Exemplar selection criteria are stated."
+ "justification": "The pipeline is straightforward: benchmark datasets → prompt construction with exemplars → model inference via greedy decoding → answer extraction → accuracy comparison. External calculator variant is also described (Python eval function, Table 1). The process for creating chain-of-thought annotations is documented."
}
},
"conflicts_of_interest": {
"funding_disclosed": {
"applies": true,
"answer": false,
- "justification": "No funding source is explicitly disclosed. All authors are from Google Research, Brain Team, but there is no acknowledgments section listing grants or funding agencies beyond thanking colleagues."
+ "justification": "No funding sources are disclosed. The paper is from Google Research, Brain Team, but no specific grant numbers or funding acknowledgments are provided beyond naming individual colleagues who gave feedback."
},
"affiliations_disclosed": {
"applies": true,
"answer": true,
- "justification": "All authors are listed as 'Google Research, Brain Team' on the first page of the paper. This is prominently displayed."
+ "justification": "All authors are identified as Google Research, Brain Team with email addresses {jasonwei,dennyzhou}@google.com on the first page."
},
"funder_independent_of_outcome": {
"applies": true,
"answer": false,
- "justification": "Google employees are evaluating Google's own models (LaMDA, PaLM) alongside competitors. Google has a financial interest in demonstrating the capabilities of large language models. The funder (Google) is not independent of the outcome."
+ "justification": "The authors work at Google, which develops and markets the LaMDA and PaLM models being evaluated. Google has a commercial interest in demonstrating that large language models (especially their own) can perform better with prompting techniques."
},
"financial_interests_declared": {
"applies": true,
"answer": false,
- "justification": "No competing interests statement is present in the paper. The authors work at Google Research and are evaluating Google's proprietary models (LaMDA, PaLM), but no financial interest disclosure is provided."
+ "justification": "No competing interests statement is present in the paper. Given all authors work at Google and evaluate Google's proprietary models (LaMDA, PaLM), financial interests are plausible but undisclosed."
}
},
"contamination": {
"training_cutoff_stated": {
"applies": true,
"answer": false,
- "justification": "No training data cutoff dates are stated for any of the models (GPT-3, LaMDA, PaLM, Codex, UL2). The paper evaluates these models on public benchmarks without specifying when their training data was collected."
+ "justification": "No training data cutoff dates are stated for any of the five model families evaluated (GPT-3, LaMDA, PaLM, UL2, Codex). The reader cannot assess whether benchmark data appeared in training."
},
"train_test_overlap_discussed": {
"applies": true,
"answer": false,
- "justification": "No discussion of potential train/test overlap. The paper evaluates pre-trained models on public benchmarks (GSM8K published 2021, CSQA published 2019, etc.) without analyzing whether these benchmarks appeared in training data."
+ "justification": "No discussion of whether evaluation benchmark examples could have appeared in the training data of any model. This is a significant omission given the use of publicly available benchmarks with large-scale models."
},
"benchmark_contamination_addressed": {
"applies": true,
"answer": false,
- "justification": "No discussion of benchmark contamination risk. Several benchmarks (CSQA, MAWPS subsets, ASDiv) were published years before the models were trained and could plausibly be in the training data. This is not addressed."
+ "justification": "Many benchmarks used (GSM8K 2021, CSQA 2019, SVAMP 2021, MAWPS 2016) were publicly available before model training. No contamination analysis is performed or discussed."
}
},
"human_studies": {
"pre_registered": {
"applies": false,
"answer": false,
- "justification": "No human participants in the study. The paper is a benchmark evaluation of language models."
+ "justification": "No human participants. This is a benchmark evaluation study of language models."
},
"irb_or_ethics_approval": {
"applies": false,
"answer": false,
- "justification": "No human participants. The NeurIPS checklist (Section 5) confirms 'No human data collected' and marks IRB as N/A."
+ "justification": "No human participants. This is a benchmark evaluation study."
},
"demographics_reported": {
"applies": false,
"answer": false,
- "justification": "No human participants in the study."
+ "justification": "No human participants."
},
"inclusion_exclusion_criteria": {
"applies": false,
"answer": false,
- "justification": "No human participants in the study."
+ "justification": "No human participants."
},
"randomization_described": {
"applies": false,
"answer": false,
- "justification": "No human participants in the study."
+ "justification": "No human participants."
},
"blinding_described": {
"applies": false,
"answer": false,
- "justification": "No human participants in the study."
+ "justification": "No human participants."
},
"attrition_reported": {
"applies": false,
"answer": false,
- "justification": "No human participants in the study."
+ "justification": "No human participants."
}
},
"cost_and_practicality": {
"inference_cost_reported": {
"applies": true,
"answer": false,
- "justification": "No inference costs, token counts, or latency measurements are reported. The paper notes that chain-of-thought prompting requires generating more tokens but does not quantify this cost."
+ "justification": "No inference costs, latency, or token consumption are reported for any experiment despite using multiple large models (up to PaLM 540B). Section 6 mentions large scale 'makes it costly to serve' but provides no quantification."
},
"compute_budget_stated": {
"applies": true,
"answer": false,
- "justification": "Appendix E.2 mentions the hardware used (TPU v3 for LaMDA, TPU v4 for PaLM, GPT-3 API) but explicitly states 'though we did not estimate the total amount of compute' in the NeurIPS checklist (Section 3d)."
+ "justification": "Appendix E.2 describes hardware used (TPU v3 for LaMDA, TPU v4 for PaLM) but explicitly states 'we did not estimate the total amount of compute.' No GPU hours, total API spend, or wall-clock time are provided."
+ }
+ },
+ "experimental_rigor": {
+ "seed_sensitivity_reported": {
+ "applies": true,
+ "answer": true,
+ "justification": "For LaMDA, results are averaged over five random seeds (different exemplar orderings) with standard deviations reported in Tables 6-7. However, for GPT-3 and PaLM, single exemplar orderings are used, justified by LaMDA's low variance."
+ },
+ "number_of_runs_stated": {
+ "applies": true,
+ "answer": true,
+ "justification": "Section 3.1 states: 'For LaMDA, we report averaged results over five random seeds.' For other models, 'to save compute we report results for a single exemplar order.'"
+ },
+ "hyperparameter_search_budget": {
+ "applies": true,
+ "answer": false,
+ "justification": "No hyperparameter search is reported. The paper uses greedy decoding and manually composed prompts. Section 3.1 notes 'These particular exemplars did not undergo prompt engineering' but no search budget is documented."
+ },
+ "best_config_selection_justified": {
+ "applies": true,
+ "answer": true,
+ "justification": "The paper uses a single set of 8 exemplars across most benchmarks (Section 3.1), and Section 3.4 demonstrates robustness across different annotators, exemplar sets, and orderings. No cherry-picking of configurations is apparent."
+ },
+ "multiple_comparison_correction": {
+ "applies": false,
+ "answer": false,
+ "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable. The paper reports only point estimates and standard deviations."
+ },
+ "self_comparison_bias_addressed": {
+ "applies": true,
+ "answer": false,
+ "justification": "The authors do not discuss the bias of evaluating their own prompting technique. While they compare against prior work, they do not acknowledge that their implementations of baselines could be disadvantaged."
+ },
+ "compute_budget_vs_performance": {
+ "applies": true,
+ "answer": false,
+ "justification": "No analysis of performance as a function of compute. CoT prompting generates more tokens (longer outputs) than standard prompting, increasing compute cost, but this trade-off is not quantified."
+ },
+ "benchmark_construct_validity": {
+ "applies": true,
+ "answer": false,
+ "justification": "The paper does not discuss whether the benchmarks actually measure 'reasoning ability' as claimed, or whether solve rate on these tasks is a valid proxy for reasoning. The connection between benchmark performance and the claimed cognitive ability is assumed rather than argued."
+ },
+ "scaffold_confound_addressed": {
+ "applies": false,
+ "answer": false,
+ "justification": "No scaffolding is involved. The approach is direct few-shot prompting without any agentic framework."
+ }
+ },
+ "data_leakage": {
+ "temporal_leakage_addressed": {
+ "applies": true,
+ "answer": false,
+ "justification": "No discussion of temporal leakage. Several benchmarks (MAWPS 2016, CSQA 2019) were published years before model training and could be in training data. No temporal analysis is provided."
+ },
+ "feature_leakage_addressed": {
+ "applies": true,
+ "answer": false,
+ "justification": "No discussion of feature leakage. The few-shot exemplars provide the reasoning structure that the model then imitates, but whether this constitutes a form of answer leakage for the test examples is not discussed."
+ },
+ "non_independence_addressed": {
+ "applies": true,
+ "answer": false,
+ "justification": "No discussion of whether training and test data are independent. Given the models were trained on large web corpora that could include these benchmarks, independence is not verified."
+ },
+ "leakage_detection_method": {
+ "applies": true,
+ "answer": false,
+ "justification": "No leakage detection or prevention methods are used. No canary strings, membership inference, decontamination, or temporal splits are applied."
}
}
},
"claims": [
{
- "claim": "Chain-of-thought prompting significantly improves the ability of large language models to perform complex reasoning tasks across arithmetic, commonsense, and symbolic domains.",
- "evidence": "Tables 1-5 and Figures 4, 7, 8 show consistent improvements across all three domains. For arithmetic, PaLM 540B improves from 17.9% to 56.9% on GSM8K (Table 1). For commonsense, PaLM 540B improves from 68.6% to 77.8% on StrategyQA (Table 4). For symbolic reasoning, PaLM 540B improves from 7.6% to 99.4% on last letter concatenation (Table 5).",
+ "claim": "Chain-of-thought prompting significantly improves the ability of large language models to perform complex reasoning tasks.",
+ "evidence": "Demonstrated across arithmetic (5 benchmarks), commonsense (5 benchmarks), and symbolic reasoning (2 tasks) with three model families. PaLM 540B on GSM8K: 17.9% → 56.9% (Section 3.2, Table 1, Figure 4).",
"supported": "strong"
},
{
- "claim": "Chain-of-thought reasoning is an emergent ability of model scale that does not positively impact performance for small models.",
- "evidence": "Table 2 shows that for models smaller than ~100B parameters, chain-of-thought prompting often hurts performance (e.g., LaMDA 420M on GSM8K drops from 2.6% to 0.4%). Improvements only appear with the largest models in each family. This pattern is consistent across all three model families and all benchmark types.",
+ "claim": "Chain-of-thought reasoning is an emergent ability of model scale, not positively impacting small models.",
+ "evidence": "Performance gains only appear at ~100B parameters across LaMDA, GPT-3, and PaLM (Figure 4, Tables 2-5). Small models produce 'fluent but illogical chains of thought' leading to lower performance than standard prompting (Section 3.2).",
"supported": "strong"
},
{
"claim": "PaLM 540B with chain-of-thought prompting achieves state-of-the-art on GSM8K, surpassing finetuned GPT-3 with a verifier.",
- "evidence": "Table 1 and Figure 2 show PaLM 540B + CoT achieves 56.9% on GSM8K (58.6% with external calculator), compared to the prior best of 55% from finetuned GPT-3 (Cobbe et al., 2021).",
- "supported": "strong"
+ "evidence": "PaLM 540B CoT achieves 56.9% on GSM8K vs prior best of 55% from Cobbe et al. (2021) finetuned GPT-3 with verifier (Figure 2, Table 1).",
+ "supported": "moderate"
},
{
- "claim": "Chain-of-thought prompting has larger performance gains for more complicated problems.",
- "evidence": "Table 3 and Figure 4 show that gains on GSM8K (challenging, multi-step) are much larger than on SingleOp (one-step, easy). PaLM 540B gains +39.0 on GSM8K but 0.0 on SingleOp. Section 3.2 and Appendix A.3 discuss this pattern.",
+ "claim": "The benefit of chain-of-thought prompting comes from the semantic reasoning content, not just additional computation or knowledge activation.",
+ "evidence": "Ablation study (Section 3.3, Figure 5): variable compute only (dots) performs the same as the baseline; reasoning after answer performs the same as the baseline; equation only helps partially but not as much as full CoT.",
"supported": "strong"
},
{
- "claim": "Chain-of-thought prompting facilitates length generalization to longer sequences in symbolic reasoning.",
- "evidence": "Table 5 and Figure 8 show that for OOD test sets (more steps than exemplars), standard prompting fails while chain-of-thought prompting achieves substantial performance. PaLM 540B achieves 94.8% on 3-word last letter concatenation (OOD) with CoT vs 0.2% with standard prompting.",
+ "claim": "Chain-of-thought prompting is robust across different annotators, exemplar sets, and exemplar orderings.",
+ "evidence": "Three annotators, three GSM8K-sampled exemplar sets, varying numbers of exemplars, and multiple orderings all outperform standard prompting (Section 3.4, Figure 6, Tables 6-7, Figure 11).",
"supported": "strong"
},
{
- "claim": "The benefit of chain-of-thought prompting comes from the natural language reasoning steps, not just extra computation or equation generation.",
- "evidence": "Section 3.3 ablation study shows that 'variable compute only' (dots instead of reasoning) performs about the same as baseline, and 'equation only' helps on easy problems but not GSM8K. 'Reasoning after answer' also matches baseline, showing the chain must precede the answer. Results in Figure 5 and Tables 6-7.",
- "supported": "strong"
+ "claim": "Chain-of-thought prompting facilitates length generalization to longer sequences in symbolic reasoning.",
+ "evidence": "Models prompted with 2-step exemplars generalize to 3- and 4-step problems for last letter concatenation and coin flip tasks (Section 5, Figure 8, Table 5). Performance is lower than in-domain but scales upward with model size.",
+ "supported": "moderate"
}
],
"methodology_tags": ["benchmark-eval"],
- "key_findings": "Chain-of-thought prompting, where few-shot exemplars include intermediate reasoning steps, substantially improves large language model performance on arithmetic, commonsense, and symbolic reasoning benchmarks. This is an emergent ability of model scale, only appearing in models with approximately 100B+ parameters, with larger gains on more complex tasks. PaLM 540B with chain-of-thought prompting achieved state-of-the-art on GSM8K (56.9%), surpassing finetuned models. Ablation studies show the benefit comes from natural language reasoning steps specifically, not from extra computation tokens or equation generation alone.",
+ "key_findings": "Chain-of-thought prompting—providing intermediate reasoning steps in few-shot exemplars—dramatically improves large language model performance on arithmetic, commonsense, and symbolic reasoning tasks, but only at model scales of ~100B+ parameters. PaLM 540B with CoT achieved state-of-the-art on GSM8K (56.9%) surpassing finetuned approaches. Ablation studies show the benefit comes from the semantic content of reasoning steps, not merely from additional computation or knowledge activation. The approach is robust across different annotators, exemplar sets, and model families.",
"red_flags": [
{
"flag": "Company evaluating own models",
- "detail": "All authors are Google Research employees evaluating Google's proprietary models (LaMDA, PaLM) alongside competitors. While GPT-3 and Codex results are included and show similar patterns, two of the three primary model families are Google products. The strongest results (state-of-the-art on GSM8K) are achieved by PaLM, a Google model."
- },
- {
- "flag": "No significance testing",
- "detail": "All comparative claims are based on comparing point estimates. Standard deviations are only reported for LaMDA 137B. No statistical tests are used to determine whether differences are significant, despite the paper making numerous 'X outperforms Y' claims."
+ "detail": "All authors are from Google Research, Brain Team. Two of the five model families evaluated (LaMDA and PaLM) are Google proprietary models. The paper demonstrates that Google's largest models benefit most from the technique."
},
{
"flag": "No contamination analysis",
- "detail": "Several benchmarks (CSQA 2019, MAWPS subsets, ASDiv 2020) were publicly available before model training. No analysis of whether benchmark data appeared in training sets. Training data cutoff dates are not reported for any model."
+ "detail": "Multiple benchmarks used (MAWPS 2016, CSQA 2019, GSM8K 2021) were publicly available before model training, yet no contamination analysis is performed. Training data cutoffs are not stated for any model."
},
{
- "flag": "Variance only for one model",
- "detail": "Standard deviations across seeds are only reported for LaMDA 137B. For GPT-3 and PaLM, only single-run results are reported with the justification that 'LaMDA experiments did not show large variance.' This assumes variance properties transfer across model families."
+ "flag": "Single metric evaluation",
+ "detail": "Only solve rate (accuracy) is reported across all experiments. No secondary metrics (e.g., reasoning chain quality, calibration, partial credit) are used to evaluate the approach."
},
{
- "flag": "No cost reporting",
- "detail": "Chain-of-thought prompting generates substantially more tokens than standard prompting, but no cost analysis (tokens consumed, latency, API costs) is provided. The paper explicitly acknowledges not estimating total compute."
+ "flag": "Incomplete variance reporting",
+ "detail": "Standard deviations across seeds are only reported for LaMDA 137B. For GPT-3 and PaLM (including the SOTA-claiming PaLM 540B results), only single-run numbers are reported, justified by LaMDA's low variance—but variance may differ across model families."
}
],
"cited_papers": [
{
- "title": "Training verifiers to solve math word problems",
- "authors": ["Karl Cobbe", "Vineet Kosaraju", "Mohammad Bavarian", "Jacob Hilton", "Reiichiro Nakano", "Christopher Hesse", "John Schulman"],
- "year": 2021,
- "arxiv_id": "2110.14168",
- "relevance": "Key baseline for math reasoning; introduces GSM8K benchmark and verifier approach that CoT prompting surpasses."
- },
- {
- "title": "Language models are few-shot learners",
+ "title": "Language Models are Few-Shot Learners",
"authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
"year": 2020,
- "relevance": "Foundational work on few-shot prompting with GPT-3 that CoT prompting builds upon and extends."
+ "relevance": "Foundational work on few-shot prompting with GPT-3, the baseline approach that CoT prompting extends."
},
{
- "title": "Evaluating large language models trained on code",
+ "title": "Training Verifiers to Solve Math Word Problems",
+ "authors": ["Karl Cobbe", "Vineet Kosaraju", "Mohammad Bavarian"],
+ "year": 2021,
+ "arxiv_id": "2110.14168",
+ "relevance": "Introduced GSM8K benchmark and the finetuned GPT-3 verifier approach that CoT prompting surpasses."
+ },
+ {
+ "title": "Evaluating Large Language Models Trained on Code",
"authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
"year": 2021,
"arxiv_id": "2107.03374",
- "relevance": "Introduces Codex, one of the models evaluated in CoT prompting experiments; foundational for code generation evaluation."
+ "relevance": "Introduces Codex and code evaluation methodology; one of the five model families evaluated with CoT prompting."
},
{
- "title": "Self-consistency improves chain of thought reasoning in language models",
- "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans", "Quoc Le", "Ed Chi", "Denny Zhou"],
+ "title": "Emergent Abilities of Large Language Models",
+ "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani"],
"year": 2022,
- "arxiv_id": "2203.11171",
- "relevance": "Follow-up work showing majority voting over multiple CoT samples further improves reasoning; key extension of the CoT paradigm."
+ "relevance": "Provides theoretical framing for the emergence of CoT reasoning at scale, directly tied to this paper's core finding."
},
{
- "title": "Emergent abilities of large language models",
- "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani"],
+ "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models",
+ "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"],
"year": 2022,
- "relevance": "Provides the theoretical framework for understanding CoT as an emergent ability of scale, directly referenced in this paper."
+ "arxiv_id": "2203.11171",
+ "relevance": "Follow-up work showing majority voting over sampled CoT generations further improves performance."
},
{
- "title": "Show your work: Scratchpads for intermediate computation with language models",
+ "title": "Show Your Work: Scratchpads for Intermediate Computation with Language Models",
"authors": ["Maxwell Nye", "Anders Johan Andreassen", "Guy Gur-Ari"],
"year": 2021,
"arxiv_id": "2112.00114",
- "relevance": "Closely related work using intermediate steps (scratchpads) for program execution; precursor to CoT prompting for reasoning."
+ "relevance": "Closest prior work using intermediate computation steps for program execution; CoT generalizes this to natural language."
},
{
- "title": "STaR: Bootstrapping reasoning with reasoning",
- "authors": ["Eric Zelikman", "Yuhuai Wu", "Noah D. Goodman"],
- "year": 2022,
- "arxiv_id": "2203.14465",
- "relevance": "Complementary approach to generating reasoning chains via self-training; relevant to understanding how reasoning capability scales."
+ "title": "Program Induction by Rationale Generation: Learning to Solve and Explain Algebraic Word Problems",
+ "authors": ["Wang Ling", "Dani Yogatama", "Chris Dyer"],
+ "year": 2017,
+ "relevance": "Pioneered natural language rationales for math problem solving, the training-based predecessor to CoT prompting."
},
{
- "title": "Training language models to follow instructions with human feedback",
- "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"],
+ "title": "Do As I Can, Not As I Say: Grounding Language in Robotic Affordances",
+ "authors": ["Michael Ahn", "Anthony Brohan", "Noah Brown"],
"year": 2022,
- "arxiv_id": "2203.02155",
- "relevance": "InstructGPT paper describing the instruction-tuned GPT-3 models used in CoT experiments; foundational for understanding prompting effectiveness."
+ "arxiv_id": "2204.01691",
+ "relevance": "SayCan robot planning benchmark used to evaluate CoT prompting for commonsense reasoning in robotic instruction following."
},
{
- "title": "Scaling language models: Methods, analysis & insights from training Gopher",
+ "title": "Scaling Language Models: Methods, Analysis & Insights from Training Gopher",
"authors": ["Jack W. Rae", "Sebastian Borgeaud", "Trevor Cai"],
"year": 2021,
"arxiv_id": "2112.11446",
- "relevance": "Demonstrates that scaling alone is insufficient for reasoning tasks, motivating the need for prompting techniques like CoT."
+ "relevance": "Documented that scaling alone is insufficient for reasoning tasks, motivating CoT prompting as an alternative approach."
},
{
- "title": "LaMDA: Language models for dialog applications",
- "authors": ["Romal Thoppilan", "Daniel De Freitas", "Jamie Hall"],
+ "title": "STaR: Bootstrapping Reasoning with Reasoning",
+ "authors": ["Eric Zelikman", "Yuhuai Wu", "Noah D. Goodman"],
"year": 2022,
- "arxiv_id": "2201.08239",
- "relevance": "Describes one of the three primary model families evaluated in this paper; relevant to understanding model capabilities."
+ "arxiv_id": "2203.14465",
+ "relevance": "Extends CoT idea to self-training: models generate rationales, filter correct ones, and finetune on them."
},
{
- "title": "Program induction by rationale generation: Learning to solve and explain algebraic word problems",
- "authors": ["Wang Ling", "Dani Yogatama", "Chris Dyer", "Phil Blunsom"],
- "year": 2017,
- "relevance": "Pioneered natural language rationales for math problem solving; direct predecessor to the chain-of-thought approach."
+ "title": "Finetuned Language Models Are Zero-Shot Learners",
+ "authors": ["Jason Wei", "Maarten Bosma", "Vincent Y. Zhao"],
+ "year": 2022,
+ "relevance": "Instruction tuning work (FLAN) that augments inputs with task instructions; CoT takes the orthogonal approach of augmenting outputs."
+ },
+ {
+ "title": "Program Synthesis with Large Language Models",
+ "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
+ "year": 2021,
+ "arxiv_id": "2108.07732",
+ "relevance": "Evaluates LLMs for code generation, related to using intermediate steps in program synthesis."
}
]
}
diff --git a/papers/codex-humaneval-2021/scan.json b/papers/codex-humaneval-2021/scan.json
@@ -65,418 +65,502 @@
"venue": "arXiv",
"arxiv_id": "2107.03374"
},
+ "scan_version": 2,
+ "active_modules": ["experimental_rigor", "data_leakage"],
"checklist": {
"artifacts": {
"code_released": {
"applies": true,
"answer": true,
- "justification": "The HumanEval dataset and evaluation framework are released at https://www.github.com/openai/human-eval (Section 2.2). Alignment evaluation data is released at https://github.com/openai/code-align-evals-data (Appendix E.4). However, the Codex model itself is not open-sourced."
+ "justification": "The HumanEval evaluation framework is released at https://www.github.com/openai/human-eval. Alignment evaluation data is released at https://github.com/openai/code-align-evals-data. However, the Codex model itself is not released."
},
"data_released": {
"applies": true,
"answer": true,
- "justification": "The HumanEval dataset of 164 hand-written programming problems is released at https://www.github.com/openai/human-eval (Section 1, Section 2.2). This is the paper's primary evaluation artifact."
+ "justification": "The HumanEval dataset of 164 hand-written programming problems is released at https://www.github.com/openai/human-eval, as stated in Section 2.2."
},
"environment_specified": {
"applies": true,
"answer": false,
- "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions using gVisor for sandboxing and Kubernetes infrastructure but does not provide dependency specifications for reproducing the evaluation."
+ "justification": "No requirements.txt, Dockerfile, or detailed environment setup is provided. The paper mentions Python and numpy (Figure 3) but does not provide enough detail to recreate the environment."
},
"reproduction_instructions": {
"applies": true,
"answer": false,
- "justification": "While the HumanEval dataset is released with an evaluation framework, the paper does not provide step-by-step instructions for reproducing the main experiments. The Codex models are not publicly available, making full reproduction impossible."
+ "justification": "No step-by-step reproduction instructions are provided. While the evaluation dataset is released, the model weights are not, and there are no scripts or README with commands to replicate the main experiments."
}
},
"statistical_methodology": {
"confidence_intervals_or_error_bars": {
"applies": true,
"answer": false,
- "justification": "The paper reports point estimates for pass@k metrics throughout (e.g., 28.8% for Codex-12B pass@1, 72.31% pass@100 in Table 1) without confidence intervals or error bars. The unbiased estimator in Equation 1 is used, but no uncertainty quantification is provided on the final numbers."
+ "justification": "All pass@k results in Tables 1 and 2 are reported as point estimates without confidence intervals or error bars. While the paper develops an unbiased estimator for pass@k (Equation 1, Appendix A), no uncertainty bounds on the estimates are provided."
},
"significance_tests": {
"applies": true,
"answer": false,
- "justification": "The paper makes numerous comparative claims (e.g., 'GPT-J-6B achieves 11.6% pass@1... roughly equivalent to Codex-300M') based solely on comparing numbers. No statistical significance tests are performed."
+ "justification": "Comparative claims (e.g., Codex outperforms GPT-J, Codex-S outperforms Codex) are made by comparing raw pass@k numbers without any statistical significance tests."
},
"effect_sizes_reported": {
"applies": true,
"answer": true,
- "justification": "The paper reports performance differences with baseline context throughout. For example, 'Codex-S outperforms the corresponding Codex by an average margin of 6.5 percentage points on pass@1 and by a larger average margin of 15.1 percentage points on pass@100' (Section 4.5). Pass@k results are given as percentages with comparisons across models and sizes (Table 1, Table 2)."
+ "justification": "Results are reported with full baseline context: Codex-12B achieves 28.8% pass@1 vs GPT-3 at 0% and GPT-J at 11.4% (Section 1). Codex-S's improvement over Codex is quantified as 'an average margin of 6.5 percentage points on pass@1 and 15.1 percentage points on pass@100' (Section 4.5). The magnitude of differences is clear throughout."
},
"sample_size_justified": {
"applies": true,
"answer": false,
- "justification": "The HumanEval dataset contains 164 problems and the paper generates n=200 samples per problem, but there is no justification for why 164 problems is sufficient or any power analysis. The choice of 200 samples is not justified either."
+ "justification": "The HumanEval dataset contains 164 problems but there is no justification for why 164 was chosen, no power analysis, and no discussion of whether this sample size is sufficient for the claims made."
},
"variance_reported": {
"applies": true,
"answer": false,
- "justification": "The paper uses an unbiased estimator for pass@k (Equation 1) but does not report variance, standard deviation, or any spread measure across experimental runs. Results appear to be from single evaluation runs."
+ "justification": "Results are reported as single point estimates. While the unbiased pass@k estimator (Equation 1) accounts for sampling variance mathematically, no standard deviations, error bars, or spread measures across experimental runs are reported."
}
},
"evaluation_design": {
"baselines_included": {
"applies": true,
"answer": true,
- "justification": "The paper compares against multiple baselines: GPT-3 (0% on HumanEval), GPT-Neo (125M, 1.3B, 2.7B), GPT-J-6B (11.62% pass@1), and TabNine (2.58% pass@1), all reported in Table 1 and Section 3.4."
+ "justification": "Multiple baselines are included: GPT-3 (various sizes), GPT-Neo (125M, 1.3B, 2.7B), GPT-J-6B, and TabNine, all evaluated on HumanEval (Table 1). For APPS, GPT-Neo 2.7B fine-tuned results from Hendrycks et al. serve as baseline (Table 2)."
},
"baselines_contemporary": {
"applies": true,
"answer": true,
- "justification": "GPT-Neo and GPT-J were released in 2021, the same year as this paper. TabNine was a leading commercial code completion system at the time. The APPS benchmark comparison (Table 2) uses GPT-Neo 2.7B fine-tuned on APPS. These were the relevant contemporary baselines."
+ "justification": "GPT-Neo, GPT-J, and TabNine were all contemporary at time of publication (2021). GPT-J was released in May 2021, the same year as this paper."
},
"ablation_study": {
"applies": true,
"answer": true,
- "justification": "The paper ablates multiple components: the effect of code fine-tuning (GPT vs. Codex), supervised fine-tuning (Codex vs. Codex-S), model size (8 model sizes from 12M to 12B in Table 1), sampling temperature optimization (Figure 5), and sample selection heuristics (mean log-prob vs. random vs. back-translation in Figure 7)."
+ "justification": "The paper systematically ablates key components: GPT (no code fine-tuning) vs Codex (code fine-tuned) vs Codex-S (supervised fine-tuned), showing the contribution of each stage. Model size is varied across 8 scales (12M to 12B). Different sampling strategies (random, mean log-prob, back-translation) are compared (Figure 7)."
},
"multiple_metrics": {
"applies": true,
"answer": true,
- "justification": "The paper reports pass@1, pass@10, and pass@100 (Table 1), as well as BLEU score analysis (Section 3.3, Figure 8). It also evaluates on both HumanEval and APPS (Table 2), with raw and filtered pass@k variants."
+ "justification": "Multiple evaluation metrics are used: pass@1, pass@10, pass@100 (Table 1), BLEU scores (Figure 8), test loss (Figure 4), and the APPS dataset metrics including raw and filtered pass@k (Table 2)."
},
"human_evaluation": {
"applies": true,
"answer": true,
- "justification": "The docstring generation model Codex-D is evaluated by hand-grading 10 samples per problem (1640 problems total) because 'there is no similar way to evaluate docstring samples automatically' (Section 5, Table 3)."
+ "justification": "Codex-D docstring outputs are graded by hand: 'we grade sample docstrings by hand, considering a docstring correct if it uniquely and accurately specifies the code body. Due to the time consuming nature of this process, we only grade 10 samples per problem, for a total of 1640 problems' (Section 5, Table 3)."
},
"held_out_test_set": {
"applies": true,
"answer": true,
- "justification": "HumanEval is a hand-written dataset of 164 original programming problems specifically created to benchmark the models (Section 2.2). The paper explicitly notes: 'It is important for these tasks to be hand-written, since our models are trained on a large fraction of GitHub, which already contains solutions to problems from a variety of sources.' The supervised fine-tuning training problems are separate from HumanEval."
+ "justification": "HumanEval is specifically designed as a held-out test set: 'It is important for these tasks to be hand-written, since our models are trained on a large fraction of GitHub, which already contains solutions to problems from a variety of sources' (Section 2.2)."
},
"per_category_breakdown": {
"applies": true,
"answer": true,
- "justification": "Results are broken down by model size (Table 1), by APPS difficulty level (Introductory/Interview/Competition in Table 2), by sampling temperature (Figure 5), and by number of chained operations in the synthetic task evaluation (Figure 11)."
+ "justification": "Table 1 breaks down results by model size across all baselines. Table 2 breaks APPS results by difficulty level (Introductory, Interview, Competition). Figure 11 shows performance degradation by chain length. Figure 5 breaks down results by temperature."
},
"failure_cases_discussed": {
"applies": true,
"answer": true,
- "justification": "Section 6 (Limitations) extensively discusses failure modes: difficulty with long operation chains (Figure 11), variable binding errors (explicit code example shown), suggesting syntactically incorrect or undefined code, and struggling with system-level specifications. Appendix B shows 8 random problems with both correct and incorrect completions."
+ "justification": "Section 6 (Limitations) provides detailed failure analysis: difficulty with long chains of operations (Figure 11), variable binding errors (concrete code example in Section 6), misalignment producing buggy code when prompted with buggy code (Figure 12), and insecure code generation (Figure 15, Appendix G)."
},
"negative_results_reported": {
"applies": true,
"answer": true,
- "justification": "Several negative results are reported: (1) 'Surprisingly, we did not observe improvements when starting from a pre-trained language model' for code fine-tuning (Section 3.2); (2) back-translation ranking underperforms mean log-probability ranking (Section 5, Figure 7); (3) model performance degrades exponentially with docstring complexity (Section 6, Figure 11); (4) BLEU score does not reliably indicate functional correctness (Section 3.3, Figure 8)."
+ "justification": "Multiple negative results: 'we did not observe improvements when starting from a pre-trained language model' (Section 3.2); back-translation ranking 'underperforms mean log-probability ranking' (Section 5); 'choosing the sample based on sum log probability can perform slightly worse than picking randomly' (Section 3.3); misalignment worsens with scale (Figure 12)."
}
},
"claims_and_evidence": {
"abstract_claims_supported": {
"applies": true,
"answer": true,
- "justification": "The abstract claims Codex solves 28.8% of HumanEval (supported by Table 1: 28.81%), GPT-3 solves 0% (supported by Section 3.4), GPT-J solves 11.4% (supported by Table 1: 11.62%), and 70.2% solved with 100 samples (supported by pass@100 results). The 70.2% figure appears to refer to Codex-S at 77.5% pass@100 or Codex-12B at 72.31% — there is a minor discrepancy but the claim is approximately supported."
+ "justification": "All abstract claims are supported: 28.8% pass rate (Table 1), GPT-3 at 0% (Table 1), GPT-J at 11.4% (Table 1), 70.2% with 100 samples (Section 1, consistent with Figure 1 showing Codex-S at 77.5%). The abstract's qualitative claims about limitations are supported by Section 6."
},
"causal_claims_justified": {
"applies": true,
"answer": true,
- "justification": "The paper makes causal claims through ablations that are well-controlled: the effect of code fine-tuning is isolated by comparing same-architecture GPT vs. Codex models, supervised fine-tuning effect by comparing Codex vs. Codex-S at the same sizes, and model size scaling by holding training procedure constant. The alignment experiment (Appendix E, Figure 14) uses controlled prompt manipulation to establish that buggy context causes worse code generation."
+ "justification": "Causal claims about fine-tuning improving performance are supported by controlled comparisons: same base model architecture with and without code fine-tuning (GPT vs Codex), and with and without supervised fine-tuning (Codex vs Codex-S). Each comparison varies a single factor. The alignment analysis (Appendix E) carefully distinguishes capability from alignment."
},
"generalization_bounded": {
"applies": true,
- "answer": false,
- "justification": "The title 'Evaluating Large Language Models Trained on Code' is broader than the tested setting. The paper primarily evaluates Python function synthesis from docstrings on 164 problems. While the APPS evaluation provides additional evidence, generalizations to other programming languages or code generation tasks are not explicitly bounded. The paper does note Python-specific limitations in the economics section (Appendix H) but not in the technical claims."
+ "answer": true,
+ "justification": "The paper explicitly bounds its scope: 'In this work, we focus on the task of generating standalone Python functions from docstrings' (Section 1). The abstract states 'study its Python code-writing capabilities.' While the broader impacts section discusses general code generation, the empirical claims are bounded to Python."
},
"alternative_explanations_discussed": {
"applies": true,
"answer": true,
- "justification": "The paper discusses several alternative explanations: (1) for the alignment finding, they consider whether it could be a robustness failure rather than misalignment, concluding it is unlikely since GitHub contains plenty of poor-quality code (Appendix E.3); (2) for code fine-tuning showing no benefit from pre-training, they hypothesize 'possibly because the fine-tuning dataset is so large' (Section 3.2); (3) for Codex-S preferring higher temperatures, they suggest it 'possibly reflects the fact that Codex-S captures a narrower distribution' (Section 4.5)."
+ "justification": "Section 7.2 and Appendix E explicitly distinguish misalignment from incompetence as alternative explanations for model failures. Appendix E.3 considers whether poor performance on buggy prompts could be a robustness failure rather than misalignment. Section 4 discusses data distribution mismatch as a factor."
+ },
+ "proxy_outcome_distinction": {
+ "applies": true,
+ "answer": true,
+ "justification": "Section 2.1 explicitly argues for functional correctness over BLEU score as the evaluation metric, and Figure 8 demonstrates empirically that BLEU is a poor proxy for correctness. The paper measures pass@k and claims code generation capability — the measurement and claim are well-aligned with minimal proxy gap."
}
},
"setup_transparency": {
"model_versions_specified": {
"applies": true,
- "answer": false,
- "justification": "The paper describes model sizes (12M to 12B parameters) and that Codex is fine-tuned from the GPT-3 model family, but does not provide specific model version identifiers or snapshot dates. GPT-3 and GPT-J are referenced generically without version numbers."
+ "answer": true,
+ "justification": "Exact model sizes are specified for all models evaluated: Codex at 12M, 25M, 42M, 85M, 300M, 679M, 2.5B, and 12B parameters; GPT-Neo at 125M, 1.3B, 2.7B; GPT-J at 6B (Table 1). Since the Codex variants are the authors' own models, parameter counts uniquely identify each of them."
},
"prompts_provided": {
"applies": true,
"answer": true,
- "justification": "The paper provides the prompt format for HumanEval evaluation (Figure 2), the pass@k computation code (Figure 3), example prompts for alignment evaluation (Appendix E.5, Examples 1-2), and the building blocks for synthetic tasks (Appendix C). The HumanEval dataset itself (released publicly) contains all prompts."
+ "justification": "Figure 2 shows three complete example prompts with the exact format used (header, signature, docstring). Stop sequences are specified: '\\nclass', '\\ndef', '\\n#', '\\nif', '\\nprint' (Section 3.2). Appendix B provides 8 additional full prompt examples. Appendix E.5 shows alignment evaluation prompts."
},
"hyperparameters_reported": {
"applies": true,
"answer": true,
- "justification": "Training hyperparameters are reported: 175-step linear warmup, cosine learning rate decay, 100 billion tokens, Adam optimizer with beta1=0.9, beta2=0.95, epsilon=10^-8, weight decay 0.1 (Section 3.2). Sampling uses nucleus sampling with top-p=0.95 (Section 3.2). Optimal temperatures are reported: T*=0.2 for pass@1, T*=0.8 for pass@100 (Section 3.3). Codex-S training uses 1/10 learning rate (Section 4.4)."
+ "justification": "Detailed hyperparameters are reported: nucleus sampling with top p=0.95, temperatures tested (0.2, 0.4, 0.8), n=200 samples per task. Training: 175-step linear warmup, cosine learning rate decay, 100 billion tokens, Adam with β1=0.9, β2=0.95, ε=10⁻⁸, weight decay 0.1 (Section 3.2)."
},
"scaffolding_described": {
"applies": false,
"answer": false,
- "justification": "No agentic scaffolding is used. Codex generates code through direct autoregressive sampling with stop sequences, not through an agent loop."
+ "justification": "No agentic scaffolding is used. Codex performs direct model inference from prompts without any tool use, retry logic, or multi-step workflow."
},
"data_preprocessing_documented": {
"applies": true,
"answer": true,
- "justification": "Training data collection is described in Section 3.1: collected May 2020 from 54 million public GitHub repos, 179 GB of unique Python files under 1 MB, filtered out auto-generated files, files with average line length >100, max line length >1000, or small percentage of alphanumeric characters. Final dataset: 159 GB. Supervised fine-tuning data filtering is described in Section 4.3."
+ "justification": "Section 3.1 documents data preprocessing in detail: collected from 54 million GitHub repos (179 GB unique Python files under 1 MB), filtered out auto-generated files, average line length >100, max line length >1000, low alphanumeric percentage, resulting in 159 GB final dataset. Tokenizer adaptations for whitespace are described (Section 3.2)."
}
},
"limitations_and_scope": {
"limitations_section_present": {
"applies": true,
"answer": true,
- "justification": "Section 6 is entirely dedicated to 'Limitations,' discussing sample efficiency, failure modes with complex docstrings, variable binding issues, and referencing Appendix D for the evaluation framework. Section 7 provides extensive broader impacts and hazard analysis."
+ "justification": "Section 6 is a dedicated 'Limitations' section discussing specific shortcomings. Additionally, Section 7 (Broader Impacts) provides extensive discussion of risks and limitations across over-reliance, misalignment, bias, security, and economic impacts."
},
"threats_to_validity_specific": {
"applies": true,
"answer": true,
- "justification": "The paper discusses specific threats: (1) HumanEval problems may not be fully novel despite being hand-written, as 'there are more than ten public repositories containing solutions to Codeforces problems' (Section 2.2); (2) performance degrades exponentially with docstring complexity (Section 6, Figure 11); (3) the model may 'deliberately' introduce bugs matching context quality (Appendix E); (4) insecure code generation with specific frequency measurements (Appendix G.3, Figure 15)."
+ "justification": "Section 6 discusses specific threats: Codex is 'not sample efficient to train,' struggles with 'docstrings describing long chains of operations' (quantified in Figure 11), and has 'difficulty with binding operations to variables' (concrete code example provided). Appendix E discusses specific alignment threats with empirical evidence."
},
"scope_boundaries_stated": {
"applies": true,
"answer": true,
- "justification": "The paper explicitly states scope boundaries: 'In this work, we focus on the task of generating standalone Python functions from docstrings' (Section 1). It acknowledges 'coding is a broad activity which involves much more than synthesizing code from docstrings' (Section 8). The system-level limitations discussion (Appendix D) explicitly states what Codex cannot do: 'The current capabilities of synthesis methodologies are only able to tackle tightly specified, constrained problem instances or narrow tasks.'"
+ "justification": "The paper explicitly states it focuses on 'generating standalone Python functions from docstrings' (Section 1). Section 6 notes the model 'struggles to parse through increasingly long and higher-level or system-level specifications.' The broader impacts section (7.5) states 'at their current level of capability, Codex models do not materially lower the barrier to entry for malware development.'"
}
},
"data_integrity": {
"raw_data_available": {
"applies": true,
- "answer": true,
- "justification": "The HumanEval dataset with all 164 problems, unit tests, and reference solutions is publicly released at https://www.github.com/openai/human-eval (Section 2.2). The alignment evaluation data is at https://github.com/openai/code-align-evals-data (Appendix E.4). However, model outputs (generated samples) are not released."
+ "answer": false,
+ "justification": "The HumanEval evaluation dataset is released, but the 159 GB training dataset is not. The training data cannot be independently verified. The model weights are also not released, preventing independent replication of results."
},
"data_collection_described": {
"applies": true,
"answer": true,
- "justification": "Training data collection is described in detail in Section 3.1 (54 million GitHub repos, May 2020, filtering criteria). HumanEval creation is described in Section 2.2 (hand-written, 164 problems, average 7.7 tests per problem). Supervised fine-tuning data sources are described in Sections 4.1 (competitive programming sites, 10,000 curated problems) and 4.2 (continuous integration tracing, ~40,000 functions)."
+ "justification": "Section 3.1 describes data collection in detail: '54 million public software repositories hosted on GitHub, containing 179 GB of unique Python files under 1 MB,' collected in May 2020. Filtering criteria are specified. Sections 4.1 and 4.2 describe supervised fine-tuning data collection from competitive programming sites and CI-traced projects."
},
"recruitment_methods_described": {
"applies": false,
"answer": false,
- "justification": "No human participants were recruited for the study. The HumanEval problems were written by the authors themselves. The docstring evaluation was graded by unspecified individuals (likely the authors), which is a limitation but does not constitute a human subjects study."
+ "justification": "No human participants are involved. Training data comes from public GitHub repositories. HumanEval problems were hand-written by the authors. The Codex-D docstring grading (Section 5) is done by the authors, not recruited participants."
},
"data_pipeline_documented": {
"applies": true,
"answer": true,
- "justification": "The pipeline is documented: GitHub collection (54M repos, 179 GB) → filtering (auto-generated, line length, alphanumeric ratio) → final dataset (159 GB) (Section 3.1). For supervised fine-tuning: problem curation → Codex-12B verification (100 samples per problem, filtering ambiguous/non-deterministic) → final training set (Section 4.3). The tokenization process is also described (Section 3.2)."
+ "justification": "The data pipeline is documented: GitHub scraping (179 GB) → filtering by auto-generation, line length, alphanumeric content (159 GB final). For Codex-S training data: competitive programming problems (10,000 curated) + CI-traced functions (~40,000) → quality filtering using Codex-12B to remove ambiguous/stateful problems (Section 4.3)."
}
},
"conflicts_of_interest": {
"funding_disclosed": {
"applies": true,
- "answer": false,
- "justification": "The Acknowledgements section thanks individuals and teams but does not disclose funding sources or grants. The work is from OpenAI and partner organizations but no explicit funding statement is provided."
+ "answer": true,
+ "justification": "The acknowledgments section states 'we thank GitHub for partnering to build GitHub Copilot and Microsoft Azure for supporting model training with infrastructure management,' disclosing corporate support for the research."
},
"affiliations_disclosed": {
"applies": true,
"answer": true,
- "justification": "Author affiliations are clearly listed: '1 OpenAI', '2 Anthropic AI... Work performed while at OpenAI', '3 Zipline... Work performed while at OpenAI'. The paper is authored primarily by OpenAI employees evaluating their own model."
+ "justification": "Author affiliations are clearly listed: '1 OpenAI, San Francisco, California, USA. 2 Anthropic AI, San Francisco, California, USA. Work performed while at OpenAI. 3 Zipline, South San Francisco, California, USA. Work performed while at OpenAI.'"
},
"funder_independent_of_outcome": {
"applies": true,
"answer": false,
- "justification": "OpenAI employees are evaluating OpenAI's own Codex model, which powers the commercial GitHub Copilot product. The funder (OpenAI) has a direct financial interest in demonstrating strong Codex performance. This conflict is not acknowledged in the paper."
+ "justification": "OpenAI has a direct financial interest in Codex's performance — the paper states 'A distinct production version of Codex powers GitHub Copilot.' OpenAI is evaluating its own commercial product. Microsoft Azure (infrastructure provider) and GitHub (partner for Copilot) also have financial stakes."
},
"financial_interests_declared": {
"applies": true,
"answer": false,
- "justification": "No competing interests or financial interests statement is provided. The paper explicitly states 'A distinct production version of Codex powers GitHub Copilot' but does not include a formal conflicts-of-interest disclosure, despite the clear commercial interest."
+ "justification": "No competing interests or financial disclosure statement is present. The paper does not include a standard conflicts-of-interest declaration, despite OpenAI's commercial interest in Codex through GitHub Copilot."
}
},
"contamination": {
"training_cutoff_stated": {
"applies": true,
"answer": true,
- "justification": "The training data collection date is stated: 'Our training dataset was collected in May 2020 from 54 million public software repositories hosted on GitHub' (Section 3.1). This establishes the training data cutoff."
+ "justification": "Section 3.1 states 'Our training dataset was collected in May 2020 from 54 million public software repositories hosted on GitHub.'"
},
"train_test_overlap_discussed": {
"applies": true,
"answer": true,
- "justification": "The paper explicitly addresses this: 'It is important for these tasks to be hand-written, since our models are trained on a large fraction of GitHub, which already contains solutions to problems from a variety of sources. For example, there are more than ten public repositories containing solutions to Codeforces problems' (Section 2.2). HumanEval was specifically created to avoid overlap."
+ "justification": "Section 2.2 explicitly discusses this concern: 'It is important for these tasks to be hand-written, since our models are trained on a large fraction of GitHub, which already contains solutions to problems from a variety of sources. For example, there are more than ten public repositories containing solutions to Codeforces problems, which make up part of the recently proposed APPS dataset.'"
},
"benchmark_contamination_addressed": {
"applies": true,
"answer": true,
- "justification": "HumanEval was hand-written specifically to avoid contamination from GitHub training data (Section 2.2). For APPS, the paper acknowledges that Codex was not fine-tuned on APPS, providing a 1-shot setting instead (Section 3.5). The legal section notes that Codex rarely generates code identical to training data (<0.1%, Section 7.7)."
+ "justification": "HumanEval was specifically designed to avoid contamination: hand-written problems created after the training data collection (May 2020). The paper notes 'Though not a guarantee for problem novelty, all problems were hand-written and not programmatically copied from existing sources' (Figure 2 caption). For APPS, contamination risk is acknowledged."
}
},
"human_studies": {
"pre_registered": {
"applies": false,
"answer": false,
- "justification": "No human participants. This is a benchmark evaluation paper with no human subjects study."
+ "justification": "No human participants in this study. The paper evaluates models on benchmarks."
},
"irb_or_ethics_approval": {
"applies": false,
"answer": false,
- "justification": "No human participants. No IRB approval needed for benchmark evaluation."
+ "justification": "No human participants in this study."
},
"demographics_reported": {
"applies": false,
"answer": false,
- "justification": "No human participants in the study."
+ "justification": "No human participants in this study."
},
"inclusion_exclusion_criteria": {
"applies": false,
"answer": false,
- "justification": "No human participants in the study."
+ "justification": "No human participants in this study."
},
"randomization_described": {
"applies": false,
"answer": false,
- "justification": "No human participants or experimental conditions involving humans."
+ "justification": "No human participants in this study."
},
"blinding_described": {
"applies": false,
"answer": false,
- "justification": "No human participants or experimental conditions involving humans."
+ "justification": "No human participants in this study."
},
"attrition_reported": {
"applies": false,
"answer": false,
- "justification": "No human participants in the study."
+ "justification": "No human participants in this study."
}
},
"cost_and_practicality": {
"inference_cost_reported": {
"applies": true,
"answer": false,
- "justification": "The paper does not report inference cost, latency, or tokens consumed per problem. It generates n=200 samples per problem with models up to 12B parameters but does not quantify the computational cost of this evaluation."
+ "justification": "No inference costs are reported. The paper generates 200 samples per problem for 164 problems across multiple model sizes and temperatures, but does not report the total inference cost, tokens consumed, or wall-clock time."
},
"compute_budget_stated": {
"applies": true,
"answer": true,
- "justification": "Section 7.6 states: 'The original training of GPT-3-12B consumed hundreds of petaflop/s-days of compute, while fine-tuning it to create Codex-12B consumed a similar amount of compute.' The training platform (Azure) is mentioned. However, total API/inference compute is not quantified."
+ "justification": "Section 7.6 states 'The original training of GPT-3-12B consumed hundreds of petaflop/s-days of compute, while fine-tuning it to create Codex-12B consumed a similar amount of compute.' Section 3.2 states training was for 100 billion tokens. The platform (Azure) is identified."
+ }
+ },
+ "experimental_rigor": {
+ "seed_sensitivity_reported": {
+ "applies": true,
+ "answer": false,
+ "justification": "No seed sensitivity analysis is reported. Results are generated from a single set of 200 samples per problem with no discussion of how results vary across random seeds."
+ },
+ "number_of_runs_stated": {
+ "applies": true,
+ "answer": true,
+ "justification": "The number of samples is clearly stated: 'we generate n ≥ k samples per task (in this paper, we use n = 200 and k ≤ 100)' (Section 2.1). For APPS evaluation, 1000 samples are generated per task."
+ },
+ "hyperparameter_search_budget": {
+ "applies": true,
+ "answer": false,
+ "justification": "While Figure 5 shows pass@k at different temperatures (0.2, 0.4, 0.8), no formal hyperparameter search budget is stated (number of configurations tried, search method, or total compute spent on search)."
+ },
+ "best_config_selection_justified": {
+ "applies": true,
+ "answer": true,
+ "justification": "The temperature selection process is transparent: Figure 5 plots pass@k against temperature for various k values, and the optimal temperature is selected from the upper hull. They report 'the optimal temperature for pass@1 is T*=0.2 and the optimal temperature for pass@100 is T*=0.8' for the 679M model (Section 3.3)."
+ },
+ "multiple_comparison_correction": {
+ "applies": false,
+ "answer": false,
+ "justification": "No formal statistical tests with p-values are performed, so multiple comparison correction is not applicable."
+ },
+ "self_comparison_bias_addressed": {
+ "applies": true,
+ "answer": false,
+ "justification": "The authors compare their own Codex against GPT-Neo, GPT-J, and TabNine without acknowledging the potential bias of authors evaluating their own system. There is no discussion of how having full control over one system but not others might affect the comparison."
+ },
+ "compute_budget_vs_performance": {
+ "applies": true,
+ "answer": true,
+ "justification": "Figures 1, 4, and 6 show performance as a function of model size (a proxy for compute). Figure 4 shows test loss follows a power law with model size. Figure 6 shows pass@1 and pass@100 scale as sigmoids in log-parameters."
+ },
+ "benchmark_construct_validity": {
+ "applies": true,
+ "answer": true,
+ "justification": "Section 2.1 provides extensive discussion of construct validity: argues functional correctness is superior to BLEU for measuring code generation, shows empirically that BLEU fails to distinguish correct from incorrect code (Figure 8), and discusses how functional correctness mirrors real software development practice (test-driven development)."
+ },
+ "scaffold_confound_addressed": {
+ "applies": false,
+ "answer": false,
+ "justification": "No scaffolding is involved. All models are evaluated via direct inference with the same prompting approach, so there is no scaffold confound."
+ }
+ },
+ "data_leakage": {
+ "temporal_leakage_addressed": {
+ "applies": true,
+ "answer": true,
+ "justification": "Training data was collected in May 2020 (Section 3.1). HumanEval was hand-written specifically for this evaluation and did not exist before the training data collection, inherently addressing temporal leakage. For APPS, the paper acknowledges that GitHub contains Codeforces solutions."
+ },
+ "feature_leakage_addressed": {
+ "applies": true,
+ "answer": false,
+ "justification": "The paper does not explicitly discuss whether the evaluation setup leaks information through features. For example, function signatures and docstring style in HumanEval may provide implicit cues not available in real usage scenarios."
+ },
+ "non_independence_addressed": {
+ "applies": true,
+ "answer": true,
+ "justification": "Section 2.2 addresses non-independence by hand-writing HumanEval: 'It is important for these tasks to be hand-written, since our models are trained on a large fraction of GitHub, which already contains solutions to problems from a variety of sources.' This ensures test problems are independent of training data."
+ },
+ "leakage_detection_method": {
+ "applies": true,
+ "answer": false,
+ "justification": "No concrete leakage detection method is applied (no canary strings, membership inference, or n-gram overlap analysis). While hand-writing HumanEval is a prevention strategy, the paper acknowledges 'Though not a guarantee for problem novelty' (Figure 2) and no verification method confirms the problems do not overlap with training data."
}
}
},
"claims": [
{
- "claim": "Codex-12B solves 28.8% of HumanEval problems with a single sample (pass@1).",
- "evidence": "Table 1 reports Codex-12B pass@1 = 28.81%. Figure 1 shows pass rates as a function of model size.",
+ "claim": "Codex-12B solves 28.8% of HumanEval problems with a single sample, while GPT-3 solves 0% and GPT-J solves 11.4%.",
+ "evidence": "Table 1 reports pass@1 for Codex-12B at 28.81%, GPT-Neo models at 0.75-6.41%, and GPT-J at 11.62% (the abstract quotes 11.4% for GPT-J; the table value differs slightly). GPT-3 models achieve near 0% (Section 3.4).",
"supported": "strong"
},
{
- "claim": "Repeated sampling is a surprisingly effective strategy: Codex-S solves 77.5% of problems with 100 samples.",
- "evidence": "Section 1 and Section 4.5 report this result. Table 1 shows Codex-12B pass@100 = 72.31%. Codex-S further improves this.",
+ "claim": "With 100 samples per problem, Codex-S solves 77.5% of HumanEval problems.",
+ "evidence": "Figure 1 shows Codex-S pass@100 at 77.5%. Section 4.5 confirms the result and shows consistent improvements over Codex across model sizes.",
"supported": "strong"
},
{
- "claim": "GPT-3 solves 0% of HumanEval problems while GPT-J achieves 11.4%.",
- "evidence": "Section 3.4 confirms: GPT models achieve near 0%, GPT-J-6B achieves 11.62% pass@1 (Table 1).",
+ "claim": "Supervised fine-tuning on correctly implemented functions (Codex-S) improves pass@1 by an average of 6.5 percentage points and pass@100 by 15.1 percentage points over Codex.",
+ "evidence": "Section 4.5 reports these margins averaged across model sizes, with Figure 10 showing the comparison visually.",
"supported": "strong"
},
{
- "claim": "Codex-S outperforms Codex by an average margin of 6.5 percentage points on pass@1.",
- "evidence": "Section 4.5 states this directly. The margin increases to 15.1 percentage points on pass@100.",
- "supported": "moderate"
+ "claim": "BLEU score is not a reliable indicator of functional correctness for code generation.",
+ "evidence": "Figure 8 shows significant overlap between BLEU score distributions of correct and incorrect solutions for 4 HumanEval tasks, demonstrating that functionally inequivalent programs can have higher BLEU scores than correct ones (Section 3.3).",
+ "supported": "strong"
},
{
- "claim": "BLEU score is not a reliable indicator of functional correctness.",
- "evidence": "Section 3.3 and Figure 8 show significant overlap between BLEU score distributions for correct and incorrect solutions, meaning 'improvements in BLEU score may not indicate improved rates of functional correctness in practice.'",
+ "claim": "Model performance on chained operations degrades exponentially with docstring complexity.",
+ "evidence": "Figure 11 shows pass rate dropping by roughly a factor of 2-3 per additional chained component in synthetic tasks (Section 6).",
"supported": "strong"
},
{
- "claim": "Performance follows smooth power law scaling with model size.",
- "evidence": "Figure 4 shows test loss scaling as (N/5.92x10^7)^-0.13. Figure 6 shows pass@1 and pass@100 scale smoothly as a sigmoid in log-parameters.",
+ "claim": "Test loss after code fine-tuning follows a power law with model size.",
+ "evidence": "Figure 4 shows the power law relationship with functional form (N / 5.92×10^7)^-0.13, closely fitting the empirical data across model sizes (Section 3.3).",
"supported": "strong"
},
{
- "claim": "Codex model performance degrades exponentially as docstring complexity increases.",
- "evidence": "Section 6 and Figure 11 show pass rate drops by roughly a factor of 2-3 with each additional chained building block in the synthetic task.",
+ "claim": "Mean token log probability is an effective heuristic for selecting the best sample from multiple generations.",
+ "evidence": "Figure 7 shows mean log-probability ranking significantly outperforms random selection and back-translation ranking, reaching 44.5% when selecting from 100 samples (Section 3.3).",
"supported": "strong"
},
{
- "claim": "Codex exhibits misalignment: it produces worse code when prompted with buggy code, even when instructed to write correct code.",
- "evidence": "Appendix E.3 and Figures 12/14 show this effect. The gap increases with model size. An instruction to write correct code 'helps a little but does not fix the problem.'",
+ "claim": "Codex models frequently generate insecure cryptographic code configurations.",
+ "evidence": "Figure 15 shows Codex produces insecure RSA keys (<2048 bits) or AES contexts (ECB mode) in a significant fraction of samples across model sizes (Appendix G.3). However, the study covers only two cryptographic scenarios.",
+ "supported": "moderate"
+ },
+ {
+ "claim": "When prompted with subtly buggy code, Codex produces worse code than it is capable of, and this gap increases with model size.",
+ "evidence": "Figure 12 shows the gap between performance with correct context vs buggy context grows with model size. Adding an instruction to write correct code helps partially but does not eliminate the gap (Appendix E).",
"supported": "strong"
}
],
- "methodology_tags": [
- "benchmark-eval"
- ],
- "key_findings": "Codex, a GPT model fine-tuned on 159 GB of GitHub Python code, solves 28.8% of hand-written HumanEval programming problems with a single sample and 72.3% with 100 samples per problem. Supervised fine-tuning on curated standalone functions (Codex-S) improves pass@1 by 6.5 percentage points on average. The paper demonstrates that functional correctness via unit tests is a more reliable evaluation metric than BLEU score, and introduces the unbiased pass@k estimator. The paper also identifies misalignment behavior where model output quality degrades to match prompt quality, an effect that worsens with scale.",
+ "methodology_tags": ["benchmark-eval"],
+ "key_findings": "Codex, a GPT model fine-tuned on 159 GB of GitHub Python code, solves 28.8% of hand-written HumanEval problems with a single sample, vastly outperforming GPT-3 (0%) and GPT-J (11.4%). Repeated sampling is surprisingly effective: 100 samples per problem yield 77.5% pass rate with Codex-S. The paper establishes functional correctness (pass@k) as the appropriate metric over BLEU, demonstrates power law scaling with model size, and provides extensive analysis of limitations including exponential degradation with docstring complexity, misalignment that worsens with scale, and frequent generation of insecure code.",
"red_flags": [
{
"flag": "Company evaluating own product",
- "detail": "The paper is authored predominantly by OpenAI employees evaluating OpenAI's Codex model, which powers the commercial GitHub Copilot product. This conflict of interest is not formally acknowledged despite the direct commercial stake in positive results."
- },
- {
- "flag": "No confidence intervals or significance tests",
- "detail": "Despite making numerous comparative claims across model sizes and baselines, no confidence intervals, error bars, or statistical significance tests are reported. All comparisons are based on point estimates from what appears to be single evaluation runs."
+ "detail": "OpenAI authors evaluate Codex, which powers their commercial product GitHub Copilot. The paper acknowledges the relationship ('A distinct production version of Codex powers GitHub Copilot') but does not include a formal conflicts-of-interest statement. The funder (OpenAI) has direct financial interest in demonstrating strong Codex performance."
},
{
- "flag": "Model not publicly released",
- "detail": "While the HumanEval benchmark is released, the Codex models are not publicly available. This means independent researchers cannot fully reproduce the main results or verify the reported numbers."
+ "flag": "No uncertainty quantification on main results",
+ "detail": "Despite developing an unbiased estimator for pass@k, the paper reports all main results as point estimates without confidence intervals, error bars, or variance across runs. With 164 problems in HumanEval, the standard error on aggregate pass rates could be substantial."
},
{
- "flag": "Small benchmark size",
- "detail": "HumanEval contains only 164 problems. With pass@1 of 28.8%, this means roughly 47 problems are solved. Small sample effects could be significant but are not quantified."
+ "flag": "Training data and model weights not released",
+ "detail": "The 159 GB training dataset and model weights are not released. While the evaluation dataset (HumanEval) is public, the core claims about Codex's performance cannot be independently replicated because neither the training data nor the model is available."
}
],
"cited_papers": [
{
"title": "Language Models are Few-Shot Learners",
- "authors": ["Tom B. Brown", "Benjamin Mann", "Nick Ryder"],
+ "authors": ["Brown, T. B.", "Mann, B.", "Ryder, N.", "et al."],
"year": 2020,
- "relevance": "GPT-3 paper; foundational large language model serving as the base for Codex fine-tuning and a key baseline."
+ "arxiv_id": "2005.14165",
+ "relevance": "Foundation GPT-3 model that Codex is fine-tuned from; baseline for code generation capability comparison."
},
{
"title": "Measuring Coding Challenge Competence with APPS",
- "authors": ["Dan Hendrycks", "Steven Basart", "Saurav Kadavath"],
+ "authors": ["Hendrycks, D.", "Basart, S.", "Kadavath, S.", "et al."],
"year": 2021,
"arxiv_id": "2105.09938",
- "relevance": "Introduces the APPS benchmark for evaluating code generation, used as a secondary evaluation dataset in this paper."
+ "relevance": "Coding challenge benchmark used to evaluate Codex alongside HumanEval; measures functional correctness on competitive programming tasks."
+ },
+ {
+ "title": "GPT-J-6B: A 6 Billion Parameter Autoregressive Language Model",
+ "authors": ["Wang, B.", "Komatsuzaki, A."],
+ "year": 2021,
+ "relevance": "Open-source language model baseline for code generation, trained on The Pile with 8% GitHub code."
},
{
"title": "GPT-Neo: Large Scale Autoregressive Language Modeling with Mesh-Tensorflow",
- "authors": ["Sid Black", "Leo Gao", "Phil Wang"],
+ "authors": ["Black, S.", "Gao, L.", "Wang, P.", "Leahy, C.", "Biderman, S."],
"year": 2021,
- "relevance": "Open-source GPT alternative trained on The Pile; used as a baseline comparison for code generation capability."
+ "relevance": "Open-source GPT-style model serving as baseline for code generation; trained on The Pile."
+ },
+ {
+ "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages",
+ "authors": ["Feng, Z.", "Guo, D.", "Tang, D.", "et al."],
+ "year": 2020,
+ "relevance": "Pre-trained code representation model using BERT objective on docstring-function pairs."
},
{
"title": "Scaling Laws for Neural Language Models",
- "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"],
+ "authors": ["Kaplan, J.", "McCandlish, S.", "Henighan, T.", "et al."],
"year": 2020,
- "relevance": "Establishes power law scaling of language model performance with model size, validated in this paper for code fine-tuning."
+ "arxiv_id": "2001.08361",
+ "relevance": "Establishes power law scaling relationships that Codex's performance also follows after code fine-tuning."
},
{
"title": "SPoC: Search-based Pseudocode to Code",
- "authors": ["Sumith Kulal", "Panupong Pasupat", "Kartik Chandra"],
+ "authors": ["Kulal, S.", "Pasupat, P.", "Chandra, K.", "et al."],
"year": 2019,
- "relevance": "Early work on functional correctness evaluation and the pass@k metric for code generation."
+ "relevance": "Introduced the pass@k metric for evaluating functional correctness of synthesized code."
},
{
- "title": "Extracting Training Data from Large Language Models",
- "authors": ["Nicholas Carlini", "Florian Tramer", "Eric Wallace"],
- "year": 2021,
- "relevance": "Demonstrates training data extraction from LLMs, directly relevant to security and contamination analysis of code generation models."
- },
- {
- "title": "You Autocomplete Me: Poisoning Vulnerabilities in Neural Code Completion",
- "authors": ["Roei Schuster", "Congzheng Song", "Eran Tromer", "Vitaly Shmatikov"],
+ "title": "Unsupervised Translation of Programming Languages",
+ "authors": ["Lachaux, M.-A.", "Rozière, B.", "Chanussot, L.", "Lample, G."],
"year": 2020,
- "relevance": "Demonstrates poisoning attacks on code autocompleters; directly relevant to security analysis of code generation models."
+ "arxiv_id": "2006.03511",
+ "relevance": "Demonstrated functional correctness as a better evaluation metric than BLEU for code translation."
},
{
- "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages",
- "authors": ["Zhangyin Feng", "Daya Guo", "Duyu Tang"],
- "year": 2020,
- "relevance": "Pre-trained model for code understanding that represents a different approach to code representation learning."
+ "title": "Extracting Training Data from Large Language Models",
+ "authors": ["Carlini, N.", "Tramèr, F.", "Wallace, E.", "et al."],
+ "year": 2021,
+ "relevance": "Demonstrates privacy risks of training data memorization in large language models, applicable to code models trained on public repositories."
},
{
- "title": "Learning to Summarize from Human Feedback",
- "authors": ["Nisan Stiennon", "Long Ouyang", "Jeff Wu"],
+ "title": "You Autocomplete Me: Poisoning Vulnerabilities in Neural Code Completion",
+ "authors": ["Schuster, R.", "Song, C.", "Tromer, E.", "Shmatikov, V."],
"year": 2020,
- "relevance": "RLHF approach discussed as a potential alignment solution for code generation models."
+ "relevance": "Demonstrates data poisoning attacks on code completion models, a supply chain security risk for code generation."
},
{
"title": "In-IDE Code Generation from Natural Language: Promise and Challenges",
- "authors": ["Frank F. Xu", "Bogdan Vasilescu", "Graham Neubig"],
+ "authors": ["Xu, F. F.", "Vasilescu, B.", "Neubig, G."],
"year": 2021,
"arxiv_id": "2101.11149",
- "relevance": "Evaluates in-IDE code generation from natural language, directly relevant to understanding code generation model capabilities and limitations."
+ "relevance": "Evaluates capabilities and challenges of code generation in IDE settings, directly relevant to code generation evaluation."
},
{
- "title": "Alignment of Language Agents",
- "authors": ["Zachary Kenton", "Tom Everitt", "Laura Weidinger"],
+ "title": "Learning Autocompletion from Real-World Datasets",
+ "authors": ["Aye, G. A.", "Kim, S.", "Li, H."],
"year": 2021,
- "relevance": "Theoretical framework for alignment that informs the paper's analysis of Codex misalignment behavior."
+ "relevance": "Reports on Facebook's internal code autocomplete tool, providing industry perspective on code generation deployment."
},
{
- "title": "On the Dangers of Stochastic Parrots: Can Language Models Be Too Big?",
- "authors": ["Emily M. Bender", "Timnit Gebru", "Angelina McMillan-Major", "Shmargaret Shmitchell"],
- "year": 2021,
- "relevance": "Discusses risks of large language models including bias and environmental costs, both relevant to the broader impacts analysis."
+ "title": "The Pile: An 800GB Dataset of Diverse Text for Language Modeling",
+ "authors": ["Gao, L.", "Biderman, S.", "Black, S.", "et al."],
+ "year": 2020,
+ "relevance": "Training dataset for GPT-Neo and GPT-J baselines, containing 8% GitHub code that enables programming capabilities."
}
]
}
diff --git a/papers/react-synergizing-reasoning-2022/scan.json b/papers/react-synergizing-reasoning-2022/scan.json
@@ -0,0 +1,523 @@
+{
+ "paper": {
+ "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
+ "authors": [
+ "Shunyu Yao",
+ "Jeffrey Zhao",
+ "Dian Yu",
+ "Nan Du",
+ "Izhak Shafran",
+ "Karthik Narasimhan",
+ "Yuan Cao"
+ ],
+ "year": 2022,
+ "venue": "ICLR 2023",
+ "arxiv_id": "2210.03629",
+ "doi": "10.48550/arXiv.2210.03629"
+ },
+ "scan_version": 2,
+ "active_modules": ["experimental_rigor", "data_leakage"],
+ "checklist": {
+ "artifacts": {
+ "code_released": {
+ "applies": true,
+ "answer": true,
+ "justification": "Project page with code linked in footnote: 'https://react-lm.github.io/'. Reproducibility Statement also provides 'associated GPT-3 ReAct prompting code at https://anonymous.4open.science/r/ReAct-2268/'."
+ },
+ "data_released": {
+ "applies": true,
+ "answer": true,
+ "justification": "All four benchmarks used (HotpotQA, FEVER, ALFWorld, WebShop) are publicly available standard datasets. The paper does not modify them."
+ },
+ "environment_specified": {
+ "applies": true,
+ "answer": false,
+ "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided. The paper mentions PaLM-540B and GPT-3 (text-davinci-002) but does not specify library versions or dependencies."
+ },
+ "reproduction_instructions": {
+ "applies": true,
+ "answer": false,
+ "justification": "No step-by-step reproduction instructions are included in the paper. Full prompts are provided in Appendix C and code is linked, but there are no specific commands or README-style instructions to replicate experiments."
+ }
+ },
+ "statistical_methodology": {
+ "confidence_intervals_or_error_bars": {
+ "applies": true,
+ "answer": false,
+ "justification": "All results in Tables 1, 3, and 4 are reported as point estimates without confidence intervals or error bars."
+ },
+ "significance_tests": {
+ "applies": true,
+ "answer": false,
+ "justification": "The paper claims 'ReAct outperforms Act consistently' and makes other comparative claims based solely on comparing point estimates without any statistical significance tests."
+ },
+ "effect_sizes_reported": {
+ "applies": true,
+ "answer": true,
+ "justification": "The paper reports absolute improvements with baseline context: 'absolute success rate of 34% and 10% respectively' (Abstract). Tables 1, 3, 4 provide both the proposed method and baseline numbers, enabling effect size calculation (e.g., ReAct 71% vs BUTLER 37% on ALFWorld)."
+ },
+ "sample_size_justified": {
+ "applies": true,
+ "answer": false,
+ "justification": "No justification for sample sizes. Uses full evaluation splits for some benchmarks (134 ALFWorld games, 500 WebShop instructions) and a 500-sample subset for GPT-3 HotpotQA, but no power analysis or justification is given."
+ },
+ "variance_reported": {
+ "applies": true,
+ "answer": false,
+ "justification": "For ALFWorld, 'avg' and 'best of 6' results across prompt permutations are reported in Table 3, but no standard deviation or spread measure is provided. HotpotQA and FEVER results are single-run point estimates with greedy decoding."
+ }
+ },
+ "evaluation_design": {
+ "baselines_included": {
+ "applies": true,
+ "answer": true,
+ "justification": "Multiple baselines are included: Standard prompting, CoT, CoT-SC, Act-only, BUTLER (imitation learning), IL, and IL+RL. Tables 1, 3, and 4 present systematic comparisons."
+ },
+ "baselines_contemporary": {
+ "applies": true,
+ "answer": true,
+ "justification": "Baselines include CoT (Wei et al., 2022) and CoT-SC (Wang et al., 2022a) which were state-of-the-art prompting methods at the time, plus BUTLER (2020) and IL/IL+RL methods. Supervised SOTA numbers are also cited."
+ },
+ "ablation_study": {
+ "applies": true,
+ "answer": true,
+ "justification": "Systematic ablations are performed: ReAct vs Act (removes thoughts), ReAct vs CoT (removes actions/observations), ReAct vs ReAct-IM (different thought styles). Section 3.2 describes how baselines are constructed by 'systematically ablating ReAct trajectories.' Table 3 includes ReAct-IM ablation."
+ },
+ "multiple_metrics": {
+ "applies": true,
+ "answer": true,
+ "justification": "Multiple metrics used across tasks: exact match (HotpotQA), accuracy (FEVER), task-specific success rates broken down by 6 task types (ALFWorld), score and success rate (WebShop, Table 4)."
+ },
+ "human_evaluation": {
+ "applies": true,
+ "answer": true,
+ "justification": "Table 2 presents human analysis: 'we randomly sampled 50 trajectories with correct and incorrect answers from ReAct and CoT respectively (thus 200 examples in total), and manually labeled their success and failure modes.'"
+ },
+ "held_out_test_set": {
+ "applies": true,
+ "answer": true,
+ "justification": "Exemplars are selected from the training set ('randomly select 6 and 3 cases from the training set'). ALFWorld uses '134 unseen evaluation games.' WebShop uses '500 test instructions.' Evaluation is on held-out data separate from prompt exemplars."
+ },
+ "per_category_breakdown": {
+ "applies": true,
+ "answer": true,
+ "justification": "Table 3 provides per-task-type breakdown for ALFWorld across 6 categories (Pick, Clean, Heat, Cool, Look, Pick 2). Table 2 provides per-category breakdown of success and failure modes."
+ },
+ "failure_cases_discussed": {
+ "applies": true,
+ "answer": true,
+ "justification": "Table 2 presents a detailed failure analysis with 4 failure modes; for ReAct these are reasoning error (47%), search result error (23%), hallucination (0%, vs 56% for CoT), and label ambiguity (29%). Appendix E.1 provides concrete examples of each mode."
+ },
+ "negative_results_reported": {
+ "applies": true,
+ "answer": true,
+ "justification": "ReAct underperforms CoT on HotpotQA (27.4 vs 29.4, Table 1). PaLM-8B/62B prompting with ReAct 'performs worst among four methods due to the difficulty to learn both reasoning and acting from in-context examples' (Section 3.3, Figure 3). ReAct's structural constraints reduce flexibility (Table 2 analysis)."
+ }
+ },
+ "claims_and_evidence": {
+ "abstract_claims_supported": {
+ "applies": true,
+ "answer": true,
+ "justification": "Abstract claims match results: ReAct 'overcomes prevalent issues of hallucination' (Table 2: 0% vs 56% hallucination), outperforms on FEVER (Table 1: 60.9 vs 56.3), outperforms IL/RL on ALFWorld and WebShop by '34% and 10%' (Tables 3, 4). Abstract appropriately hedges that ReAct is 'competitive with' CoT on QA."
+ },
+ "causal_claims_justified": {
+ "applies": true,
+ "answer": true,
+ "justification": "The paper's causal claims ('reasoning traces help the model...', 'actions allow it to interface with...') are supported by controlled ablations: ReAct vs Act removes thoughts while keeping everything else constant, ReAct vs CoT removes actions. This single-variable manipulation design is adequate for the causal claims made."
+ },
+ "generalization_bounded": {
+ "applies": true,
+ "answer": false,
+ "justification": "The paper claims 'a general paradigm to combine reasoning and acting with language models for solving diverse language reasoning and decision making tasks' based on 4 benchmarks tested primarily on PaLM-540B. The title 'Synergizing Reasoning and Acting in Language Models' implies generality beyond the tested settings. GPT-3 experiments (Appendix A.1) partially address model generality but only on 2 of 4 tasks."
+ },
+ "alternative_explanations_discussed": {
+ "applies": true,
+ "answer": false,
+ "justification": "Limited discussion of alternative explanations. Footnote 4 suggests repetitive thought loops 'could be due to the sub-optimal greedy decoding procedure.' Section 3.3 discusses trade-offs between factuality and flexibility. But no systematic consideration of confounds or alternative explanations for the observed improvements."
+ },
+ "proxy_outcome_distinction": {
+ "applies": true,
+ "answer": true,
+ "justification": "The paper's claims match the granularity of measurements. It measures exact match on HotpotQA, accuracy on FEVER, success rate on ALFWorld, and score/SR on WebShop. Claims about 'interpretability and trustworthiness' are supported by the concrete human analysis in Table 2 showing lower hallucination rates, not vague proxy assertions."
+ }
+ },
+ "setup_transparency": {
+ "model_versions_specified": {
+ "applies": true,
+ "answer": true,
+ "justification": "PaLM-540B is specified by name and parameter count (Chowdhery et al., 2022). GPT-3 is specified as 'text-davinci-002' (Appendix A.1). PaLM-8B and PaLM-62B are specified for finetuning experiments."
+ },
+ "prompts_provided": {
+ "applies": true,
+ "answer": true,
+ "justification": "Full prompts are provided in Appendix C (Sections C.1-C.4) for all four tasks and all prompt formats (Standard, CoT, Act, ReAct). These are actual prompt texts, not descriptions. WebShop prompt in Table 6, ALFWorld prompts in Tables 7-9."
+ },
+ "hyperparameters_reported": {
+ "applies": true,
+ "answer": true,
+ "justification": "Greedy decoding stated for main experiments. Temperature 0.7 stated for CoT-SC sampling with 21 trajectories. Finetuning details in Appendix B.1: batch size 64, training steps (4000 for ReAct/Act, 2000/1000 for Standard/CoT on 8B/62B)."
+ },
+ "scaffolding_described": {
+ "applies": true,
+ "answer": true,
+ "justification": "The ReAct scaffolding is described in detail: interleaved thought-action-observation format, Wikipedia API with three action types (search[entity], lookup[string], finish[answer]), ALFWorld text action space, WebShop action space (search, click, buy). Section 2 formally defines the augmented action space."
+ },
+ "data_preprocessing_documented": {
+ "applies": true,
+ "answer": true,
+ "justification": "The paper describes the 'question-only setup' for HotpotQA and FEVER (no support paragraphs). Exemplar selection: 'randomly select 6 and 3 cases from the training set.' Finetuning data: '3,000 trajectories with correct answers generated by ReAct.' ALFWorld evaluation setup follows Shridhar et al. (2020b)."
+ }
+ },
+ "limitations_and_scope": {
+ "limitations_section_present": {
+ "applies": true,
+ "answer": false,
+ "justification": "No dedicated limitations section. The conclusion contains one sentence: 'complex tasks with large action spaces require more demonstrations to learn well, which unfortunately can easily go beyond the input length limit of in-context learning.' This is insufficient per the criterion requiring 'substantive discussion.'"
+ },
+ "threats_to_validity_specific": {
+ "applies": true,
+ "answer": false,
+ "justification": "No threats-to-validity discussion. The paper does not discuss specific threats like prompt sensitivity, benchmark representativeness, or generalizability limits beyond the brief conclusion mention."
+ },
+ "scope_boundaries_stated": {
+ "applies": true,
+ "answer": false,
+ "justification": "No explicit statements about what the results do NOT show. The paper does not bound its claims to specific model families, task types, or settings. No equivalent of 'what the evidence does not show.'"
+ }
+ },
+ "data_integrity": {
+ "raw_data_available": {
+ "applies": true,
+ "answer": false,
+ "justification": "No raw data (model output trajectories, intermediate results) is released. Code is linked but generated trajectories used for finetuning or the 200 manually analyzed examples are not made available."
+ },
+ "data_collection_described": {
+ "applies": true,
+ "answer": true,
+ "justification": "Data collection is described: benchmarks are publicly available, exemplars 'randomly selected from the training set,' finetuning data is '3,000 trajectories with correct answers generated by ReAct,' human analysis sampled '50 trajectories with correct and incorrect answers from ReAct and CoT respectively.'"
+ },
+ "recruitment_methods_described": {
+ "applies": false,
+ "answer": false,
+ "justification": "No human participants were recruited. The paper uses standard public benchmarks and author-conducted manual analysis."
+ },
+ "data_pipeline_documented": {
+ "applies": true,
+ "answer": true,
+ "justification": "The pipeline is straightforward and documented: select exemplars from training set → compose prompts → run inference with greedy decoding → extract answers → evaluate against gold labels. For finetuning: generate trajectories → filter for correct answers (3,000) → finetune."
+ }
+ },
+ "conflicts_of_interest": {
+ "funding_disclosed": {
+ "applies": true,
+ "answer": true,
+ "justification": "Acknowledgments section: 'This work was supported in part by the National Science Foundation under Grant No. 2107048.'"
+ },
+ "affiliations_disclosed": {
+ "applies": true,
+ "answer": true,
+ "justification": "Author affiliations clearly listed: Princeton University (Yao, Narasimhan) and Google Research, Brain team (Zhao, Yu, Du, Shafran, Cao). The first author's Google internship is also noted."
+ },
+ "funder_independent_of_outcome": {
+ "applies": true,
+ "answer": false,
+ "justification": "While NSF funding is independent, 5 of 7 authors are from Google Research, and the primary model evaluated (PaLM-540B) is Google's proprietary model. Google has a direct interest in demonstrating the capabilities of its LLMs. This conflict is not explicitly acknowledged."
+ },
+ "financial_interests_declared": {
+ "applies": true,
+ "answer": false,
+ "justification": "No competing interests or financial interests statement is included in the paper."
+ }
+ },
+ "contamination": {
+ "training_cutoff_stated": {
+ "applies": true,
+ "answer": false,
+ "justification": "PaLM-540B's training data cutoff is not stated in the paper. The Chowdhery et al. (2022) reference describes PaLM but the cutoff date is not reproduced here."
+ },
+ "train_test_overlap_discussed": {
+ "applies": true,
+ "answer": false,
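+ "justification": "No discussion of whether PaLM-540B's training data may contain examples from HotpotQA (2018), FEVER (2018), or ALFWorld (2020), all of which were publicly available before PaLM training. WebShop (arXiv 2207.01206, July 2022) postdates the PaLM paper (arXiv 2204.02311, April 2022), so it is less likely to be affected, but this is not discussed either."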
+ },
+ "benchmark_contamination_addressed": {
+ "applies": true,
+ "answer": false,
+ "justification": "HotpotQA and FEVER were published in 2018, well before PaLM's training. The benchmarks and their solutions could be in the training data. No contamination analysis is performed."
+ }
+ },
+ "human_studies": {
+ "pre_registered": {
+ "applies": false,
+ "answer": false,
+ "justification": "No human participants. The paper evaluates LLM prompting methods on automated benchmarks."
+ },
+ "irb_or_ethics_approval": {
+ "applies": false,
+ "answer": false,
+ "justification": "No human participants. An Ethics Statement is included but addresses potential harms of LLM-environment interaction, not human subjects."
+ },
+ "demographics_reported": {
+ "applies": false,
+ "answer": false,
+ "justification": "No human participants in the study."
+ },
+ "inclusion_exclusion_criteria": {
+ "applies": false,
+ "answer": false,
+ "justification": "No human participants in the study."
+ },
+ "randomization_described": {
+ "applies": false,
+ "answer": false,
+ "justification": "No human participants. This is a benchmark evaluation study."
+ },
+ "blinding_described": {
+ "applies": false,
+ "answer": false,
+ "justification": "No human participants. This is a benchmark evaluation study."
+ },
+ "attrition_reported": {
+ "applies": false,
+ "answer": false,
+ "justification": "No human participants in the study."
+ }
+ },
+ "cost_and_practicality": {
+ "inference_cost_reported": {
+ "applies": true,
+ "answer": false,
+ "justification": "No inference cost, latency, or tokens consumed per query is reported. The paper uses PaLM-540B extensively but never quantifies the computational cost of ReAct vs baselines."
+ },
+ "compute_budget_stated": {
+ "applies": true,
+ "answer": false,
+ "justification": "No total computational budget stated. Finetuning uses batch size 64 for 1000-4000 steps but total GPU hours, hardware, or API costs are not reported."
+ }
+ },
+ "experimental_rigor": {
+ "seed_sensitivity_reported": {
+ "applies": true,
+ "answer": false,
+ "justification": "No random seed sensitivity analysis. Main experiments use greedy decoding (deterministic). ALFWorld runs 6 prompt permutations but these test prompt sensitivity, not seed sensitivity."
+ },
+ "number_of_runs_stated": {
+ "applies": true,
+ "answer": true,
+ "justification": "Greedy decoding is stated for main experiments (implying single deterministic run). ALFWorld explicitly uses '6 prompts for each task type through each permutation of 2 annotated trajectories from the 3.' CoT-SC uses '21 CoT trajectories' with temperature 0.7."
+ },
+ "hyperparameter_search_budget": {
+ "applies": true,
+ "answer": false,
+ "justification": "No formal hyperparameter search budget reported. The paper notes 'We find more examples do not improve performance' (footnote 2) suggesting some search, but the budget and configurations tried are not documented."
+ },
+ "best_config_selection_justified": {
+ "applies": true,
+ "answer": true,
+ "justification": "For ALFWorld, both 'avg' and 'best of 6' results are reported transparently in Table 3. For ReAct+CoT-SC combinations, the selection heuristics are explicitly described (Section 3.2). Step limits (7 for HotpotQA, 5 for FEVER) are justified in footnote 3."
+ },
+ "multiple_comparison_correction": {
+ "applies": true,
+ "answer": false,
+ "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied despite numerous comparisons across methods and tasks."
+ },
+ "self_comparison_bias_addressed": {
+ "applies": true,
+ "answer": false,
+ "justification": "The authors implement all baselines themselves (Standard, CoT, Act are constructed by 'systematically ablating ReAct trajectories'). No acknowledgment of self-comparison bias per Lucic et al. (2018)."
+ },
+ "compute_budget_vs_performance": {
+ "applies": true,
+ "answer": false,
+ "justification": "ReAct generates more tokens per query than Act or CoT (thoughts + actions + observations), but computational cost differences are never discussed. CoT-SC requires 21 samples vs ReAct's single pass, but compute is not compared."
+ },
+ "benchmark_construct_validity": {
+ "applies": true,
+ "answer": false,
+ "justification": "No discussion of whether HotpotQA, FEVER, ALFWorld, or WebShop actually measure the claimed capabilities of 'reasoning and acting synergy.' The paper assumes benchmark validity without questioning it."
+ },
+ "scaffold_confound_addressed": {
+ "applies": true,
+ "answer": true,
+ "justification": "The scaffold is the independent variable being studied. All methods (Standard, CoT, Act, ReAct) are evaluated on the same model (PaLM-540B) with the same benchmarks, isolating the effect of the prompting approach. The comparison is deliberately about scaffolding strategies."
+ }
+ },
+ "data_leakage": {
+ "temporal_leakage_addressed": {
+ "applies": true,
+ "answer": false,
+ "justification": "HotpotQA (2018) and FEVER (2018) benchmark problems existed years before PaLM's training. The paper does not discuss whether solutions could be in the training data."
+ },
+ "feature_leakage_addressed": {
+ "applies": true,
+ "answer": false,
+ "justification": "No discussion of whether evaluation setups leak answer information. For example, CoT baselines use internal knowledge that may come from memorized benchmark answers."
+ },
+ "non_independence_addressed": {
+ "applies": true,
+ "answer": false,
+ "justification": "No discussion of whether training exemplars and evaluation examples share structural similarities or come from the same data distribution."
+ },
+ "leakage_detection_method": {
+ "applies": true,
+ "answer": false,
+ "justification": "No concrete leakage detection or prevention method used. No canary strings, membership inference, or decontamination analysis."
+ }
+ }
+ },
+ "claims": [
+ {
+ "claim": "ReAct outperforms Act-only prompting on both HotpotQA (27.4 vs 25.7 EM) and FEVER (60.9 vs 58.9 accuracy), demonstrating the value of reasoning to guide acting.",
+ "evidence": "Table 1, Section 3.3. Consistent across both knowledge-intensive tasks with PaLM-540B.",
+ "supported": "moderate"
+ },
+ {
+ "claim": "ReAct reduces hallucination compared to CoT: 6% vs 14% false positive rate in successes, and 0% vs 56% hallucination as failure mode.",
+ "evidence": "Table 2, Section 3.3. Based on manual analysis of 200 randomly sampled trajectories (50 correct + 50 incorrect from each method).",
+ "supported": "moderate"
+ },
+ {
+ "claim": "ReAct outperforms imitation and reinforcement learning methods on ALFWorld by 34% absolute success rate (71% vs 37% best BUTLER).",
+ "evidence": "Table 3. Best-of-6 ReAct (71%) vs best-of-8 BUTLER (37%). Average ReAct (57%) also exceeds best BUTLER.",
+ "supported": "strong"
+ },
+ {
+ "claim": "ReAct outperforms IL and IL+RL methods on WebShop by 10% absolute success rate (40.0% vs 29.1%/28.7%).",
+ "evidence": "Table 4. One-shot ReAct prompting outperforms methods trained on 1,012-10,587 examples.",
+ "supported": "moderate"
+ },
+ {
+ "claim": "Combining ReAct with CoT-SC achieves the best prompting results, matching the performance of 21-sample CoT-SC while using only 3-5 samples.",
+ "evidence": "Table 1 and Figure 2. ReAct→CoT-SC achieves 35.1 EM on HotpotQA; CoT-SC→ReAct achieves 64.6 on FEVER.",
+ "supported": "strong"
+ },
+ {
+ "claim": "Finetuned ReAct is the best method: PaLM-8B finetuned ReAct outperforms all PaLM-62B prompting methods.",
+ "evidence": "Figure 3, Section 3.3. Finetuning with 3,000 ReAct trajectories shows strong scaling behavior.",
+ "supported": "moderate"
+ },
+ {
+ "claim": "Internal reasoning in ReAct substantially outperforms Inner Monologue-style dense external feedback (71% vs 53% overall success rate on ALFWorld).",
+ "evidence": "Table 3, Section 4. ReAct-IM ablation with consistent advantages on 5 of 6 task types.",
+ "supported": "strong"
+ }
+ ],
+ "methodology_tags": ["benchmark-eval"],
+ "key_findings": "ReAct introduces interleaved reasoning traces and actions for LLM task solving, demonstrating that reasoning helps guide acting (hallucination as a failure mode drops to 0% for ReAct vs 56% for CoT on HotpotQA) while actions ground reasoning in external knowledge (outperforming CoT on FEVER 60.9 vs 56.3). On interactive decision making, few-shot ReAct prompting outperforms trained IL/RL methods by 34% absolute success rate on ALFWorld and 10% on WebShop. The combination of ReAct with CoT self-consistency achieves the best prompting results on knowledge-intensive tasks, and finetuned ReAct at 8B scale outperforms all 62B prompting methods.",
+ "red_flags": [
+ {
+ "flag": "No statistical tests or uncertainty quantification",
+ "detail": "All comparisons across 4 benchmarks rely on point estimates without statistical significance tests, confidence intervals, or error bars. Claims of 'outperforming' are based on raw number comparisons (e.g., 27.4 vs 25.7 on HotpotQA — a 1.7 point difference without any indication of whether this is statistically meaningful)."
+ },
+ {
+ "flag": "Company evaluating its own model",
+ "detail": "Five of seven authors are from Google Research, Brain team, and the primary model evaluated is Google's PaLM-540B. The paper demonstrates PaLM's capabilities without acknowledging this conflict. Results on GPT-3 in Appendix A.1 partially mitigate this."
+ },
+ {
+ "flag": "No contamination analysis",
+ "detail": "PaLM-540B was trained on massive web data, and HotpotQA (2018) and FEVER (2018) have been publicly available for years. The model may have memorized benchmark answers, which would inflate results for Standard and CoT baselines and potentially affect ReAct's knowledge retrieval patterns."
+ },
+ {
+ "flag": "Selective reporting of best-of-K results",
+ "detail": "ALFWorld results prominently feature 'best of 6' trials (71% for ReAct vs 37% for BUTLER 'best of 8'). While averages are also reported for ReAct (57%), the headline claim of 34% improvement uses best-of-K comparisons, which overestimates expected performance."
+ }
+ ],
+ "cited_papers": [
+ {
+ "title": "Chain of thought prompting elicits reasoning in large language models",
+ "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma", "Ed Chi", "Quoc Le", "Denny Zhou"],
+ "year": 2022,
+ "arxiv_id": "2201.11903",
+ "relevance": "Foundational prompting method for LLM reasoning; primary baseline and comparator for ReAct's reasoning component."
+ },
+ {
+ "title": "Self-consistency improves chain of thought reasoning in language models",
+ "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans", "Quoc Le", "Ed Chi", "Sharan Narang", "Aakanksha Chowdhery", "Denny Zhou"],
+ "year": 2022,
+ "arxiv_id": "2203.11171",
+ "relevance": "Sampling-based improvement to CoT; combined with ReAct in the best-performing prompting configurations."
+ },
+ {
+ "title": "WebGPT: Browser-assisted question-answering with human feedback",
+ "authors": ["Reiichiro Nakano", "Jacob Hilton", "Suchir Balaji", "Jeff Wu"],
+ "year": 2021,
+ "arxiv_id": "2112.09332",
+ "relevance": "Prior work on LLM-web interaction for QA without explicit reasoning; uses human feedback and RL rather than prompting."
+ },
+ {
+ "title": "Do as I can, not as I say: Grounding language in robotic affordances",
+ "authors": ["Michael Ahn", "Anthony Brohan", "Noah Brown"],
+ "year": 2022,
+ "arxiv_id": "2204.01691",
+ "relevance": "SayCan: LLM-based robotic planning grounded by affordance models; key prior work on LLMs for decision making."
+ },
+ {
+ "title": "Inner monologue: Embodied reasoning through planning with language models",
+ "authors": ["Wenlong Huang", "Fei Xia", "Ted Xiao"],
+ "year": 2022,
+ "arxiv_id": "2207.05608",
+ "relevance": "Closest prior work to ReAct for closed-loop LLM reasoning in interactive environments; ReAct explicitly compares against IM-style prompting."
+ },
+ {
+ "title": "PaLM: Scaling language modeling with pathways",
+ "authors": ["Aakanksha Chowdhery", "Sharan Narang", "Jacob Devlin"],
+ "year": 2022,
+ "arxiv_id": "2204.02311",
+ "relevance": "Primary model used in ReAct experiments (PaLM-540B); demonstrates scale effects on reasoning and acting capabilities."
+ },
+ {
+ "title": "STaR: Bootstrapping reasoning with reasoning",
+ "authors": ["Eric Zelikman", "Yuhuai Wu", "Jesse Mu", "Noah D. Goodman"],
+ "year": 2022,
+ "arxiv_id": "2203.14465",
+ "relevance": "Self-bootstrapping approach for reasoning; ReAct's finetuning approach draws on this methodology."
+ },
+ {
+ "title": "A generalist agent",
+ "authors": ["Scott Reed", "Konrad Zolna", "Emilio Parisotto"],
+ "year": 2022,
+ "arxiv_id": "2205.06175",
+ "relevance": "Gato: multi-modal generalist agent; represents the trend toward versatile LLM-based agents that ReAct contributes to."
+ },
+ {
+ "title": "Large language models are zero-shot reasoners",
+ "authors": ["Takeshi Kojima", "Shixiang Shane Gu", "Machel Reid", "Yutaka Matsuo", "Yusuke Iwasawa"],
+ "year": 2022,
+ "arxiv_id": "2205.11916",
+ "relevance": "Zero-shot CoT; demonstrates emergent reasoning in LLMs without exemplars, complementary to ReAct's few-shot approach."
+ },
+ {
+ "title": "Language models are few-shot learners",
+ "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
+ "year": 2020,
+ "relevance": "GPT-3 paper; secondary model used in ReAct experiments (text-davinci-002) and foundation for in-context learning paradigm."
+ },
+ {
+ "title": "ALFWorld: Aligning text and embodied environments for interactive learning",
+ "authors": ["Mohit Shridhar", "Xingdi Yuan", "Marc-Alexandre Côté", "Yonatan Bisk", "Adam Trischler", "Matthew Hausknecht"],
+ "year": 2020,
+ "arxiv_id": "2010.03768",
+ "relevance": "Text-based embodied benchmark used in ReAct evaluation; demonstrates LLM decision-making in simulated household environments."
+ },
+ {
+ "title": "WebShop: Towards scalable real-world web interaction with grounded language agents",
+ "authors": ["Shunyu Yao", "Howard Chen", "John Yang", "Karthik Narasimhan"],
+ "year": 2022,
+ "arxiv_id": "2207.01206",
+ "relevance": "Real-world web interaction benchmark used in ReAct evaluation; tests practical applicability of LLM agents in noisy environments."
+ },
+ {
+ "title": "Improving alignment of dialogue agents via targeted human judgements",
+ "authors": ["Amelia Glaese", "Nat McAleese", "Maja Trebacz"],
+ "year": 2022,
+ "relevance": "Sparrow: dialogue agent with API-calling capability trained via human feedback; contrasts with ReAct's cheaper prompting-based approach."
+ },
+ {
+ "title": "Language models as zero-shot planners: Extracting actionable knowledge for embodied agents",
+ "authors": ["Wenlong Huang", "Pieter Abbeel", "Deepak Pathak", "Igor Mordatch"],
+ "year": 2022,
+ "arxiv_id": "2201.07207",
+ "relevance": "LLMs for action planning in embodied environments; prior work on using language priors for decision making without explicit reasoning."
+ }
+ ]
+}
diff --git a/registry.jsonl b/registry.jsonl
@@ -984,7 +984,7 @@
{"id": "sampleefficient-human-evaluation-2024", "title": "Sample-Efficient Human Evaluation of Large Language Models via Maximum Discrepancy Competition", "authors": ["Kehua Feng", "Keyan Ding", "Kede Ma", "Zhihua Wang", "Qiang Zhang"], "year": 2024, "venue": "arXiv.org", "source_url": "https://arxiv.org/abs/2404.08008", "source": "semantic_scholar", "status": "scanned", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: Reliable evaluation of large language models (LLMs) is impeded by two key challenges: objective metrics often fail to reflect human perception of natural language, and exhaustive human labeling is pro", "arxiv_id": "2404.08008", "doi": "10.48550/arXiv.2404.08008", "directory": "papers/sampleefficient-human-evaluation-2024"}
{"id": "large-language-model-2024-3", "title": "Large language model evaluation for high‐performance computing software development", "authors": ["William F. Godoy", "Pedro Valero-Lara", "Keita Teranishi", "Prasanna Balaprakash", "Jeffrey S. Vetter"], "year": 2024, "venue": "Unknown", "source_url": "https://doi.org/10.1002/cpe.8269", "source": "semantic_scholar", "status": "queued", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: We apply AI‐assisted large language model (LLM) capabilities of GPT‐3 targeting high‐performance computing (HPC) kernels for (i) code generation, and (ii) auto‐parallelization of serial code in C ++, ", "doi": "10.1002/cpe.8269"}
{"id": "cloudevalyaml-practical-benchmark-2023", "title": "CloudEval-YAML: A Practical Benchmark for Cloud Configuration Generation", "authors": ["Yifei Xu", "Yuning Chen", "Xumiao Zhang", "Xianshang Lin", "Pan Hu"], "year": 2023, "venue": "Conference on Machine Learning and Systems", "source_url": "https://arxiv.org/abs/2401.06786", "source": "semantic_scholar", "status": "scanned", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: Among the thriving ecosystem of cloud computing and the proliferation of Large Language Model (LLM)-based code generation tools, there is a lack of benchmarking for code generation in cloud-native app", "arxiv_id": "2401.06786", "doi": "10.48550/arXiv.2401.06786", "directory": "papers/cloudevalyaml-practical-benchmark-2023"}
-{"id": "understanding-large-language-2023", "title": "Understanding Large Language Model Based Fuzz Driver Generation", "authors": ["Cen Zhang", "Ming-Xing Bai", "Yaowen Zheng", "Yeting Li", "Xiaofei Xie"], "year": 2023, "venue": "Unknown", "source_url": "https://doi.org/10.48550/arXiv.2307.12469", "source": "semantic_scholar", "status": "downloaded", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar.", "doi": "10.48550/arXiv.2307.12469"}
+{"id": "understanding-large-language-2023", "title": "Understanding Large Language Model Based Fuzz Driver Generation", "authors": ["Cen Zhang", "Ming-Xing Bai", "Yaowen Zheng", "Yeting Li", "Xiaofei Xie"], "year": 2023, "venue": "Unknown", "source_url": "https://doi.org/10.48550/arXiv.2307.12469", "source": "semantic_scholar", "status": "scanned", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar.", "doi": "10.48550/arXiv.2307.12469"}
{"id": "glmdialog-noisetolerant-pretraining-2023", "title": "GLM-Dialog: Noise-tolerant Pre-training for Knowledge-grounded Dialogue Generation", "authors": ["Jing Zhang", "Xiaokang Zhang", "Daniel Zhang-Li", "Jifan Yu", "Zijun Yao"], "year": 2023, "venue": "Knowledge Discovery and Data Mining", "source_url": "https://arxiv.org/abs/2302.14401", "source": "semantic_scholar", "status": "scanned", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: We present GLM-Dialog, a large-scale language model (LLM) with 10B parameters capable of knowledge-grounded conversation in Chinese using a search engine to access the Internet knowledge. GLM-Dialog o", "arxiv_id": "2302.14401", "doi": "10.1145/3580305.3599832", "directory": "papers/glmdialog-noisetolerant-pretraining-2023"}
{"id": "llms-prescient-continuous-2024", "title": "Are LLMs Prescient? A Continuous Evaluation using Daily News as the Oracle", "authors": ["Hui Dai", "R. Teehan", "Mengye Ren"], "year": 2024, "venue": "International Conference on Machine Learning", "source_url": "https://arxiv.org/abs/2411.08324", "source": "semantic_scholar", "status": "scanned", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: Many existing evaluation benchmarks for Large Language Models (LLMs) quickly become outdated due to the emergence of new models and training data. These benchmarks also fall short in assessing how LLM", "arxiv_id": "2411.08324", "doi": "10.48550/arXiv.2411.08324", "directory": "papers/llms-prescient-continuous-2024"}
{"id": "hazard-analysis-framework-2022", "title": "A Hazard Analysis Framework for Code Synthesis Large Language Models", "authors": ["Heidy Khlaaf", "Pamela Mishkin", "Josh Achiam", "Gretchen Krueger", "Miles Brundage"], "year": 2022, "venue": "arXiv.org", "source_url": "https://arxiv.org/abs/2207.14157", "source": "semantic_scholar", "status": "scanned", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: Codex, a large language model (LLM) trained on a variety of codebases, exceeds the previous state of the art in its capacity to synthesize and generate code. Although Codex provides a plethora of bene", "arxiv_id": "2207.14157", "doi": "10.48550/arXiv.2207.14157", "directory": "papers/hazard-analysis-framework-2022"}
@@ -1007,7 +1007,7 @@
{"id": "transforming-wearable-data-2024", "title": "Transforming wearable data into personal health insights using large language model agents", "authors": ["Mike A. Merrill", "Akshay Paruchuri", "Naghmeh Rezaei", "Geza Kovacs", "Javier Perez Matos"], "year": 2024, "venue": "Nature Communications", "source_url": "https://arxiv.org/abs/2406.06464", "source": "semantic_scholar", "status": "scanned", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: Deriving personalized insights from popular wearable trackers requires complex numerical reasoning that challenges standard LLMs, necessitating tool-based approaches like code generation. Large langua", "arxiv_id": "2406.06464", "doi": "10.1038/s41467-025-67922-y", "directory": "papers/transforming-wearable-data-2024"}
{"id": "hdleval-benchmarking-llms-2024", "title": "HDLEval Benchmarking LLMs for multiple HDLs", "authors": ["Farzaneh Rabiei Kashanaki", "Mark Zakharov", "Jose Renau"], "year": 2024, "venue": "Unknown", "source_url": "https://doi.org/10.1109/LAD62341.2024.10691770", "source": "semantic_scholar", "status": "queued", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: Large Language Models (LLMs) are transforming code generation and documentation processes across programming languages, including hardware description languages (HDLs). However, existing benchmarks pr", "doi": "10.1109/LAD62341.2024.10691770"}
{"id": "comparison-large-language-2024", "title": "Comparison of Large Language Models in Generating Machine Learning Curricula in High Schools", "authors": ["Gjorgji Noveski", "Mathis Jeroncic", "Thomas Velard", "Primož Kocuvan", "M. Gams"], "year": 2024, "venue": "Unknown", "source_url": "https://doi.org/10.3390/electronics13204109", "source": "semantic_scholar", "status": "queued", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: With the rapid advancement of artificial intelligence technologies, the integration of AI concepts into educational curricula represents an increasingly important issue. This paper presents a comparat", "doi": "10.3390/electronics13204109"}
-{"id": "hardware-security-benchmarking-2024", "title": "Toward Hardware Security Benchmarking of LLMs", "authors": ["Raheel Afsharmazayejani", "Mohammad Moradi Shahmiri", "Parker Link", "H. Pearce", "Benjamin Tan"], "year": 2024, "venue": "Unknown", "source_url": "https://doi.org/10.1109/LAD62341.2024.10691745", "source": "semantic_scholar", "status": "downloaded", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: With the rapid advancement and proliferation of large language models (LLMs), there is a pressing need to explore and, crucially, evaluate their utility. Recently, LLMs have shown promise in digital d", "doi": "10.1109/LAD62341.2024.10691745"}
+{"id": "hardware-security-benchmarking-2024", "title": "Toward Hardware Security Benchmarking of LLMs", "authors": ["Raheel Afsharmazayejani", "Mohammad Moradi Shahmiri", "Parker Link", "H. Pearce", "Benjamin Tan"], "year": 2024, "venue": "Unknown", "source_url": "https://doi.org/10.1109/LAD62341.2024.10691745", "source": "semantic_scholar", "status": "scanned", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: With the rapid advancement and proliferation of large language models (LLMs), there is a pressing need to explore and, crucially, evaluate their utility. Recently, LLMs have shown promise in digital d", "doi": "10.1109/LAD62341.2024.10691745"}
{"id": "evaluating-diverse-large-2023", "title": "Evaluating Diverse Large Language Models for Automatic and General Bug Reproduction", "authors": ["Sungmin Kang", "Juyeon Yoon", "Nargiz Askarbekkyzy", "Shin Yoo"], "year": 2023, "venue": "IEEE Transactions on Software Engineering", "source_url": "https://arxiv.org/abs/2311.04532", "source": "semantic_scholar", "status": "scanned", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: Bug reproduction is a critical developer activity that is also challenging to automate, as bug reports are often in natural language and thus can be difficult to transform to test cases consistently. ", "arxiv_id": "2311.04532", "doi": "10.1109/TSE.2024.3450837", "directory": "papers/evaluating-diverse-large-2023"}
{"id": "training-llms-generating-2024", "title": "Training LLMs for Generating IEC 61131-3 Structured Text with Online Feedback", "authors": ["Aaron Haag", "Bertram Fuchs", "Altay Kacan", "Oliver Lohse"], "year": 2024, "venue": "2025 IEEE/ACM International Workshop on Large Language Models for Code (LLM4Code)", "source_url": "https://arxiv.org/abs/2410.22159", "source": "semantic_scholar", "status": "scanned", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: IEC 61131-3 Structured Text (ST) is a widely used programming language for programmable logic controllers (PLCs) in automation systems. However, generating ST code with LLMs poses unique challenges du", "arxiv_id": "2410.22159", "doi": "10.1109/LLM4Code66737.2025.00013", "directory": "papers/training-llms-generating-2024"}
{"id": "leveraging-large-language-2023", "title": "Leveraging large language models for data analysis automation", "authors": ["Jacqueline A Jansen", "A. Manukyan", "Nour Al Khoury", "A. Akalin"], "year": 2023, "venue": "Unknown", "source_url": "https://doi.org/10.1101/2023.12.11.571140", "source": "semantic_scholar", "status": "scanned", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: Data analysis is constrained by a shortage of skilled experts, particularly in biology, where detailed data interpretation is vital for understanding complex biological processes and developing new tr", "doi": "10.1101/2023.12.11.571140"}
@@ -1039,7 +1039,7 @@
{"id": "learn-code-sustainably-2024", "title": "Learn to Code Sustainably: An Empirical Study on LLM-based Green Code Generation", "authors": ["Tina Vartziotis", "Ippolyti Dellatolas", "George Dasoulas", "Maximilian Schmidt", "Florian Schneider"], "year": 2024, "venue": "arXiv.org", "source_url": "https://arxiv.org/abs/2403.03344", "source": "semantic_scholar", "status": "scanned", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: The increasing use of information technology has led to a significant share of energy consumption and carbon emissions from data centers. These contributions are expected to rise with the growing dema", "arxiv_id": "2403.03344", "doi": "10.48550/arXiv.2403.03344", "directory": "papers/learn-code-sustainably-2024"}
{"id": "defense-against-prompt-2024", "title": "Defense Against Prompt Injection Attack by Leveraging Attack Techniques", "authors": ["Yulin Chen", "Haoran Li", "Zihao Zheng", "Yangqiu Song", "Dekai Wu"], "year": 2024, "venue": "Annual Meeting of the Association for Computational Linguistics", "source_url": "https://arxiv.org/abs/2411.00459", "source": "semantic_scholar", "status": "scanned", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: With the advancement of technology, large language models (LLMs) have achieved remarkable performance across various natural language processing (NLP) tasks, powering LLM-integrated applications like ", "arxiv_id": "2411.00459", "doi": "10.48550/arXiv.2411.00459", "directory": "papers/defense-against-prompt-2024"}
{"id": "protect-llm-agent-2025", "title": "To Protect the LLM Agent Against the Prompt Injection Attack with Polymorphic Prompt", "authors": ["Zhilong Wang", "Neha Nagaraja", "Lan Zhang", "Hayretdin Bahşi", "Pawan Patil"], "year": 2025, "venue": "2025 55th Annual IEEE/IFIP International Conference on Dependable Systems and Networks - Supplemental Volume (DSN-S)", "source_url": "https://arxiv.org/abs/2506.05739", "source": "semantic_scholar", "status": "scanned", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: LLM agents are widely used as agents for customer support, content generation, and code assistance. However, they are vulnerable to prompt injection attacks, where adversarial inputs manipulate the mo", "arxiv_id": "2506.05739", "doi": "10.1109/DSN-S65789.2025.00037", "directory": "papers/protect-llm-agent-2025"}
-{"id": "prompt-injection-attacks-2025", "title": "Prompt Injection Attacks on Large Language Models: A Survey of Attack Methods, Root Causes, and Defense Strategies", "authors": ["Tongcheng Geng", "Zhiyuan Xu", "Yubin Qu", "W. E. Wong"], "year": 2025, "venue": "Unknown", "source_url": "https://doi.org/10.32604/cmc.2025.074081", "source": "semantic_scholar", "status": "downloaded", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar.", "doi": "10.32604/cmc.2025.074081"}
+{"id": "prompt-injection-attacks-2025", "title": "Prompt Injection Attacks on Large Language Models: A Survey of Attack Methods, Root Causes, and Defense Strategies", "authors": ["Tongcheng Geng", "Zhiyuan Xu", "Yubin Qu", "W. E. Wong"], "year": 2025, "venue": "Unknown", "source_url": "https://doi.org/10.32604/cmc.2025.074081", "source": "semantic_scholar", "status": "scanned", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar.", "doi": "10.32604/cmc.2025.074081"}
{"id": "prompt-injection-attacks-2026-2", "title": "Prompt Injection Attacks in Large Language Models and AI Agent Systems: A Comprehensive Review of Vulnerabilities, Attack Vectors, and Defense Mechanisms", "authors": ["Saidakhror Gulyamov", "Saidakhror Gulyamov", "A. Rodionov", "Rustam Khursanov", "Kambariddin Mekhmonov"], "year": 2026, "venue": "Unknown", "source_url": "https://doi.org/10.3390/info17010054", "source": "semantic_scholar", "status": "queued", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: Large language models (LLMs) have rapidly transformed artificial intelligence applications across industries, yet their integration into production systems has unveiled critical security vulnerabiliti", "doi": "10.3390/info17010054"}
{"id": "prompt-injection-attacks-2026-2-2", "title": "PROMPT INJECTION ATTACKS IN LARGE LANGUAGE MODELS VIA A COMPREHENSIVE ANALYSIS OF ATTACK VECTORS, DEFENSE MECHANISMS, AND FUTURE DIRECTIONS", "authors": ["Unknown"], "year": 2026, "venue": "Unknown", "source_url": "https://doi.org/10.30546/2225-0530.14.2.2025.2013", "source": "semantic_scholar", "status": "scanned", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar.", "doi": "10.30546/2225-0530.14.2.2025.2013", "directory": "papers/prompt-injection-attacks-2026-2-2"}
{"id": "melon-provable-defense-2025", "title": "MELON: Provable Defense Against Indirect Prompt Injection Attacks in AI Agents", "authors": ["Kaijie Zhu", "Xianjun Yang", "Jindong Wang", "Wenbo Guo", "William Yang Wang"], "year": 2025, "venue": "International Conference on Machine Learning", "source_url": "https://arxiv.org/abs/2502.05174", "source": "semantic_scholar", "status": "downloaded", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: Recent research has explored that LLM agents are vulnerable to indirect prompt injection (IPI) attacks, where malicious tasks embedded in tool-retrieved information can redirect the agent to take unau", "arxiv_id": "2502.05174", "directory": "papers/melon-provable-defense-2025"}
@@ -1077,7 +1077,7 @@
{"id": "cyberphysical-system-defense-2025", "title": "Cyber-Physical System Defense Against Structured False Data Injection Attacks Using an Adaptive Security Framework with Passivity Enhancement", "authors": ["R. Gopi", "Francis Shamili"], "year": 2025, "venue": "Unknown", "source_url": "https://doi.org/10.52783/jisem.v10i43s.8360", "source": "semantic_scholar", "status": "scanned", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: System integrity, operation, and significant breakdowns can be compromised by coordinated False Data Injection Attacks (FDIAs), which are increasingly prevalent in Cyber-Physical Systems (CPS). Becaus", "doi": "10.52783/jisem.v10i43s.8360", "directory": "papers/cyberphysical-system-defense-2025"}
{"id": "zeroshot-embedding-drift-2026", "title": "Zero-Shot Embedding Drift Detection: A Lightweight Defense Against Prompt Injections in LLMs", "authors": ["A. Sekar", "Mrinal Agarwal", "Rachel Sharma", "Akitsugu Tanaka", "Jasmine Zhang"], "year": 2026, "venue": "arXiv.org", "source_url": "https://arxiv.org/abs/2601.12359", "source": "semantic_scholar", "status": "downloaded", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: Prompt injection attacks have become an increasing vulnerability for LLM applications, where adversarial prompts exploit indirect input channels such as emails or user-generated content to circumvent ", "arxiv_id": "2601.12359", "doi": "10.48550/arXiv.2601.12359", "directory": "papers/zeroshot-embedding-drift-2026"}
{"id": "exploring-clean-label-2024", "title": "Exploring Clean Label Backdoor Attacks and Defense in Language Models", "authors": ["Shuai Zhao", "Anh Tuan Luu", "Jie Fu", "Jinming Wen", "Weiqi Luo"], "year": 2024, "venue": "Unknown", "source_url": "https://doi.org/10.1109/TASLP.2024.3407571", "source": "semantic_scholar", "status": "queued", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: Despite being widely applied, pre-trained language models have been proven vulnerable to backdoor attacks. Backdoor attacks are designed to introduce targeted vulnerabilities into models by poisoning ", "doi": "10.1109/TASLP.2024.3407571"}
-{"id": "indirect-prompt-injections-2025", "title": "Indirect Prompt Injections: Are Firewalls All You Need, or Stronger Benchmarks?", "authors": ["Rishika Bhagwatkar", "Kevin Kasa", "Abhay Puri", "Gabriel Huang", "Irina Rish"], "year": 2025, "venue": "arXiv.org", "source_url": "https://arxiv.org/abs/2510.05244", "source": "semantic_scholar", "status": "downloaded", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: AI agents are vulnerable to indirect prompt injection attacks, where malicious instructions embedded in external content or tool outputs cause unintended or harmful behavior. Inspired by the well-esta", "arxiv_id": "2510.05244", "doi": "10.48550/arXiv.2510.05244", "directory": "papers/indirect-prompt-injections-2025"}
+{"id": "indirect-prompt-injections-2025", "title": "Indirect Prompt Injections: Are Firewalls All You Need, or Stronger Benchmarks?", "authors": ["Rishika Bhagwatkar", "Kevin Kasa", "Abhay Puri", "Gabriel Huang", "Irina Rish"], "year": 2025, "venue": "arXiv.org", "source_url": "https://arxiv.org/abs/2510.05244", "source": "semantic_scholar", "status": "scanned", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: AI agents are vulnerable to indirect prompt injection attacks, where malicious instructions embedded in external content or tool outputs cause unintended or harmful behavior. Inspired by the well-esta", "arxiv_id": "2510.05244", "doi": "10.48550/arXiv.2510.05244", "directory": "papers/indirect-prompt-injections-2025"}
{"id": "aegis-automated-coevolutionary-2025", "title": "AEGIS : Automated Co-Evolutionary Framework for Guarding Prompt Injections Schema", "authors": ["Ting-Chun Liu", "C. Hsu", "Kuan-Yi Lee", "C. Fu", "Hung-yi Lee"], "year": 2025, "venue": "arXiv.org", "source_url": "https://arxiv.org/abs/2509.00088", "source": "semantic_scholar", "status": "scanned", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: Prompt injection attacks pose a significant challenge to the safe deployment of Large Language Models (LLMs) in real-world applications. While prompt-based detection offers a lightweight and interpret", "arxiv_id": "2509.00088", "doi": "10.48550/arXiv.2509.00088", "directory": "papers/aegis-automated-coevolutionary-2025"}
{"id": "hacking-llms-technical-2025", "title": "Hacking LLMs: A Technical Analysis of Security Vulnerabilities and Defense Mechanisms", "authors": ["G. Raj", "Hamzah", "Nikhil Raj", "Nikhil Ranjan"], "year": 2025, "venue": "Unknown", "source_url": "https://doi.org/10.1109/CICTN64563.2025.10932638", "source": "semantic_scholar", "status": "queued", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: Large Language Models (LLMs) such as GPT-4 and Google’s Gemini have revolutionized the landscape of artificial intelligence, enabling sophisticated natural language processing capabilities across dive", "doi": "10.1109/CICTN64563.2025.10932638"}
{"id": "safeguarding-visionlanguage-models-2024", "title": "Safeguarding Vision-Language Models Against Patched Visual Prompt Injectors", "authors": ["Jiachen Sun", "Changsheng Wang", "Jiong Wang", "Yiwei Zhang", "Chaowei Xiao"], "year": 2024, "venue": "arXiv.org", "source_url": "https://arxiv.org/abs/2405.10529", "source": "semantic_scholar", "status": "downloaded", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: Large language models have become increasingly prominent, also signaling a shift towards multimodality as the next frontier in artificial intelligence, where their embeddings are harnessed as prompts ", "arxiv_id": "2405.10529", "doi": "10.48550/arXiv.2405.10529", "directory": "papers/safeguarding-visionlanguage-models-2024"}
@@ -2648,7 +2648,7 @@
{"id": "attention-is-all-you-need-2017", "title": "Attention Is All You Need", "authors": ["Ashish Vaswani", "Noam Shazeer", "Niki Parmar", "Jakob Uszkoreit", "Llion Jones", "Aidan N. Gomez", "Lukasz Kaiser", "Illia Polosukhin"], "year": 2017, "venue": "NeurIPS 2017", "source": "manual", "status": "scanned", "tags": ["landmark"], "added": "2026-03-05", "notes": "Foundational transformer architecture paper. 100k+ citations. Basis for all LLMs.", "arxiv_id": "1706.03762", "doi": "10.48550/arXiv.1706.03762", "directory": "papers/attention-is-all-you-need-2017"}
{"id": "bert-pretraining-deep-2018", "title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding", "authors": ["Jacob Devlin", "Ming-Wei Chang", "Kenton Lee", "Kristina Toutanova"], "year": 2018, "venue": "NAACL 2019", "source": "manual", "status": "scanned", "tags": ["landmark"], "added": "2026-03-05", "notes": "Foundational pre-training paper. Basis for CodeBERT and downstream code models.", "arxiv_id": "1810.04805", "doi": "10.48550/arXiv.1810.04805", "directory": "papers/bert-pretraining-deep-2018"}
{"id": "sparks-agi-early-2023", "title": "Sparks of Artificial General Intelligence: Early Experiments with GPT-4", "authors": ["Sébastien Bubeck", "Varun Chandrasekaran", "Ronen Eldan", "Johannes Gehrke", "Eric Horvitz", "Ece Kamar", "Peter Lee", "Yin Tat Lee", "Yuanzhi Li", "Scott Lundberg"], "year": 2023, "venue": "arXiv preprint", "source": "manual", "status": "downloaded", "tags": ["landmark"], "added": "2026-03-05", "notes": "Most-discussed GPT-4 capabilities analysis. Massive tech media coverage.", "arxiv_id": "2303.12528", "doi": "10.48550/arXiv.2303.12528", "directory": "papers/sparks-agi-early-2023"}
-{"id": "react-synergizing-reasoning-2022", "title": "ReAct: Synergizing Reasoning and Acting in Language Models", "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik Narasimhan", "Yuan Cao"], "year": 2022, "venue": "ICLR 2023", "source": "manual", "status": "downloaded", "tags": ["landmark"], "added": "2026-03-05", "notes": "Foundation of agent reasoning+acting paradigm. Cited by virtually every agentic AI paper.", "arxiv_id": "2210.03629", "doi": "10.48550/arXiv.2210.03629", "directory": "papers/react-synergizing-reasoning-2022"}
+{"id": "react-synergizing-reasoning-2022", "title": "ReAct: Synergizing Reasoning and Acting in Language Models", "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik Narasimhan", "Yuan Cao"], "year": 2022, "venue": "ICLR 2023", "source": "manual", "status": "scanned", "tags": ["landmark"], "added": "2026-03-05", "notes": "Foundation of agent reasoning+acting paradigm. Cited by virtually every agentic AI paper.", "arxiv_id": "2210.03629", "doi": "10.48550/arXiv.2210.03629", "directory": "papers/react-synergizing-reasoning-2022"}
{"id": "toolformer-language-models-2023", "title": "Toolformer: Language Models Can Teach Themselves to Use Tools", "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessì", "Roberta Raileanu", "Maria Lomeli", "Luke Zettlemoyer", "Nicola Cancedda", "Thomas Scialom"], "year": 2023, "venue": "NeurIPS 2023", "source": "manual", "status": "downloaded", "tags": ["landmark"], "added": "2026-03-05", "notes": "Tool-use paradigm that underpins agentic AI. Major Hacker News coverage.", "arxiv_id": "2302.04761", "doi": "10.48550/arXiv.2302.04761", "directory": "papers/toolformer-language-models-2023"}
{"id": "alphacode-competition-level-2022", "title": "Competition-Level Code Generation with AlphaCode", "authors": ["Yujia Li", "David Choi", "Junyoung Chung", "Nate Kushman", "Julian Schrittwieser", "Rémi Leblond", "Tom Eccles", "James Keeling", "Felix Gimeno", "Agustin Dal Lago"], "year": 2022, "venue": "Science", "source": "manual", "status": "scanned", "tags": ["landmark"], "added": "2026-03-05", "notes": "DeepMind competition-level code generation. Published in Science. Major tech media coverage.", "arxiv_id": "2203.07814", "doi": "10.1126/science.abq1158", "directory": "papers/alphacode-competition-level-2022"}
{"id": "codebert-pretrained-model-2020", "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages", "authors": ["Zhangyin Feng", "Daya Guo", "Duyu Tang", "Nan Duan", "Xiaocheng Feng", "Ming Gong", "Linjun Shou", "Bing Qin", "Ting Liu", "Daxin Jiang", "Ming Zhou"], "year": 2020, "venue": "EMNLP 2020 Findings", "source": "manual", "status": "scanned", "tags": ["landmark"], "added": "2026-03-05", "notes": "Foundational code language model. 5000+ citations. Basis for many downstream code models.", "arxiv_id": "2002.08155", "doi": "10.48550/arXiv.2002.08155", "directory": "papers/codebert-pretrained-model-2020"}
diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py
@@ -730,33 +730,126 @@ def build():
"pca": pca_result,
}
- # --- Citation network ---
+ # --- Citation network (built from cited_papers in scan.json) ---
v2_ids = {p["id"] for p in papers_full}
year_map = {p["id"]: p["year"] for p in papers_full}
+ title_map = {p["id"]: p["title"] for p in papers_full}
- all_graph_ids = {n["id"] for n in citation_data.get("nodes", [])}
- net_edges = []
- in_degree = Counter()
- for edge in citation_data.get("edges", []):
- s, t = edge["source"], edge["target"]
- if s in all_graph_ids and t in all_graph_ids:
- net_edges.append([s, t])
- in_degree[t] += 1
+ # Build title→id index from registry (case-insensitive)
+ title_to_id = {}
+ for entry in registry.values():
+ t = entry.get("title", "").lower().strip()
+ if t:
+ title_to_id[t] = entry["id"]
+ for p in papers_full:
+ t = p["title"].lower().strip()
+ if t:
+ title_to_id[t] = p["id"]
+
+ # Extract directed edges from cited_papers
+ net_edge_set = set()
+ for p in papers_full:
+ src = p["id"]
+ detail = paper_details.get(src, {})
+ for item in detail.get("checklist", []):
+ pass # checklist doesn't have cited_papers
+ # Read cited_papers from the original scan.json
+ scan_path = PAPERS_DIR / src / "scan.json"
+ if scan_path.exists():
+ with open(scan_path) as f:
+ scan_data = json.load(f)
+ for cited in scan_data.get("cited_papers", []):
+ ct = cited.get("title", "").lower().strip()
+ target = title_to_id.get(ct)
+ if target and target != src:
+ net_edge_set.add((src, target))
+
+ net_edges = [[s, t] for s, t in net_edge_set]
+ in_degree = Counter(t for _, t in net_edge_set)
+ out_degree = Counter(s for s, _ in net_edge_set)
+
+ # Collect all node IDs that appear in edges
+ all_net_ids = set()
+ for s, t in net_edge_set:
+ all_net_ids.add(s)
+ all_net_ids.add(t)
net_nodes = []
- for node in citation_data.get("nodes", []):
- nid = node["id"]
+ for nid in sorted(all_net_ids):
+ reg = registry.get(nid, {})
net_nodes.append({
"id": nid,
- "title": node.get("title", nid),
+ "title": title_map.get(nid, reg.get("title", nid)),
"score": score_map.get(nid),
- "year": year_map.get(nid, registry.get(nid, {}).get("year")),
+ "year": year_map.get(nid, reg.get("year")),
"in_degree": in_degree.get(nid, 0),
+ "out_degree": out_degree.get(nid, 0),
"has_scan": nid in v2_ids,
})
network = {"nodes": net_nodes, "edges": net_edges}
+ # --- Network findings ---
+ # Foundational leaderboard: top 20 most-cited with scores
+ foundational = []
+ for nid, deg in in_degree.most_common(20):
+ foundational.append({
+ "id": nid,
+ "title": title_map.get(nid, registry.get(nid, {}).get("title", nid)),
+ "in_degree": deg,
+ "score": score_map.get(nid),
+ })
+
+ # Quality contagion: mean score by % of high-quality citations
+ contagion_threshold = 50
+ high_q = {pid for pid, sc in score_map.items() if sc >= contagion_threshold}
+ contagion_bands = {"0%": [], "1-33%": [], "34-66%": [], "67-100%": []}
+ out_map = defaultdict(list)
+ for s, t in net_edge_set:
+ out_map[s].append(t)
+ for pid in score_map:
+ cited = [t for t in out_map.get(pid, []) if t in score_map]
+ if len(cited) < 2:
+ continue
+ pct = sum(1 for t in cited if t in high_q) / len(cited) * 100
+ if pct == 0:
+ band = "0%"
+ elif pct <= 33:
+ band = "1-33%"
+ elif pct <= 66:
+ band = "34-66%"
+ else:
+ band = "67-100%"
+ contagion_bands[band].append(score_map[pid])
+
+ quality_contagion = {}
+ for band_name in ["0%", "1-33%", "34-66%", "67-100%"]:
+ ss = contagion_bands[band_name]
+ if ss:
+ quality_contagion[band_name] = {"n": len(ss), "mean": safe_mean(ss)}
+
+ # Rigor diffusion: for top cited papers, mean score of their citers
+ in_map = defaultdict(list)
+ for s, t in net_edge_set:
+ in_map[t].append(s)
+ rigor_diffusion = []
+ for nid, deg in in_degree.most_common(15):
+ citers = [s for s in in_map[nid] if s in score_map]
+ rigor_diffusion.append({
+ "id": nid,
+ "title": title_map.get(nid, registry.get(nid, {}).get("title", nid)),
+ "score": score_map.get(nid),
+ "in_degree": deg,
+ "citer_mean": safe_mean([score_map[c] for c in citers]) if citers else None,
+ "citer_n": len(citers),
+ })
+
+ findings["network_insights"] = {
+ "foundational": foundational,
+ "quality_contagion": quality_contagion,
+ "rigor_diffusion": rigor_diffusion,
+ }
+
# --- Write files ---
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
papers_detail_dir = OUTPUT_DIR / "papers"