Expand HN analysis: scatter, tag paradox, repost/controversy signals - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

commit f41b10dd2346179281c7a9ef5e809bb5cb87c2ec
parent 7072c581ce666fdd9dc061902b0e8823888d9732
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Tue, 24 Mar 2026 06:19:41 +0100

Expand HN analysis: scatter, tag paradox, repost/controversy signals

Social Attention section now includes:
- HN points vs methodology scatter (586 papers, log-scale x-axis)
  with "the blob has no slope" annotation
- Case study paradox: tag comparison showing HN attention vs methodology
  side-by-side (case studies: most HN love, worst methodology)
- Repost signal: papers with 8+ reposts average a 50.6% methodology score vs 48.0% for single posts
- Controversy signal: high-discussion papers average a 50.7% methodology score vs 48.9% for low-discussion papers
- Updated heatmap annotation text for n=932 correlation values

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M explorer/src/views/findings.ts  | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
M scripts/build-explorer-data.py  | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++

2 files changed, 170 insertions(+), 11 deletions(-)
diff --git a/explorer/src/views/findings.ts b/explorer/src/views/findings.ts
@@ -150,9 +150,10 @@ function renderCorrelationHeatmap(f: Findings): string {
     </div>
     <div style="font-size:0.82rem;color:var(--text-dim);margin-top:0.75rem">
       <strong>Key patterns:</strong>
-      Contamination \u2194 data leakage (r=0.87) are effectively the same decision.
-      Artifacts \u2194 statistical methodology (r=0.05) are completely independent — releasing code says nothing about statistical rigor.
-      Human studies \u2194 artifacts (r=\u22120.24) is the strongest negative — two research traditions that don't speak to each other.
+      Contamination \u2194 data leakage (r=0.86) are effectively the same decision.
+      Artifacts \u2194 statistical methodology (r=0.06) are completely independent — releasing code says nothing about statistical rigor.
+      Human studies \u2194 artifacts (r=\u22120.20) is the strongest negative — two research traditions that don't speak to each other.
+      Three independent rigor clusters: transparency/artifacts, statistics/experimental, contamination awareness.
     </div>
   </div>`;
 }
@@ -712,31 +713,85 @@ function renderHnAnalysis(f: Findings): string {
   const topHn = hn.top_hn as { id: string; title: string; score: number; hn_points: number }[];
   const gems = hn.hidden_gems as { id: string; title: string; score: number; hn_points: number }[];
   const overhyped = hn.overhyped as { id: string; title: string; score: number; hn_points: number }[];
+  const scatter = hn.scatter as { id: string; hn: number; score: number; log_hn: number }[];
+  const tagComp = hn.tag_comparison as Record<string, { n: number; mean_hn: number; mean_score: number }>;
+  const repost = hn.repost_signal as Record<string, { n: number; mean_score: number; mean_hn: number }>;
+  const controversy = hn.controversy as { high_discussion_mean: number; low_discussion_mean: number; high_n: number; low_n: number } | undefined;
+
+  // Scatter: HN points (log) vs methodology score
+  const scatterSvg = renderHnScatter(scatter);
+
+  // Tag comparison bars
+  const tagBars = Object.entries(tagComp)
+    .sort((a, b) => b[1].mean_hn - a[1].mean_hn)
+    .map(([tag, d]) => {
+      const hnColor = 'var(--accent)';
+      const scColor = d.mean_score < 42 ? 'var(--red)' : d.mean_score < 52 ? 'var(--yellow)' : 'var(--green)';
+      return `<div style="margin-bottom:0.6rem">
+        <div class="hbar-label"><span>${formatName(tag)} <span style="color:var(--text-dim)">(n=${d.n})</span></span></div>
+        <div style="display:flex;gap:4px">
+          <div class="hbar-track" style="flex:1"><div class="hbar-fill" style="width:${Math.min(d.mean_hn / 120 * 100, 100)}%;background:${hnColor}"></div></div>
+          <div class="hbar-track" style="flex:1"><div class="hbar-fill" style="width:${d.mean_score}%;background:${scColor}"></div></div>
+        </div>
+        <div style="display:flex;justify-content:space-between;font-size:0.72rem;color:var(--text-dim)">
+          <span>HN: ${d.mean_hn.toFixed(0)} pts</span>
+          <span>Method: ${d.mean_score}%</span>
+        </div>
+      </div>`;
+    }).join('');
 
   return `<div class="section">
     <h2>Social Attention vs Rigor</h2>
     <p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:1rem">Hacker News discussion data for ${hn.total_with_hn} papers (${hn.total_without_hn} had no HN presence). Correlation between HN points and methodology score: <strong>r=${hn.correlation}</strong> — social attention is uncorrelated with rigor.</p>
 
-    ${hn.engagement_n >= 10 ? `<div style="margin-bottom:1.5rem">
-      <h3 style="font-size:0.85rem;color:var(--text-dim);margin-bottom:0.5rem">What Predicts HN Attention? (n=${hn.engagement_n} papers with engagement scores)</h3>
-      <p style="font-size:0.8rem;color:var(--text-dim);margin-bottom:0.75rem">Correlation of 6 engagement factors with log(HN points). Methodology score r=${hn.correlation} shown for comparison.</p>
+    ${scatterSvg}
+
+    <div class="detail-grid" style="margin-top:1.5rem">
+      <div>
+        <h3 style="font-size:0.85rem;color:var(--text-dim);margin-bottom:0.5rem">The Case Study Paradox</h3>
+        <p style="font-size:0.8rem;color:var(--text-dim);margin-bottom:0.75rem">HN attention (blue, left bar) vs methodology score (right bar) by paper type. Case studies get the most love with the worst rigor.</p>
+        ${tagBars}
+      </div>
+      <div>
+        <h3 style="font-size:0.85rem;color:var(--text-dim);margin-bottom:0.5rem">Signals in the Noise</h3>
+        ${repost ? `<div style="margin-bottom:1rem">
+          <p style="font-size:0.8rem;color:var(--text-dim);margin-bottom:0.5rem"><strong>Repost signal:</strong> papers shared repeatedly tend to have more substance.</p>
+          ${Object.entries(repost).map(([label, d]) => `<div class="hbar">
+            <div class="hbar-label"><span>${label}</span><span>${d.mean_score}% <span style="color:var(--text-dim)">(n=${d.n})</span></span></div>
+            <div class="hbar-track"><div class="hbar-fill" style="width:${d.mean_score}%;background:${d.mean_score >= 50 ? 'var(--green)' : 'var(--yellow)'}"></div></div>
+          </div>`).join('')}
+        </div>` : ''}
+        ${controversy ? `<div style="margin-bottom:1rem">
+          <p style="font-size:0.8rem;color:var(--text-dim);margin-bottom:0.5rem"><strong>Controversy signal:</strong> papers people argue about tend to have something real to argue about.</p>
+          <div class="hbar">
+            <div class="hbar-label"><span>High discussion</span><span>${controversy.high_discussion_mean}% <span style="color:var(--text-dim)">(n=${controversy.high_n})</span></span></div>
+            <div class="hbar-track"><div class="hbar-fill" style="width:${controversy.high_discussion_mean}%;background:var(--green)"></div></div>
+          </div>
+          <div class="hbar">
+            <div class="hbar-label"><span>Upvote-and-move-on</span><span>${controversy.low_discussion_mean}% <span style="color:var(--text-dim)">(n=${controversy.low_n})</span></span></div>
+            <div class="hbar-track"><div class="hbar-fill" style="width:${controversy.low_discussion_mean}%;background:var(--yellow)"></div></div>
+          </div>
+        </div>` : ''}
+      </div>
+    </div>
+
+    ${hn.engagement_n >= 10 ? `<div style="margin-top:1.5rem">
+      <h3 style="font-size:0.85rem;color:var(--text-dim);margin-bottom:0.5rem">Engagement Factor Correlations (n=${hn.engagement_n})</h3>
       ${['brand_recognition', 'fear_safety', 'drama_conflict', 'surprise_contrarian', 'practical_relevance', 'demo_ability'].map(dim => {
         const r = hn.engagement_correlations?.[dim];
-        const split = hn.engagement_split?.[dim];
         if (r === undefined) return '';
         const color = r > 0.1 ? 'var(--green)' : r < -0.1 ? 'var(--red)' : 'var(--text-dim)';
         const label = dim.replace(/_/g, ' ').replace(/\b\w/g, (c: string) => c.toUpperCase());
         return `<div class="hbar">
-          <div class="hbar-label"><span>${label}</span><span style="color:${color}">r=${r > 0 ? '+' : ''}${r.toFixed(3)}${split ? ` (high HN: ${split.high_hn_mean}, low: ${split.low_hn_mean})` : ''}</span></div>
+          <div class="hbar-label"><span>${label}</span><span style="color:${color}">r=${r > 0 ? '+' : ''}${r.toFixed(3)}</span></div>
           <div class="hbar-track"><div class="hbar-fill" style="width:${Math.abs(r) * 300}%;max-width:100%;background:${color}"></div></div>
         </div>`;
       }).join('')}
     </div>` : ''}
 
-    <div class="detail-grid">
+    <div class="detail-grid" style="margin-top:1.5rem">
       <div>
         <h3 style="font-size:0.85rem;color:var(--green);margin-bottom:0.5rem">Hidden Gems (score \u226565%, \u22645 HN pts)</h3>
-        <p style="font-size:0.8rem;color:var(--text-dim);margin-bottom:0.5rem">Rigorous papers the internet hasn't noticed.</p>
         ${gems.map(p => `<div class="game-row" style="padding:0.3rem 0">
           <span style="font-size:0.82rem"><a href="#/paper/${p.id}" style="color:var(--accent);text-decoration:none">${p.title.length > 50 ? p.title.slice(0, 47) + '...' : p.title}</a></span>
           <span style="font-family:var(--font);font-size:0.8rem"><span style="color:var(--green)">${p.score}%</span></span>
@@ -744,7 +799,6 @@ function renderHnAnalysis(f: Findings): string {
       </div>
       <div>
         <h3 style="font-size:0.85rem;color:var(--red);margin-bottom:0.5rem">Overhyped (score &lt;40%, \u226530 HN pts)</h3>
-        <p style="font-size:0.8rem;color:var(--text-dim);margin-bottom:0.5rem">Popular papers with weak methodology.</p>
         ${overhyped.map(p => `<div class="game-row" style="padding:0.3rem 0">
           <span style="font-size:0.82rem"><a href="#/paper/${p.id}" style="color:var(--accent);text-decoration:none">${p.title.length > 50 ? p.title.slice(0, 47) + '...' : p.title}</a></span>
           <span style="font-family:var(--font);font-size:0.8rem"><span style="color:var(--red)">${p.score}%</span> <span style="color:var(--text-dim)">${p.hn_points}pts</span></span>
@@ -767,6 +821,55 @@ function renderHnAnalysis(f: Findings): string {
   </div>`;
 }
 
+function renderHnScatter(scatter: { id: string; hn: number; score: number; log_hn: number }[]): string {
+  if (!scatter || scatter.length < 10) return '';
+
+  const w = 600, h = 350;
+  const pad = { l: 50, r: 20, t: 15, b: 45 };
+  const cw = w - pad.l - pad.r, ch = h - pad.t - pad.b;
+
+  const maxLog = Math.max(...scatter.map(p => p.log_hn));
+  const xScale = (v: number) => pad.l + (v / maxLog) * cw;
+  const yScale = (v: number) => pad.t + ch - (v / 100) * ch;
+
+  // Grid
+  let grid = '';
+  for (let v = 0; v <= 100; v += 25) {
+    grid += `<text x="${pad.l - 8}" y="${yScale(v) + 4}" text-anchor="end" font-size="10">${v}%</text>`;
+    grid += `<line class="grid-line" x1="${pad.l}" x2="${w - pad.r}" y1="${yScale(v)}" y2="${yScale(v)}" stroke-dasharray="3"/>`;
+  }
+  // X-axis: log scale labels
+  for (const pts of [1, 10, 50, 100, 500, 1000]) {
+    const lv = Math.log(pts + 1);
+    if (lv <= maxLog) {
+      grid += `<text x="${xScale(lv)}" y="${h - 8}" text-anchor="middle" font-size="10">${pts}</text>`;
+    }
+  }
+
+  // Dots
+  let dots = '';
+  for (const p of scatter) {
+    const cx = xScale(p.log_hn);
+    const cy = yScale(p.score);
+    const color = p.score < 40 ? '#f06565' : p.score < 55 ? '#f0c050' : '#3dd68c';
+    dots += `<circle cx="${cx}" cy="${cy}" r="3" fill="${color}" opacity="0.5">
+      <title>${p.id}: ${p.score}% method, ${p.hn} HN pts</title>
+    </circle>`;
+  }
+
+  // Axes
+  const axes = `
+    <text x="${(pad.l + w - pad.r) / 2}" y="${h - 25}" text-anchor="middle" fill="var(--text-dim)" font-size="11">HN Points (log scale) \u2192</text>
+    <text x="14" y="${(pad.t + h - pad.b) / 2}" text-anchor="middle" fill="var(--text-dim)" font-size="11" transform="rotate(-90, 14, ${(pad.t + h - pad.b) / 2})">Methodology Score \u2192</text>
+  `;
+
+  return `<div>
+    <h3 style="font-size:0.85rem;color:var(--text-dim);margin-bottom:0.5rem">${scatter.length} Papers: HN Points vs Methodology</h3>
+    <svg viewBox="0 0 ${w} ${h}" style="width:100%;max-width:${w}px">${grid}${dots}${axes}</svg>
+    <p style="font-size:0.78rem;color:var(--text-dim)">The blob has no slope. Social attention is random with respect to rigor.</p>
+  </div>`;
+}
+
 function renderGames(f: Findings): string {
   const sorted = Object.entries(f.game_pcts).sort((a, b) => b[1] - a[1]);
   return `<div class="section">
diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py
@@ -947,6 +947,58 @@ def build():
                 "low_hn_mean": round(sum(p[dim] for p in low_hn) / len(low_hn), 2) if low_hn else 0,
             }
 
+    # Scatter points for HN vs methodology (log scale, papers with HN > 0)
+    import math as _math
+    hn_scatter = [{"id": p["id"], "hn": p["hn_points"], "score": p["score"],
+                   "log_hn": round(_math.log(p["hn_points"] + 1), 2)}
+                  for p in hn_with_attention]
+
+    # Tag comparison: avg HN attention + avg methodology per tag
+    tag_hn_comparison = {}
+    tag_groups = defaultdict(list)
+    for p in papers_full:
+        hn_pts = p.get("hn_points", 0)
+        for t in p["tags"]:
+            tag_groups[t].append({"hn": hn_pts, "score": p["score"]})
+    for tag, ps in tag_groups.items():
+        on_hn = [p for p in ps if p["hn"] > 0]
+        if len(on_hn) >= 5:
+            tag_hn_comparison[tag] = {
+                "n": len(on_hn),
+                "mean_hn": round(sum(p["hn"] for p in on_hn) / len(on_hn), 1),
+                "mean_score": round(sum(p["score"] for p in on_hn) / len(on_hn), 1),
+            }
+
+    # Repost signal: quality by number of HN threads
+    repost_bands = {}
+    for label, lo, hi in [("1 post", 1, 1), ("2-3", 2, 3), ("4-7", 4, 7), ("8+", 8, 999)]:
+        band_papers = [p for p in hn_with_attention
+                       if lo <= len(load_hn(p["id"]).get("threads", [])) <= hi]
+        if band_papers:
+            repost_bands[label] = {
+                "n": len(band_papers),
+                "mean_score": safe_mean([p["score"] for p in band_papers]),
+                "mean_hn": safe_mean([p["hn_points"] for p in band_papers]),
+            }
+
+    # Controversy signal: comment-to-point ratio vs quality
+    controversy = {}
+    comment_papers = [p for p in hn_with_attention if p["hn_points"] >= 10]
+    if comment_papers:
+        for p in comment_papers:
+            hn_d = load_hn(p["id"])
+            p["_total_comments"] = sum(t.get("comments", 0) for t in hn_d.get("threads", []))
+            p["_cpt"] = p["_total_comments"] / p["hn_points"] if p["hn_points"] else 0
+        med_cpt = sorted(p["_cpt"] for p in comment_papers)[len(comment_papers) // 2]
+        high_disc = [p for p in comment_papers if p["_cpt"] > med_cpt]
+        low_disc = [p for p in comment_papers if p["_cpt"] <= med_cpt]
+        controversy = {
+            "high_discussion_mean": safe_mean([p["score"] for p in high_disc]),
+            "low_discussion_mean": safe_mean([p["score"] for p in low_disc]),
+            "high_n": len(high_disc),
+            "low_n": len(low_disc),
+        }
+
     hn_analysis = {
         "total_with_hn": len(hn_with_attention),
         "total_without_hn": len(hn_without),
@@ -956,6 +1008,10 @@ def build():
         "top_hn": sorted(hn_with_attention, key=lambda p: -p["hn_points"])[:20],
         "hidden_gems": hidden_gems,
         "overhyped": overhyped,
+        "scatter": hn_scatter,
+        "tag_comparison": tag_hn_comparison,
+        "repost_signal": repost_bands,
+        "controversy": controversy,
         "engagement_correlations": engagement_corrs,
         "engagement_split": engagement_split,
         "engagement_n": len(v3_hn_papers),

	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs

M	explorer/src/views/findings.ts	|	125	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
M	scripts/build-explorer-data.py	|	56	++++++++++++++++++++++++++++++++++++++++++++++++++++++++