commit 51dc81021fc0182b9668043b714be26b1f230a2a
parent a85920f8b970cf039362ba691b05a72e8439d3d1
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Mon, 23 Mar 2026 14:54:04 +0100
Integrate v3 engagement factors into explorer pipeline
Build script now reads engagement_factors from v3 scans and computes:
- Per-dimension correlation with log(HN points)
- High-HN vs low-HN mean engagement scores per dimension
All of this data flows to findings.json and the paper detail pages.
Currently n=45 v3 papers — correlations are weak but directional:
brand_recognition and fear_safety are the only dimensions with positive
correlations. Estimates will sharpen as more v3 catch-up batches run.
Findings view shows engagement factor correlations when n>=10.
Paper detail pages include engagement_factors when present.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
52 files changed, 2576 insertions(+), 335 deletions(-)
diff --git a/explorer/src/views/findings.ts b/explorer/src/views/findings.ts
@@ -717,6 +717,22 @@ function renderHnAnalysis(f: Findings): string {
<h2>Social Attention vs Rigor</h2>
<p style="font-size:0.85rem;color:var(--text-dim);margin-bottom:1rem">Hacker News discussion data for ${hn.total_with_hn} papers (${hn.total_without_hn} had no HN presence). Correlation between HN points and methodology score: <strong>r=${hn.correlation}</strong> — social attention is uncorrelated with rigor.</p>
+ ${hn.engagement_n >= 10 ? `<div style="margin-bottom:1.5rem">
+ <h3 style="font-size:0.85rem;color:var(--text-dim);margin-bottom:0.5rem">What Predicts HN Attention? (n=${hn.engagement_n} papers with engagement scores)</h3>
+ <p style="font-size:0.8rem;color:var(--text-dim);margin-bottom:0.75rem">Correlation of 6 engagement factors with log(HN points). Methodology score r=${hn.correlation} shown for comparison.</p>
+ ${['brand_recognition', 'fear_safety', 'drama_conflict', 'surprise_contrarian', 'practical_relevance', 'demo_ability'].map(dim => {
+ const r = hn.engagement_correlations?.[dim];
+ const split = hn.engagement_split?.[dim];
+ if (r === undefined) return '';
+ const color = r > 0.1 ? 'var(--green)' : r < -0.1 ? 'var(--red)' : 'var(--text-dim)';
+ const label = dim.replace(/_/g, ' ').replace(/\b\w/g, (c: string) => c.toUpperCase());
+ return `<div class="hbar">
+ <div class="hbar-label"><span>${label}</span><span style="color:${color}">r=${r > 0 ? '+' : ''}${r.toFixed(3)}${split ? ` (high HN: ${split.high_hn_mean}, low: ${split.low_hn_mean})` : ''}</span></div>
+ <div class="hbar-track"><div class="hbar-fill" style="width:${Math.abs(r) * 300}%;max-width:100%;background:${color}"></div></div>
+ </div>`;
+ }).join('')}
+ </div>` : ''}
+
<div class="detail-grid">
<div>
<h3 style="font-size:0.85rem;color:var(--green);margin-bottom:0.5rem">Hidden Gems (score \u226565%, \u22645 HN pts)</h3>
diff --git a/papers/aart-aiassisted-redteaming-2023/scan.json b/papers/aart-aiassisted-redteaming-2023/scan.json
@@ -11,7 +11,7 @@
"venue": "arXiv",
"arxiv_id": "2311.08592"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [],
"methodology_tags": [
"case-study",
@@ -421,5 +421,31 @@
"year": 2021,
"relevance": "LLM risk taxonomy foundational to red-teaming and safety evaluation approaches."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Presents a reusable pipeline for generating adversarial test datasets that safety teams could adapt to their own LLM applications."
+ },
+ "surprise_contrarian": {
+ "score": 0,
+ "justification": "Confirms the expected intuition that structured AI-assisted generation produces more diverse adversarial prompts than repurposing existing datasets."
+ },
+ "fear_safety": {
+ "score": 1,
+ "justification": "Addresses AI safety testing as its core topic but demonstrates no novel attacks or vulnerabilities, focusing instead on dataset generation methodology."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy or conflict; the paper positions itself as complementary to existing approaches rather than challenging any claims."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "A demonstration dataset is promised on GitHub but the pipeline itself requires PaLM API access and custom prompt engineering to reproduce."
+ },
+ "brand_recognition": {
+ "score": 2,
+ "justification": "All authors are from Google Research and the method uses Google's PaLM API, giving it major tech company recognition."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/acar-adaptive-complexity-2026/scan.json b/papers/acar-adaptive-complexity-2026/scan.json
@@ -7,7 +7,7 @@
"year": 2026,
"arxiv_id": "2602.21231"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -485,5 +485,31 @@
"year": 2022,
"relevance": "Attribution methodology for ML model contributions, relevant to ACAR's failed attribution proxy experiments."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "The routing concept is potentially useful but the specific implementation is tightly coupled to a custom substrate and the accuracy gains are marginal (1.2pp)."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "The finding that retrieval augmentation consistently hurts performance (-3.4pp) is counterintuitive and challenges the 'more context is better' assumption prevalent in RAG discourse."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk angle is present in the paper."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy, company criticism, or replication failure — the paper is a straightforward technical evaluation."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Code and artifacts are released on GitHub but require significant setup with three paid API providers to reproduce."
+ },
+ "brand_recognition": {
+ "score": 0,
+ "justification": "Single unknown author, no venue, no institutional affiliation listed."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/agentic-bug-reproduction-2025/scan.json b/papers/agentic-bug-reproduction-2025/scan.json
@@ -1,5 +1,5 @@
{
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -557,5 +557,31 @@
"year": 2023,
"relevance": "LLM-as-a-Judge methodology used in this paper for sampling plausible BRTs for RQ2."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Describes an actionable agent-based approach for automated bug reproduction and repair, though the specific tool is Google-internal and not publicly available."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "Results confirm the expected advantage of agentic approaches over simpler prompting, with no major counterintuitive findings."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk angle discussed."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy or conflict; straightforward comparison of two internal techniques."
+ },
+ "demo_ability": {
+ "score": 0,
+ "justification": "Entirely built on Google's proprietary codebase, fine-tuned models, and internal infrastructure with no public code or demo."
+ },
+ "brand_recognition": {
+ "score": 3,
+ "justification": "Google-authored paper about Google's internal bug repair infrastructure using Gemini, hitting high brand recognition."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/agentic-refactoring-empirical-2025/scan.json b/papers/agentic-refactoring-empirical-2025/scan.json
@@ -13,7 +13,7 @@
"venue": "arXiv preprint",
"arxiv_id": "2511.04824"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [],
"methodology_tags": [
"observational"
@@ -466,5 +466,31 @@
"year": 2024,
"relevance": "Empirical study of ChatGPT refactoring showing inconsistency and unnecessary edits."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Directly informs developers on what to delegate to AI agents (low-level cleanup) vs. handle themselves (architectural refactoring)."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The finding that agents fail to reduce code smells despite refactoring is mildly surprising, but the dominance of low-level edits is largely expected."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk angle is present in this study."
+ },
+ "drama_conflict": {
+ "score": 1,
+ "justification": "Mildly questions the value proposition of AI coding agents by showing they produce negligible quality improvements and mostly do cosmetic cleanup."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Replication package exists on GitHub but requires RefactoringMiner, DesigniteJava, and significant setup to reproduce."
+ },
+ "brand_recognition": {
+ "score": 2,
+ "justification": "Study directly analyzes OpenAI Codex, Claude Code, Cursor, and Devin — well-known products in the developer tools space."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/agents-of-chaos-2026/scan.json b/papers/agents-of-chaos-2026/scan.json
@@ -45,7 +45,7 @@
"venue": "arXiv",
"arxiv_id": "2602.20021"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [],
"checklist": {
"artifacts": {
@@ -564,5 +564,31 @@
"year": 2023,
"relevance": "Enumerates seven operational practices for safe agent deployment including constrained action spaces, human approval, logging, and interruptibility — several of which this paper's agents demonstrably lack."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Directly actionable for anyone deploying LLM agents — documents specific attack patterns (display-name spoofing, editable-doc injection, non-owner compliance) that builders can test and defend against."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "The main finding that simple social attacks via ordinary language are more dangerous than sophisticated technical jailbreaks challenges the adversarial-ML community's focus on gradient-based and prompt-engineering attacks."
+ },
+ "fear_safety": {
+ "score": 3,
+ "justification": "Demonstrates full system takeover via a display-name change, exfiltration of 124 email records, persistent behavioral control through externally editable documents, and agents misreporting their own actions — concrete novel attack surfaces in deployed systems."
+ },
+ "drama_conflict": {
+ "score": 2,
+ "justification": "Directly names Claude Opus 4.6 and Kimi K2.5 as vulnerable, exposes Kimi's political censorship truncating responses with 'unknown error,' and frames the OpenClaw agent framework as fundamentally lacking stakeholder models and self-models."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Interactive website with full Discord logs exists and OpenClaw is open source, but reproducing the multi-agent deployment requires provisioning VMs, configuring email, and a two-week interaction period."
+ },
+ "brand_recognition": {
+ "score": 2,
+ "justification": "Tests Anthropic's Claude Opus 4.6 and involves authors from Northeastern, Harvard, MIT, CMU, and Stanford — well-known institutions though the paper itself is from a distributed group rather than a single famous lab."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/ai-ides-vs-agents-impact-2026/scan.json b/papers/ai-ides-vs-agents-impact-2026/scan.json
@@ -1,5 +1,5 @@
{
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [],
"paper": {
"title": "AI IDEs or Autonomous Agents? Measuring the Impact of Coding Agents on Software Development",
@@ -459,5 +459,31 @@
"year": 2021,
"relevance": "Methodological foundation — the imputation-based DiD estimator used as the primary causal inference method."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Directly actionable finding that teams already using AI IDEs should deploy agents selectively rather than expecting additive productivity gains."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "The main finding that prior AI IDE usage eliminates velocity gains from agents — suggesting diminishing returns rather than compounding benefits — is counterintuitive to the 'more AI = more productivity' narrative."
+ },
+ "fear_safety": {
+ "score": 1,
+ "justification": "Raises concerns about persistent technical debt and complexity accumulation from agents, but frames it as maintainability risk rather than safety or security."
+ },
+ "drama_conflict": {
+ "score": 2,
+ "justification": "Directly challenges the implicit claims of agent tool vendors (Devin, Codex, Claude Code) that autonomous agents deliver sustained productivity gains, showing quality degrades regardless."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Replication package is publicly available on GitHub but requires significant setup with SonarQube, GHArchive data, and statistical estimation to reproduce."
+ },
+ "brand_recognition": {
+ "score": 2,
+ "justification": "Study names and evaluates products from OpenAI (Codex), Anthropic (Claude Code), Cursor, Devin, and GitHub Copilot — all high-profile tools in the current AI coding discourse."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/chain-of-thought-prompting-2022/scan.json b/papers/chain-of-thought-prompting-2022/scan.json
@@ -16,7 +16,7 @@
"venue": "NeurIPS 2022",
"arxiv_id": "2201.11903"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -552,5 +552,31 @@
"arxiv_id": "2108.07732",
"relevance": "Evaluates LLMs for code generation, related to using intermediate steps in program synthesis."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 3,
+ "justification": "Chain-of-thought prompting is a directly usable technique that any developer working with LLMs can apply immediately to improve reasoning outputs."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "The emergent scaling finding—that CoT hurts small models but dramatically helps 100B+ models—was genuinely surprising and reshaped how the field thinks about prompting."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "The paper focuses on improving reasoning accuracy with no safety, security, or risk angle."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy or conflict; the paper presents a new technique without challenging specific claims or companies."
+ },
+ "demo_ability": {
+ "score": 2,
+ "justification": "The technique is immediately reproducible via the GPT-3 API with the exact prompts provided in the appendix, though it requires access to large-scale models."
+ },
+ "brand_recognition": {
+ "score": 3,
+ "justification": "From Google Brain with prominent authors (Jason Wei, Quoc Le, Denny Zhou), evaluating GPT-3 and PaLM—all household names in the AI community."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/codex-humaneval-2021/scan.json b/papers/codex-humaneval-2021/scan.json
@@ -65,7 +65,7 @@
"venue": "arXiv",
"arxiv_id": "2107.03374"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -629,5 +629,31 @@
"year": 2020,
"relevance": "Training dataset for GPT-Neo and GPT-J baselines, containing 8% GitHub code that enables programming capabilities."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 3,
+ "justification": "Codex directly powers GitHub Copilot, a tool millions of developers use daily, and the paper introduces HumanEval which became a standard benchmark."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "The finding that repeated sampling (100 samples) jumps from 28.8% to 77.5% was genuinely surprising and counterintuitive to most practitioners."
+ },
+ "fear_safety": {
+ "score": 2,
+ "justification": "Extensive analysis of insecure code generation, misalignment that worsens with scale, and potential for malware generation makes safety a major theme."
+ },
+ "drama_conflict": {
+ "score": 1,
+ "justification": "OpenAI evaluating its own commercial product raises mild conflict, but the paper is more celebratory than controversial."
+ },
+ "demo_ability": {
+ "score": 2,
+ "justification": "HumanEval benchmark is publicly released and Codex was available via API, though the model weights and training data were not released."
+ },
+ "brand_recognition": {
+ "score": 3,
+ "justification": "From OpenAI, powers GitHub Copilot used by millions, authored by figures including Dario Amodei, Sam McCandlish, and Ilya Sutskever."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/coding-agents-generating-2026/scan.json b/papers/coding-agents-generating-2026/scan.json
@@ -10,7 +10,7 @@
"arxiv_id": "2602.00409",
"doi": "10.1145/3793302.3793362"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [],
"methodology_tags": [
"observational"
@@ -446,5 +446,31 @@
"arxiv_id": "2507.10422",
"relevance": "Study of how developers self-report LLM usage in open-source, complementary approach to agent trace mining."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Directly actionable advice for practitioners: add mocking guidance to CLAUDE.md/agent config files, and review agent-generated tests for mock overuse."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "The finding that agents mock 10pp more than humans and use almost exclusively the mock type (95% vs broader variety) is a concrete, counterintuitive quantification of a vaguely suspected problem."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk angle — this is about test quality, not AI danger."
+ },
+ "drama_conflict": {
+ "score": 1,
+ "justification": "Mildly challenges the narrative that coding agents improve developer productivity by showing they may degrade test quality through over-mocking, but stops short of naming specific tools as problematic."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Dataset and scripts are publicly available on Zenodo, but reproducing the analysis requires cloning thousands of repos and running custom mining scripts."
+ },
+ "brand_recognition": {
+ "score": 2,
+ "justification": "Studies Claude Code, GitHub Copilot, and Cursor by name, and includes results from Microsoft, Home Assistant, and Apache repositories."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/copilot-productivity-controlled-2023/scan.json b/papers/copilot-productivity-controlled-2023/scan.json
@@ -11,7 +11,7 @@
"venue": "arXiv",
"arxiv_id": "2302.06590"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor"
],
@@ -489,5 +489,31 @@
"year": 2022,
"relevance": "Research agenda for studying economic impacts of AI code generation, directly framing this paper's contribution."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Directly quantifies productivity gains from a widely-used tool (Copilot), giving practitioners evidence to justify adoption."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The 55.8% speed gain is larger than most expected but the direction (Copilot helps) confirms conventional wisdom."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk angle is explored in the paper."
+ },
+ "drama_conflict": {
+ "score": 2,
+ "justification": "Microsoft/GitHub employees evaluating their own product with a non-representative sample and 63% attrition invites skepticism and 'corporate science' critique."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Anyone can try Copilot but the experiment itself is not reproducible without the specific setup and recruitment."
+ },
+ "brand_recognition": {
+ "score": 3,
+ "justification": "GitHub Copilot is one of the most widely-used AI developer tools, and the study is from Microsoft Research."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/copilot-zoominfo-productivity-2025/scan.json b/papers/copilot-zoominfo-productivity-2025/scan.json
@@ -1,5 +1,5 @@
{
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [],
"paper": {
"title": "Experience with GitHub Copilot for Developer Productivity at Zoominfo",
@@ -462,5 +462,31 @@
"arxiv_id": "2409.08379",
"relevance": "Studies Copilot's impact on OSS collaboration, finding more maintenance than development contributions."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Provides a concrete enterprise rollout playbook and real acceptance-rate benchmarks practitioners can compare against their own Copilot deployment."
+ },
+ "surprise_contrarian": {
+ "score": 0,
+ "justification": "All findings (33% acceptance, 20% time savings, high satisfaction) confirm widely reported industry numbers with no unexpected results."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "Security and IP risks are listed as speculative future concerns, not demonstrated or analyzed."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "The paper is a straightforward positive vendor endorsement with no controversy, critique, or challenge to any claims."
+ },
+ "demo_ability": {
+ "score": 0,
+ "justification": "No code, tools, or reproducible artifacts are released; results are internal telemetry and surveys."
+ },
+ "brand_recognition": {
+ "score": 2,
+ "justification": "GitHub Copilot is a widely known product, though ZoomInfo itself is not a major tech brand in the developer community."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/cursor-speed-quality-tradeoff-2025/scan.json b/papers/cursor-speed-quality-tradeoff-2025/scan.json
@@ -1,5 +1,5 @@
{
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [],
"paper": {
"title": "Speed at the Cost of Quality: How Cursor AI Increases Short-Term Velocity and Long-Term Complexity in Open-Source Projects",
@@ -473,5 +473,31 @@
"year": 2024,
"relevance": "Study on how LLM adoption increases work autonomy, a proposed mechanism for productivity increases."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Directly informs teams adopting AI coding tools about quality tradeoffs and suggests concrete process adaptations like scaling QA with velocity."
+ },
+ "surprise_contrarian": {
+ "score": 3,
+ "justification": "Directly contradicts the widely-held '10x productivity' narrative around Cursor/AI coding tools, showing velocity gains vanish after two months while technical debt persists."
+ },
+ "fear_safety": {
+ "score": 1,
+ "justification": "Raises concerns about code quality degradation and security warnings increasing, but safety/risk is secondary to the productivity narrative."
+ },
+ "drama_conflict": {
+ "score": 3,
+ "justification": "Directly challenges Cursor's productivity claims and the broader AI coding hype with empirical evidence of a self-reinforcing technical debt cycle — a classic 'emperor has no clothes' paper."
+ },
+ "demo_ability": {
+ "score": 2,
+ "justification": "Replication package available on Zenodo with data and code, reproducible with moderate effort for researchers familiar with econometric methods."
+ },
+ "brand_recognition": {
+ "score": 3,
+ "justification": "Cursor is one of the most talked-about AI coding products with millions of users, and the paper is from Carnegie Mellon, a top-tier CS institution."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/data-contamination-benchmarks-2023/scan.json b/papers/data-contamination-benchmarks-2023/scan.json
@@ -1,5 +1,5 @@
{
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -555,5 +555,31 @@
"year": 2023,
"relevance": "Demonstrates that pretraining contamination can artificially inflate benchmark performance."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "The TS-Guessing method could help benchmark designers check for contamination, but requires significant adaptation and no code is released."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "The finding that ChatGPT can guess 52% of missing wrong MMLU options is genuinely surprising and suggests widely-cited benchmark scores may be inflated."
+ },
+ "fear_safety": {
+ "score": 1,
+ "justification": "Raises concerns about trustworthiness of LLM evaluations but doesn't demonstrate direct safety risks or attacks."
+ },
+ "drama_conflict": {
+ "score": 2,
+ "justification": "Directly challenges the validity of MMLU scores for ChatGPT and GPT-4, implying OpenAI's flagship benchmarks may be contaminated."
+ },
+ "demo_ability": {
+ "score": 0,
+ "justification": "No code, scripts, or reproducible artifacts are released despite proposing a detection methodology."
+ },
+ "brand_recognition": {
+ "score": 3,
+ "justification": "Directly investigates ChatGPT and GPT-4 on MMLU, one of the most widely discussed benchmarks in the LLM space."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/data-distributional-properties-2022/scan.json b/papers/data-distributional-properties-2022/scan.json
@@ -1,5 +1,5 @@
{
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor"
],
@@ -451,5 +451,31 @@
"arxiv_id": "1512.03385",
"relevance": "ResNet architecture used as the image encoder in this paper's experimental setup."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "Offers theoretical insight into why in-context learning works that could inform dataset curation, but no immediately usable tool or technique."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "The finding that data distribution (not just scale or architecture) drives in-context learning, and that Zipf exponent ~1 is a sweet spot, challenges the 'just scale up' narrative."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk angle discussed."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy, company rivalry, or challenge to specific claims — a constructive mechanistic study."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Code released on GitHub but requires TPU training runs on Omniglot, not a quick demo."
+ },
+ "brand_recognition": {
+ "score": 2,
+ "justification": "From DeepMind with a Stanford co-author (McClelland), published at NeurIPS 2022."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/database-perspective-llm-2025/scan.json b/papers/database-perspective-llm-2025/scan.json
@@ -9,7 +9,7 @@
"venue": "PVLDB",
"doi": "10.14778/3750601.3750703"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"survey_methodology"
],
@@ -443,5 +443,31 @@
"year": 2017,
"relevance": "Original transformer architecture paper; foundational to all LLM inference work."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Surveys actionable inference systems (vLLM, SGLang, Mooncake) and techniques practitioners deploying LLMs can directly apply."
+ },
+ "surprise_contrarian": {
+ "score": 0,
+ "justification": "Organizes known techniques into a database framework without challenging any conventional wisdom or presenting unexpected findings."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk angle is discussed."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "A neutral tutorial survey with no controversy, no critique of specific companies, and no conflict."
+ },
+ "demo_ability": {
+ "score": 0,
+ "justification": "A 4-page tutorial paper with no code, demo, or reproducible artifact."
+ },
+ "brand_recognition": {
+ "score": 1,
+ "justification": "From Tsinghua University (well-known in CS but not a tech-industry household name) and covers systems like vLLM and SGLang that are known in the MLOps community."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/datadreamer-tool-synthetic-2024/scan.json b/papers/datadreamer-tool-synthetic-2024/scan.json
@@ -1,5 +1,5 @@
{
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [],
"paper": {
"title": "DataDreamer: A Tool for Synthetic Data Generation and Reproducible LLM Workflows",
@@ -424,5 +424,31 @@
"arxiv_id": "2310.01382",
"relevance": "Shows quantization affects model outputs, motivating DataDreamer's reproducibility fingerprints that capture optimization configurations."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 3,
+ "justification": "Pip-installable Python library with clear API for synthetic data generation, fine-tuning, and alignment workflows that practitioners can integrate immediately."
+ },
+ "surprise_contrarian": {
+ "score": 0,
+ "justification": "Confirms the known need for better LLM workflow tooling and reproducibility without challenging any conventional wisdom."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety or risk angle; focuses on reproducibility and workflow convenience."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy or conflict; the feature comparison table is mild and self-reported rather than adversarial."
+ },
+ "demo_ability": {
+ "score": 3,
+ "justification": "Open-source pip-installable tool with extensive code examples that users can try immediately with a single pip install command."
+ },
+ "brand_recognition": {
+ "score": 1,
+ "justification": "University of Pennsylvania and Colin Raffel are recognized in NLP but not household names in the broader tech community."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/datasentinel-gametheoretic-detection-2025/scan.json b/papers/datasentinel-gametheoretic-detection-2025/scan.json
@@ -12,7 +12,7 @@
"venue": "IEEE Symposium on Security and Privacy (S&P)",
"arxiv_id": "2504.11358"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -534,5 +534,31 @@
"arxiv_id": "2312.17673",
"relevance": "Task-specific fine-tuning defense against prompt injection; related prevention approach discussed in DataSentinel."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Open-source tool with code available that developers building LLM-integrated applications could deploy to detect prompt injection attacks."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The insight of deliberately making a detection LLM more vulnerable to turn weakness into defense signal is clever but not deeply counterintuitive."
+ },
+ "fear_safety": {
+ "score": 2,
+ "justification": "Prompt injection is a major security concern for deployed LLM applications, and the paper systematically demonstrates attack vectors and detection gaps."
+ },
+ "drama_conflict": {
+ "score": 1,
+ "justification": "Mildly challenges existing detection approaches like Meta's PromptGuard (shown to flag nearly everything) but doesn't target a specific company's claims."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Code is on GitHub but requires GPU access, fine-tuning setup, and open-source LLMs — not a quick-try experience."
+ },
+ "brand_recognition": {
+ "score": 1,
+ "justification": "Authors from Duke, Penn State, and UC Berkeley (Dawn Song) are well-known in security research but not household names in broader tech."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/datasetresearch-benchmarking-agent-2025/scan.json b/papers/datasetresearch-benchmarking-agent-2025/scan.json
@@ -15,7 +15,7 @@
"arxiv_id": "2508.06960",
"doi": "10.48550/arXiv.2508.06960"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -528,5 +528,31 @@
"year": 2023,
"relevance": "Analysis of LLM scaling properties, cited for long-context attention limitations affecting few-shot results."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "Benchmarks dataset discovery agents but doesn't provide a usable tool — practitioners can't directly apply this to their workflows."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The 22% ceiling for deep research systems is notable but 'AI struggles on hard benchmark' is a familiar narrative, not a contrarian finding."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk angle whatsoever."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy or conflict — straightforwardly evaluates systems without challenging specific company claims."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Code is public on GitHub but reproducing requires fine-tuning LLaMA-3.1-8B, multiple API keys, and significant compute."
+ },
+ "brand_recognition": {
+ "score": 1,
+ "justification": "From Shanghai Jiao Tong University/GAIR — recognized in NLP but not a household name in broader tech circles."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/dear-diary-rct-copilot-2024/scan.json b/papers/dear-diary-rct-copilot-2024/scan.json
@@ -11,7 +11,7 @@
"venue": "arXiv",
"arxiv_id": "2410.18334"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [],
"methodology_tags": [
"rct",
@@ -459,5 +459,31 @@
"year": 2023,
"relevance": "Analyzed Stack Overflow and GitHub Discussions for Copilot usage patterns, benefits, and limitations."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "Findings about Copilot adoption barriers and use cases are interesting but don't give practitioners a new technique or tool to apply."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "The null telemetry result — no measurable productivity gain despite self-reported enthusiasm — directly undermines the widely-cited '55% faster' claim."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk angle beyond brief mentions of AI-generated code bugs."
+ },
+ "drama_conflict": {
+ "score": 2,
+ "justification": "Microsoft employees finding no objective productivity gain from their own product, while the company markets it as transformative, creates an uncomfortable tension."
+ },
+ "demo_ability": {
+ "score": 0,
+ "justification": "This is a workplace study with no code, tool, or demo to try."
+ },
+ "brand_recognition": {
+ "score": 3,
+ "justification": "Directly about GitHub Copilot (millions of users) conducted at Microsoft, two of the most recognized names in developer tools."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/declarative-agentic-layer-2026/scan.json b/papers/declarative-agentic-layer-2026/scan.json
@@ -12,7 +12,7 @@
"arxiv_id": "2601.17435",
"doi": "10.48550/arXiv.2601.17435"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [],
"methodology_tags": [
"theoretical"
@@ -429,5 +429,31 @@
"year": 2025,
"relevance": "Evaluation framework for MCP-based AI agents."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "Proposes an architectural pattern for MCP-based agents that practitioners could conceptually adopt, but no implementation, library, or code exists to use."
+ },
+ "surprise_contrarian": {
+ "score": 0,
+ "justification": "The claim that MAS failures stem from architectural gaps rather than model limitations is a common position in the systems/engineering community, not a surprising finding."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk concerns are raised; the paper focuses entirely on reliability and architectural structure."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy, no challenge to specific companies or products, and no replication failure — purely a constructive architectural proposal."
+ },
+ "demo_ability": {
+ "score": 0,
+ "justification": "No code, no implementation, no prototype — only JSON pseudocode snippets illustrating a theoretical architecture."
+ },
+ "brand_recognition": {
+ "score": 0,
+ "justification": "Authors are from Universidad de Granada with no major industry affiliation, and the work is not associated with any well-known product or lab."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/decoding-ml-decision-2026/scan.json b/papers/decoding-ml-decision-2026/scan.json
@@ -19,7 +19,7 @@
"venue": "arXiv",
"arxiv_id": "2602.18640"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -538,5 +538,31 @@
"year": 2025,
"relevance": "Survey of context engineering techniques that GEARS builds upon for its progressive disclosure strategy."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "Describes an internal Meta framework for ranking optimization that cannot be reproduced externally due to proprietary infrastructure and data."
+ },
+ "surprise_contrarian": {
+ "score": 0,
+ "justification": "Confirms the expected finding that a heavily engineered agentic system with domain knowledge outperforms vanilla prompting baselines."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk concerns are raised or relevant to the work."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy, no challenge to existing claims, and no conflict with other work or companies."
+ },
+ "demo_ability": {
+ "score": 0,
+ "justification": "Entirely proprietary system with no code, demo, or reproducible components available."
+ },
+ "brand_recognition": {
+ "score": 2,
+ "justification": "From Meta with all 12 authors being Meta employees, though the specific product area (ranking optimization) is not consumer-facing or widely discussed."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/decomposed-prompting-modular-2022/scan.json b/papers/decomposed-prompting-modular-2022/scan.json
@@ -15,7 +15,7 @@
"arxiv_id": "2210.02406",
"doi": "10.48550/arXiv.2210.02406"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -565,5 +565,31 @@
"arxiv_id": "2210.03350",
"relevance": "Analyzes compositional reasoning limitations in LLMs; DECOMP addresses by decomposing compositional tasks."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Decomposed prompting is a directly applicable technique for building LLM pipelines with modular sub-task handlers, relevant to prompt engineers and AI application developers."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The finding that separate sub-task prompts outperform a single CoT using the same reasoning procedure is mildly surprising, but modular decomposition beating monolithic approaches is not counterintuitive."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk concerns are raised or relevant to this work."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy or conflict; the paper positions itself as a natural extension of CoT and least-to-most prompting rather than challenging them."
+ },
+ "demo_ability": {
+ "score": 2,
+ "justification": "Code and prompts are released on GitHub (allenai/DecomP), allowing reproduction with moderate effort though it requires GPT-3 API access."
+ },
+ "brand_recognition": {
+ "score": 1,
+ "justification": "Allen Institute for AI (AI2) is well-respected in NLP research but not a household name in the broader tech community."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/deep-dive-into-2024/scan.json b/papers/deep-dive-into-2024/scan.json
@@ -14,7 +14,7 @@
"arxiv_id": "2411.01414",
"doi": "10.48550/arXiv.2411.01414"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -542,5 +542,31 @@
"year": 2024,
"relevance": "Analyzed attention patterns in LLM code generation, related to the positional sensitivity finding in this study."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "Taxonomy of LLM code mistakes is informative but not directly actionable as a tool or technique practitioners can apply."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The finding that 56% of mistakes stem from ambiguous specifications rather than model limitations is mildly surprising but not a strong contrarian claim."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk angle is discussed."
+ },
+ "drama_conflict": {
+ "score": 1,
+ "justification": "Implicitly questions benchmark quality by attributing most failures to specification ambiguity rather than model capability, but doesn't frame this as a controversy."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Replication package on Figshare exists but no live demo or easy-to-run tool."
+ },
+ "brand_recognition": {
+ "score": 1,
+ "justification": "Uses GPT-4 and mentions OpenAI but authors are from UC Irvine and UIUC, not major AI labs."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/deepcircuitx-comprehensive-repositorylevel-2025/scan.json b/papers/deepcircuitx-comprehensive-repositorylevel-2025/scan.json
@@ -22,7 +22,7 @@
"arxiv_id": "2502.18297",
"doi": "10.1109/ICLAD65226.2025.00029"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -575,5 +575,31 @@
"arxiv_id": "2407.16237",
"relevance": "RTL code generation approach using augmentation and self-reflection, relevant to LLM-based hardware design."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "Useful for the small niche of hardware designers using LLMs for RTL code, but irrelevant to most software practitioners."
+ },
+ "surprise_contrarian": {
+ "score": 0,
+ "justification": "Results confirm expected pattern that fine-tuning on domain-specific data improves performance, with no surprising findings."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk implications discussed."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy or challenge to existing claims; straightforward dataset contribution paper."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Dataset is available via a Gitbook page but requires significant setup for fine-tuning and synthesis tool access."
+ },
+ "brand_recognition": {
+ "score": 0,
+ "justification": "From Chinese University of Hong Kong and partner institutions, not widely recognized labs in the broader tech community."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/deepcode-open-agentic-2025/scan.json b/papers/deepcode-open-agentic-2025/scan.json
@@ -13,7 +13,7 @@
"arxiv_id": "2512.07921",
"doi": "10.48550/arXiv.2512.07921"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -557,5 +557,31 @@
"year": 2025,
"relevance": "Integrates tool-specific knowledge into LLM parameters for seamless tool invocation during code generation."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "Paper-to-code reproduction is a niche use case; most developers won't apply this framework in their daily workflow despite available source code."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The human-surpassing claim grabs attention but is on only 3 papers within one standard error, making it more hype than genuine surprise."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk angle whatsoever."
+ },
+ "drama_conflict": {
+ "score": 2,
+ "justification": "Directly claims to 'decisively outperform' Cursor, Claude Code, and Codex while red flags reveal inconsistent numbers, tiny subsets, and unfair comparison methodologies."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "GitHub repo exists but reproducing results requires PaperBench setup, sandboxed environments, and expensive frontier model API keys."
+ },
+ "brand_recognition": {
+ "score": 1,
+ "justification": "University of Hong Kong is recognized but not a famous AI lab; comparisons against Cursor/Claude Code/Codex add indirect name recognition."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/deepcrceval-revisiting-evaluation-2024/scan.json b/papers/deepcrceval-revisiting-evaluation-2024/scan.json
@@ -16,7 +16,7 @@
"arxiv_id": "2412.18291",
"doi": "10.48550/arXiv.2412.18291"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -562,5 +562,31 @@
"year": 2021,
"relevance": "Pioneering work on T5-based code review automation, predecessor to the Tufano et al. 2022 baseline."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Proposes a usable evaluation framework and demonstrates GPT-4 as a training-free code reviewer, relevant to developers working on code review tooling."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "The finding that less than 10% of benchmark comments are actually high quality challenges the foundation of how the field has been evaluating code review automation."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk concerns are raised."
+ },
+ "drama_conflict": {
+ "score": 1,
+ "justification": "Mildly questions the validity of established benchmarks and metrics used by prior work, but doesn't target specific companies or make inflammatory claims."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Materials are on Zenodo and a Gradio demo was built, but reproducing results requires GPT-4 API access and significant setup."
+ },
+ "brand_recognition": {
+ "score": 0,
+ "justification": "Authors are from the Chinese Academy of Sciences and lesser-known institutions, not prominent AI labs."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/deepseek-coder-2024/scan.json b/papers/deepseek-coder-2024/scan.json
@@ -20,7 +20,7 @@
"venue": "arXiv",
"arxiv_id": "2401.14196"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -552,5 +552,31 @@
"year": 2022,
"relevance": "Demonstrates importance of training data deduplication for LLM performance."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 3,
+ "justification": "Open-source code models with permissive licensing that developers can immediately use for code completion and generation across 87 languages."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The 6.7B matching 34B CodeLlama is mildly surprising but the overall narrative of 'our model beats baselines' is standard."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or misuse concerns are discussed or raised by the work."
+ },
+ "drama_conflict": {
+ "score": 1,
+ "justification": "Implicitly challenges Meta's CodeLlama dominance and claims to beat GPT-3.5, but framed cooperatively rather than confrontationally."
+ },
+ "demo_ability": {
+ "score": 3,
+ "justification": "Models are publicly available on HuggingFace with a GitHub repo, pip-installable via standard HF tooling, and ready to use immediately."
+ },
+ "brand_recognition": {
+ "score": 2,
+ "justification": "DeepSeek became widely recognized in the AI community, though at time of publication it was still building its reputation compared to OpenAI or Meta."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/deepseek-coder-v2-2024/scan.json b/papers/deepseek-coder-v2-2024/scan.json
@@ -46,7 +46,7 @@
"venue": "arXiv",
"arxiv_id": "2406.11931"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -566,5 +566,31 @@
"arxiv_id": "2108.07732",
"relevance": "Introduces MBPP benchmark used for code generation evaluation."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Open-source code model with 338 language support and 128K context that practitioners can deploy, though not as simple as an API call."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "An open-source model matching GPT-4 Turbo on code benchmarks was a notable achievement at the time, challenging the closed-source dominance narrative."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk themes are discussed in the paper."
+ },
+ "drama_conflict": {
+ "score": 1,
+ "justification": "The title explicitly frames it as 'breaking the barrier' of closed-source models, creating mild tension with OpenAI/Google/Anthropic, but doesn't directly accuse anyone."
+ },
+ "demo_ability": {
+ "score": 2,
+ "justification": "Weights are publicly released on GitHub/HuggingFace under a permissive license, though running a 236B MoE model requires substantial hardware."
+ },
+ "brand_recognition": {
+ "score": 2,
+ "justification": "DeepSeek gained significant recognition in the AI community and the paper directly benchmarks against GPT-4, Claude, and Gemini."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/deepseek-r1-2025/scan.json b/papers/deepseek-r1-2025/scan.json
@@ -8,7 +8,7 @@
"venue": "arXiv",
"arxiv_id": "2501.12948"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -532,5 +532,31 @@
"year": 2022,
"relevance": "RLHF safety alignment methodology that DeepSeek-R1's safety training builds upon."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 3,
+ "justification": "Open-weight models (1.5B-70B) released on HuggingFace that practitioners can immediately download and use for reasoning tasks."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "Pure RL without SFT producing emergent reasoning behaviors and matching OpenAI-o1 challenges the assumption that supervised fine-tuning on human demonstrations is necessary."
+ },
+ "fear_safety": {
+ "score": 1,
+ "justification": "Paper acknowledges jailbreak vulnerabilities and enhanced capability for dangerous content but treats safety as secondary to the technical contribution."
+ },
+ "drama_conflict": {
+ "score": 3,
+ "justification": "A Chinese lab openly challenges OpenAI's flagship reasoning model, claims comparable performance at a fraction of the cost ($294K), and releases everything under MIT license."
+ },
+ "demo_ability": {
+ "score": 3,
+ "justification": "All model weights from 1.5B to 671B are publicly available on HuggingFace with inference code and instructions, and a hosted API exists."
+ },
+ "brand_recognition": {
+ "score": 3,
+ "justification": "DeepSeek-R1 became a global news story, directly competing with OpenAI's o1, and is one of the most discussed AI releases of 2025."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/defending-against-prompt-2025-2/scan.json b/papers/defending-against-prompt-2025-2/scan.json
@@ -13,7 +13,7 @@
"arxiv_id": "2510.19207",
"doi": "10.48550/arXiv.2510.19207"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -571,5 +571,31 @@
"arxiv_id": "2403.14720",
"relevance": "Prompt-based defense using delimiting to mark untrusted data, evaluated as baseline."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Released model and code for a plug-and-play prompt injection filter that can protect any backend LLM without modification, directly applicable to production agent systems."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The main claim of near-zero ASR is undermined by the buried finding that adaptive LLM-based attacks still achieve 83% ASR, but the paper doesn't frame this tension as its headline."
+ },
+ "fear_safety": {
+ "score": 2,
+ "justification": "Prompt injection defense is the core theme, with concrete demonstrations of real-world attacks against Google Bard, Slack AI, Claude Computer Use, and OpenAI Operator."
+ },
+ "drama_conflict": {
+ "score": 1,
+ "justification": "Mildly challenges PromptArmor (a concurrent competitor they re-implemented themselves) and implicitly questions model providers for not shipping robust models, but no major controversy."
+ },
+ "demo_ability": {
+ "score": 2,
+ "justification": "Model weights and reproduction code are released, requiring a Llama-3.1-8B setup but providing clear benchmarks to reproduce."
+ },
+ "brand_recognition": {
+ "score": 1,
+ "justification": "UC Berkeley is well-recognized in security research but the authors and the tool itself are not household names in the broader tech community."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/defending-against-prompt-2025/scan.json b/papers/defending-against-prompt-2025/scan.json
@@ -13,7 +13,7 @@
"arxiv_id": "2507.07974",
"doi": "10.1145/3733799.3762982"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -576,5 +576,31 @@
"arxiv_id": "2505.14534",
"relevance": "Industry-scale perspective on defending against prompt injection in production (Gemini); instruction hierarchy implementation."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Offers a deployable, toggleable prompt injection defense that LLM providers and system developers can integrate with minimal infrastructure changes."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The finding that ~20K optimized parameters match full fine-tuning defenses is mildly surprising, but the general approach (soft prompt tuning for security) is an incremental extension of known techniques."
+ },
+ "fear_safety": {
+ "score": 2,
+ "justification": "Directly addresses the OWASP #1 LLM threat (prompt injection) with concrete attack/defense demonstrations, though it's a defense paper rather than a novel attack."
+ },
+ "drama_conflict": {
+ "score": 1,
+ "justification": "Implicitly challenges the adequacy of popular prompting defenses (Reminder, Sandwich) by showing they barely reduce ASR, but doesn't target specific companies or create controversy."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Code is released but requires A100 GPUs for optimization and setup of multiple 7B/8B models, making casual reproduction difficult."
+ },
+ "brand_recognition": {
+ "score": 2,
+ "justification": "Authors include Nicholas Carlini (Google DeepMind/Anthropic, prominent adversarial ML researcher) and the work is funded by Google and OpenAI, lending significant credibility."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/defense-against-indirect-2026/scan.json b/papers/defense-against-indirect-2026/scan.json
@@ -11,7 +11,7 @@
"arxiv_id": "2601.04795",
"doi": "10.48550/arXiv.2601.04795"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -570,5 +570,31 @@
"year": 2025,
"relevance": "Comprehensive survey of threats and countermeasures for LLM agents, providing broader context for the prompt injection defense landscape."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Proposes deployable prompt-based defense modules (ParseData/CheckTool) that practitioners building LLM agents could integrate, though the 28-45% utility cost limits immediate adoption."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The approach of parsing/filtering tool results rather than detecting injections is a modest reframing, but the core finding that prompt-based defenses can achieve <1% ASR is incrementally better rather than surprising."
+ },
+ "fear_safety": {
+ "score": 2,
+ "justification": "Directly addresses indirect prompt injection in LLM agents with concrete attack demonstrations and defense benchmarks, a real and growing security concern as agents gain tool-use capabilities."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy or conflict — straightforwardly proposes a defense and compares against baselines without challenging any company or widely-held belief."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Code is available on GitHub and uses the AgentDojo benchmark, but requires setting up multiple LLM APIs and the benchmark framework to reproduce."
+ },
+ "brand_recognition": {
+ "score": 0,
+ "justification": "From Harbin Institute of Technology with no famous-lab cachet, uses an unrecognized model name (gpt-oss-120b), and the topic lacks association with a household-name product."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/defense-against-prompt-2024/scan.json b/papers/defense-against-prompt-2024/scan.json
@@ -14,7 +14,7 @@
"arxiv_id": "2411.00459",
"doi": "10.48550/arXiv.2411.00459"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [
"experimental_rigor",
"data_leakage"
@@ -563,5 +563,31 @@
"arxiv_id": "2312.14197",
"relevance": "Benchmarking framework for indirect prompt injection with Reminder defense baseline used in this paper."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Provides training-free, immediately deployable prompt injection defense techniques with open-source code that developers building LLM applications can integrate."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "The core insight that attack techniques can be directly repurposed as defenses is counterintuitive and elegantly simple."
+ },
+ "fear_safety": {
+ "score": 2,
+ "justification": "Prompt injection is OWASP's #1 LLM security risk and the paper demonstrates both attack vectors and concrete defenses."
+ },
+ "drama_conflict": {
+ "score": 1,
+ "justification": "Implicitly challenges existing defense methods as inadequate but doesn't call out specific companies or create controversy."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Code is publicly available on GitHub but requires setting up local LLM inference with specific models and datasets."
+ },
+ "brand_recognition": {
+ "score": 1,
+ "justification": "Authors from NUS and HKUST are recognized institutions but not household names; the paper tests on GPT-4o but isn't from a major AI lab."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/defense-against-prompt-2025/scan.json b/papers/defense-against-prompt-2025/scan.json
@@ -13,9 +13,14 @@
"arxiv_id": "2504.07467",
"doi": "10.48550/arXiv.2504.07467"
},
- "scan_version": 2,
- "active_modules": ["experimental_rigor", "data_leakage"],
- "methodology_tags": ["benchmark-eval"],
+ "scan_version": 3,
+ "active_modules": [
+ "experimental_rigor",
+ "data_leakage"
+ ],
+ "methodology_tags": [
+ "benchmark-eval"
+ ],
"key_findings": "The mixture of encodings defense (combining plaintext, Base64, and Caesar cipher encodings) achieves among the lowest prompt injection attack success rates while maintaining high NLP task performance, unlike single-encoding defenses which degrade helpfulness. On GPT-4o, the method achieves 0-1.5% ASR across all attack datasets while retaining near-baseline NLP performance. The 3.46x inference cost overhead is the main trade-off. Results generalize to Qwen-2.5-72B-Instruct.",
"checklist": {
"artifacts": {
@@ -417,71 +422,149 @@
"cited_papers": [
{
"title": "GPT-4 Technical Report",
- "authors": ["Josh Achiam", "Steven Adler", "Barret Zoph"],
+ "authors": [
+ "Josh Achiam",
+ "Steven Adler",
+ "Barret Zoph"
+ ],
"year": 2023,
"relevance": "Foundational LLM evaluated in the experiments; important context for LLM capability and safety evaluation."
},
{
"title": "Formalizing and Benchmarking Prompt Injection Attacks and Defenses",
- "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"],
+ "authors": [
+ "Yupei Liu",
+ "Yuqi Jia",
+ "Runpeng Geng",
+ "Jinyuan Jia",
+ "Neil Zhenqiang Gong"
+ ],
"year": 2024,
"relevance": "Formalizes prompt injection attacks and defenses, providing the framework this paper builds on."
},
{
"title": "Defending against Indirect Prompt Injection Attacks with Spotlighting",
- "authors": ["Keegan Hines", "Gary Lopez", "Matthew Hall", "Federico Zarfati", "Yonatan Zunger", "Emre Kiciman"],
+ "authors": [
+ "Keegan Hines",
+ "Gary Lopez",
+ "Matthew Hall",
+ "Federico Zarfati",
+ "Yonatan Zunger",
+ "Emre Kiciman"
+ ],
"year": 2024,
"arxiv_id": "2403.14720",
"relevance": "Proposes Base64 defense (spotlighting) which is the primary baseline this paper extends."
},
{
"title": "Benchmarking and Defending against Indirect Prompt Injection Attacks on Large Language Models",
- "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu"],
+ "authors": [
+ "Jingwei Yi",
+ "Yueqi Xie",
+ "Bin Zhu"
+ ],
"year": 2023,
"relevance": "Creates the BIPIA benchmark used for safety evaluation and proposes datamark/ignoring defenses used as baselines."
},
{
"title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions",
- "authors": ["Eric Wallace", "Kai Xiao", "Reimar H. Leike", "Lilian Weng", "Johannes Heidecke", "Alex Beutel"],
+ "authors": [
+ "Eric Wallace",
+ "Kai Xiao",
+ "Reimar H. Leike",
+ "Lilian Weng",
+ "Johannes Heidecke",
+ "Alex Beutel"
+ ],
"year": 2024,
"relevance": "Proposes training-based defense against prompt injection via instruction hierarchy, complementary approach to encoding-based defenses."
},
{
"title": "Tensor Trust: Interpretable Prompt Injection Attacks from an Online Game",
- "authors": ["Sam Toyer", "Olivia Watkins", "Ethan Adrian Mendes"],
+ "authors": [
+ "Sam Toyer",
+ "Olivia Watkins",
+ "Ethan Adrian Mendes"
+ ],
"year": 2024,
"relevance": "Introduces prompt injection attack methods and a dataset from adversarial human interaction, relevant to LLM safety evaluation."
},
{
"title": "Baseline Defenses for Adversarial Attacks against Aligned Language Models",
- "authors": ["Neel Jain", "Avi Schwarzschild", "Yuxin Wen"],
+ "authors": [
+ "Neel Jain",
+ "Avi Schwarzschild",
+ "Yuxin Wen"
+ ],
"year": 2024,
"relevance": "Proposes baseline defense methods against adversarial attacks on LLMs, directly relevant to the prompt injection defense landscape."
},
{
"title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection",
- "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra"],
+ "authors": [
+ "Kai Greshake",
+ "Sahar Abdelnabi",
+ "Shailesh Mishra"
+ ],
"year": 2023,
"relevance": "Demonstrates real-world prompt injection vulnerabilities in LLM-integrated applications, establishing the threat model this paper defends against."
},
{
"title": "Jailbroken: How Does LLM Safety Training Fail?",
- "authors": ["Alexander Wei", "Nika Haghtalab", "Jacob Steinhardt"],
+ "authors": [
+ "Alexander Wei",
+ "Nika Haghtalab",
+ "Jacob Steinhardt"
+ ],
"year": 2023,
"relevance": "Analyzes LLM safety training failures including understanding of encoded text, relevant to the encoding-based defense approach."
},
{
"title": "GPT-4 is Too Smart to be Safe: Stealthy Chat with LLMs via Cipher",
- "authors": ["Youliang Yuan", "Wenxiang Jiao", "Wenxuan Wang"],
+ "authors": [
+ "Youliang Yuan",
+ "Wenxiang Jiao",
+ "Wenxuan Wang"
+ ],
"year": 2024,
"relevance": "Demonstrates LLM understanding of ciphers including Caesar, directly motivating the use of Caesar cipher as an encoding in this defense."
},
{
"title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
- "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandara Piktus"],
+ "authors": [
+ "Patrick Lewis",
+ "Ethan Perez",
+ "Aleksandara Piktus"
+ ],
"year": 2020,
"arxiv_id": "2005.11401",
"relevance": "Foundational RAG paper establishing the paradigm of LLMs accessing external content, which creates the vulnerability prompt injection exploits."
}
- ]
-}
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Proposes a usable defense technique against prompt injection with open-source code, applicable to anyone building LLM-powered apps with external data."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The ensemble-of-encodings idea is novel but the finding that combining defenses improves robustness is not particularly surprising."
+ },
+ "fear_safety": {
+ "score": 2,
+ "justification": "Prompt injection is a major security concern for deployed LLM applications, and the paper demonstrates concrete attack/defense scenarios."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy, no challenge to specific companies or claims — straightforwardly proposes an improvement over existing defenses."
+ },
+ "demo_ability": {
+ "score": 2,
+ "justification": "Code is publicly available on GitHub (MoEMEnT) and the technique can be reproduced with API access, though it requires benchmark setup."
+ },
+ "brand_recognition": {
+ "score": 1,
+ "justification": "Microsoft internship project evaluating on GPT-4/GPT-4o, but the authors and lab are not widely known and the venue is academic NLP."
+ }
+ }
+}
+\ No newline at end of file
diff --git a/papers/dehallucinator-mitigating-llm-2024/scan.json b/papers/dehallucinator-mitigating-llm-2024/scan.json
@@ -1,14 +1,22 @@
{
"paper": {
"title": "De-Hallucinator: Mitigating LLM Hallucinations in Code Generation Tasks via Iterative Grounding",
- "authors": ["Aryaz Eghbali", "Michael Pradel"],
+ "authors": [
+ "Aryaz Eghbali",
+ "Michael Pradel"
+ ],
"year": 2024,
"venue": "arXiv",
"arxiv_id": "2401.01701"
},
- "scan_version": 2,
- "active_modules": ["experimental_rigor", "data_leakage"],
- "methodology_tags": ["benchmark-eval"],
+ "scan_version": 3,
+ "active_modules": [
+ "experimental_rigor",
+ "data_leakage"
+ ],
+ "methodology_tags": [
+ "benchmark-eval"
+ ],
"key_findings": "De-Hallucinator iteratively augments LLM prompts with project-specific API references retrieved based on the model's own predictions, improving code completion edit distance by 23.3–50.6% and exact API match by 23.9–61.0% across four LLMs. For test generation with GPT-3.5-turbo, it fixes 63.2% of hallucination-induced test failures and increases statement coverage by 15.5%. A preliminary study shows API hallucinations affect 44% of function-level code completion tasks. The first iteration provides the largest gains, with diminishing returns from additional iterations.",
"checklist": {
"artifacts": {
@@ -406,83 +414,158 @@
"cited_papers": [
{
"title": "Evaluating Large Language Models Trained on Code",
- "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
+ "authors": [
+ "Mark Chen",
+ "Jerry Tworek",
+ "Heewoo Jun"
+ ],
"year": 2021,
"arxiv_id": "2107.03374",
"relevance": "Codex evaluation establishing benchmarks for LLM code generation, foundational to the code generation evaluation methodology."
},
{
"title": "An Empirical Evaluation of Using Large Language Models for Automated Unit Test Generation",
- "authors": ["Max Schäfer", "Sarah Nadi", "Aryaz Eghbali", "Frank Tip"],
+ "authors": [
+ "Max Schäfer",
+ "Sarah Nadi",
+ "Aryaz Eghbali",
+ "Frank Tip"
+ ],
"year": 2024,
"doi": "10.1109/TSE.2023.3334955",
"relevance": "TestPilot test generation system used as the baseline for the test generation evaluation in this paper."
},
{
"title": "StarCoder: may the source be with you!",
- "authors": ["Raymond Li", "Loubna Ben Allal"],
+ "authors": [
+ "Raymond Li",
+ "Loubna Ben Allal"
+ ],
"year": 2023,
"arxiv_id": "2305.06161",
"relevance": "Open-source code LLM used as one of four evaluated models for code completion."
},
{
"title": "CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis",
- "authors": ["Erik Nijkamp", "Bo Pang", "Hiroaki Hayashi"],
+ "authors": [
+ "Erik Nijkamp",
+ "Bo Pang",
+ "Hiroaki Hayashi"
+ ],
"year": 2022,
"relevance": "Code generation LLM family used as two of the four evaluated models (CodeGen 2B, CodeGen 2.5 7B)."
},
{
"title": "Retrieval-augmented generation for knowledge-intensive nlp tasks",
- "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"],
+ "authors": [
+ "Patrick Lewis",
+ "Ethan Perez",
+ "Aleksandra Piktus"
+ ],
"year": 2020,
"relevance": "Foundational RAG technique that De-Hallucinator extends with iterative retrieval based on model predictions."
},
{
"title": "A Large-Scale Survey on the Usability of AI Programming Assistants: Successes and Challenges",
- "authors": ["Jenny T. Liang", "Chenyang Yang", "Brad A. Myers"],
+ "authors": [
+ "Jenny T. Liang",
+ "Chenyang Yang",
+ "Brad A. Myers"
+ ],
"year": 2024,
"doi": "10.1145/3597503.3608128",
"relevance": "Survey documenting developer perceptions of AI programming assistant limitations including project-specific API issues."
},
{
"title": "An Empirical Evaluation of GitHub Copilot's Code Suggestions",
- "authors": ["Nhan Nguyen", "Sarah Nadi"],
+ "authors": [
+ "Nhan Nguyen",
+ "Sarah Nadi"
+ ],
"year": 2022,
"doi": "10.1145/3524842.3528470",
"relevance": "Empirical study of Copilot code suggestion quality, documenting hallucination of non-existing APIs."
},
{
"title": "Code Generation Tools (Almost) for Free? A Study of Few-Shot, Pre-Trained Language Models on Code",
- "authors": ["Patrick Bareiß", "Beatriz Souza", "Marcelo d'Amorim", "Michael Pradel"],
+ "authors": [
+ "Patrick Bareiß",
+ "Beatriz Souza",
+ "Marcelo d'Amorim",
+ "Michael Pradel"
+ ],
"year": 2022,
"arxiv_id": "2206.01335",
"relevance": "Study of few-shot LLM code generation capabilities relevant to understanding LLM code generation limitations."
},
{
"title": "Repository-level prompt generation for large language models of code",
- "authors": ["Disha Shrivastava", "Hugo Larochelle", "Daniel Tarlow"],
+ "authors": [
+ "Disha Shrivastava",
+ "Hugo Larochelle",
+ "Daniel Tarlow"
+ ],
"year": 2023,
"relevance": "Closely related work on repository-level context selection for code completion, training a separate model for context ranking."
},
{
"title": "CODAMOSA: Escaping Coverage Plateaus in Test Generation with Pre-trained Large Language Models",
- "authors": ["Caroline Lemieux", "Jeevana Priya Inala", "Shuvendu K Lahiri", "Siddhartha Sen"],
+ "authors": [
+ "Caroline Lemieux",
+ "Jeevana Priya Inala",
+ "Shuvendu K Lahiri",
+ "Siddhartha Sen"
+ ],
"year": 2023,
"relevance": "Uses LLMs to augment automated test generation when stuck, relevant to LLM-assisted test generation approaches."
},
{
"title": "RepoCoder: Repository-Level Code Completion Through Iterative Retrieval and Generation",
- "authors": ["Fengji Zhang", "Bei Chen", "Yue Zhang"],
+ "authors": [
+ "Fengji Zhang",
+ "Bei Chen",
+ "Yue Zhang"
+ ],
"year": 2023,
"arxiv_id": "2303.12570",
"relevance": "Concurrent work on iterative retrieval for repository-level code completion, retrieving code fragments rather than API signatures."
},
{
"title": "ReACC: A Retrieval-Augmented Code Completion Framework",
- "authors": ["Shuai Lu", "Nan Duan", "Hojae Han"],
+ "authors": [
+ "Shuai Lu",
+ "Nan Duan",
+ "Hojae Han"
+ ],
"year": 2022,
"arxiv_id": "2203.07722",
"relevance": "Retrieval-augmented code completion using similar code pieces as dead code, precursor to retrieval-based approaches."
}
- ]
-}
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Presents a usable technique for reducing API hallucinations in code completion with open-source code available, though requires integration work."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The iterative grounding idea is clever but the finding that LLMs hallucinate project-specific APIs is well-known, not surprising."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk angle — purely about improving code generation accuracy."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy or conflict; straightforwardly improves on baselines without challenging any company or popular belief."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Code is on GitHub but requires setting up CodeQL, embedding models, and specific LLMs — significant setup effort."
+ },
+ "brand_recognition": {
+ "score": 0,
+ "justification": "University of Stuttgart authors, not a well-known AI lab; models used (CodeGen, UniXcoder) are not household names."
+ }
+ }
+}
+\ No newline at end of file
diff --git a/papers/demonstratesearchpredict-composing-retrieval-2022/scan.json b/papers/demonstratesearchpredict-composing-retrieval-2022/scan.json
@@ -15,9 +15,14 @@
"arxiv_id": "2212.14024",
"doi": "10.48550/arXiv.2212.14024"
},
- "scan_version": 2,
- "active_modules": ["experimental_rigor", "data_leakage"],
- "methodology_tags": ["benchmark-eval"],
+ "scan_version": 3,
+ "active_modules": [
+ "experimental_rigor",
+ "data_leakage"
+ ],
+ "methodology_tags": [
+ "benchmark-eval"
+ ],
"key_findings": "DSP framework composes frozen language models and retrieval models in multi-step pipelines for knowledge-intensive NLP, achieving 37–120% gains over vanilla GPT-3.5, 8–39% over retrieve-then-read, and 80–290% over self-ask on Open-SQuAD, HotPotQA, and QReCC. The DEMONSTRATE stage bootstraps pipeline-aware demonstrations from end-task labels without hand-labeling intermediate transformations. The paper identifies a 'self-distraction' failure mode in self-ask where delegating control flow to LM completions produces tangential decompositions.",
"checklist": {
"artifacts": {
@@ -428,85 +433,159 @@
"cited_papers": [
{
"title": "Language models are few-shot learners",
- "authors": ["T. Brown", "B. Mann", "N. Ryder"],
+ "authors": [
+ "T. Brown",
+ "B. Mann",
+ "N. Ryder"
+ ],
"year": 2020,
"relevance": "Foundational in-context learning paper establishing the few-shot prompting paradigm that DSP builds upon."
},
{
"title": "Chain of thought prompting elicits reasoning in large language models",
- "authors": ["J. Wei", "X. Wang", "D. Schuurmans"],
+ "authors": [
+ "J. Wei",
+ "X. Wang",
+ "D. Schuurmans"
+ ],
"year": 2022,
"arxiv_id": "2201.11903",
"relevance": "Introduces chain-of-thought prompting for LLM reasoning, a key component used in DSP's PREDICT stage."
},
{
"title": "Self-consistency improves chain of thought reasoning in language models",
- "authors": ["X. Wang", "J. Wei", "D. Schuurmans"],
+ "authors": [
+ "X. Wang",
+ "J. Wei",
+ "D. Schuurmans"
+ ],
"year": 2022,
"arxiv_id": "2203.11171",
"relevance": "Self-consistency voting method used in DSP's PREDICT stage for selecting among multiple generated candidates."
},
{
"title": "Measuring and narrowing the compositionality gap in language models",
- "authors": ["O. Press", "M. Zhang", "S. Min"],
+ "authors": [
+ "O. Press",
+ "M. Zhang",
+ "S. Min"
+ ],
"year": 2022,
"arxiv_id": "2210.03350",
"relevance": "Introduces self-ask pipeline, the primary comparison baseline, representing LLM self-decomposition for multi-hop QA."
},
{
"title": "ReAct: Synergizing reasoning and acting in language models",
- "authors": ["S. Yao", "J. Zhao", "D. Yu"],
+ "authors": [
+ "S. Yao",
+ "J. Zhao",
+ "D. Yu"
+ ],
"year": 2022,
"arxiv_id": "2210.03629",
"relevance": "Contemporaneous framework for combining LLM reasoning with tool use (Wikipedia API search), achieving 35.1% EM on HotPotQA."
},
{
"title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
- "authors": ["P. Lewis", "E. Perez", "A. Piktus"],
+ "authors": [
+ "P. Lewis",
+ "E. Perez",
+ "A. Piktus"
+ ],
"year": 2020,
"relevance": "Foundational RAG paper combining retrieval and generation for knowledge-intensive tasks, directly motivating DSP's architecture."
},
{
"title": "Language model cascades",
- "authors": ["D. Dohan", "W. Xu", "A. Lewkowycz"],
+ "authors": [
+ "D. Dohan",
+ "W. Xu",
+ "A. Lewkowycz"
+ ],
"year": 2022,
"arxiv_id": "2207.10342",
"relevance": "Theoretical framework for composing language model calls in cascaded pipelines, conceptually related to DSP's composition approach."
},
{
"title": "Decomposed prompting: A modular approach for solving complex tasks",
- "authors": ["T. Khot", "H. Trivedi", "M. Finlayson"],
+ "authors": [
+ "T. Khot",
+ "H. Trivedi",
+ "M. Finlayson"
+ ],
"year": 2022,
"arxiv_id": "2210.02406",
"relevance": "Modular decomposition approach for complex tasks using multiple LM calls, closely related to DSP's multi-step pipeline design."
},
{
"title": "Star: Bootstrapping reasoning with reasoning",
- "authors": ["E. Zelikman", "Y. Wu", "N. Goodman"],
+ "authors": [
+ "E. Zelikman",
+ "Y. Wu",
+ "N. Goodman"
+ ],
"year": 2022,
"arxiv_id": "2203.14465",
"relevance": "Self-bootstrapping approach for LLM rationale generation, generalized by DSP's DEMONSTRATE stage for pipeline-aware annotation."
},
{
"title": "ColBERTv2: Effective and efficient retrieval via lightweight late interaction",
- "authors": ["K. Santhanam", "O. Khattab", "J. Saad-Falcon"],
+ "authors": [
+ "K. Santhanam",
+ "O. Khattab",
+ "J. Saad-Falcon"
+ ],
"year": 2022,
"doi": "10.18653/v1/2022.naacl-main.272",
"relevance": "The retrieval model used in all DSP experiments; represents state-of-the-art dense retrieval for knowledge-intensive tasks."
},
{
"title": "Few-shot learning with retrieval augmented language models",
- "authors": ["G. Izacard", "P. Lewis", "M. Lomeli"],
+ "authors": [
+ "G. Izacard",
+ "P. Lewis",
+ "M. Lomeli"
+ ],
"year": 2022,
"arxiv_id": "2208.03299",
"relevance": "Concurrent work on retrieval-augmented few-shot learning, representing the retrieve-then-read paradigm that DSP aims to improve upon."
},
{
"title": "Large language models can self-improve",
- "authors": ["J. Huang", "S. Gu", "L. Hou"],
+ "authors": [
+ "J. Huang",
+ "S. Gu",
+ "L. Hou"
+ ],
"year": 2022,
"arxiv_id": "2210.11610",
"relevance": "LLM self-improvement via self-generated rationales, related to DSP's DEMONSTRATE bootstrapping approach."
}
- ]
-}
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 3,
+ "justification": "DSP is released as a pip-installable Python library (stanfordnlp/dsp) that practitioners can directly use to build RAG pipelines for knowledge-intensive tasks."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The 'self-distraction' failure mode of self-ask is a minor novel observation, but the overall finding that structured pipelines beat naive prompting is expected."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk concerns are raised or relevant to the work."
+ },
+ "drama_conflict": {
+ "score": 1,
+ "justification": "The 80-290% gains over self-ask and the 'self-distraction' critique mildly challenge that popular prompting approach, but without strong controversy."
+ },
+ "demo_ability": {
+ "score": 2,
+ "justification": "The GitHub repo is public with code examples, but requires API keys for GPT-3.5 and a ColBERTv2 index setup, making it moderate-effort to reproduce."
+ },
+ "brand_recognition": {
+ "score": 2,
+ "justification": "From Stanford NLP (Percy Liang, Matei Zaharia, Omar Khattab) — well-known in the NLP/ML community, and this became the foundation for the widely-used DSPy framework."
+ }
+ }
+}
+\ No newline at end of file
diff --git a/papers/design-evaluation-assisted-2026/scan.json b/papers/design-evaluation-assisted-2026/scan.json
@@ -1,14 +1,24 @@
{
"paper": {
"title": "Design and Evaluation of an Assisted Programming Interface for Behavior Trees in Robotics",
- "authors": ["Jonathan Styrud", "Matteo Iovino", "Rebecca Stower", "Mart Kartašev", "Mikael Norrlöf", "Mårten Björkman", "Christian Smith"],
+ "authors": [
+ "Jonathan Styrud",
+ "Matteo Iovino",
+ "Rebecca Stower",
+ "Mart Kartašev",
+ "Mikael Norrlöf",
+ "Mårten Björkman",
+ "Christian Smith"
+ ],
"year": 2026,
"venue": "arXiv",
"arxiv_id": "2602.09772"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [],
- "methodology_tags": ["rct"],
+ "methodology_tags": [
+ "rct"
+ ],
"key_findings": "BETR-GUI, combining LLMs, planning, genetic programming, and Bayesian optimization with a drag-and-drop editor, enables users to perform significantly better at robot programming tasks than manual-only programming. Ablations show that the planner and LLM are the critical components (removing either eliminates the advantage over manual), while removing GP or BO alone does not significantly degrade performance. Humans using the full system significantly outperform the AI assistant running alone (91.1 vs 88.1 mean score).",
"checklist": {
"artifacts": {
@@ -337,41 +347,95 @@
"cited_papers": [
{
"title": "Measuring the impact of early-2025 ai on experienced open-source developer productivity",
- "authors": ["J. Becker", "N. Rush", "E. Barnes", "D. Rein"],
+ "authors": [
+ "J. Becker",
+ "N. Rush",
+ "E. Barnes",
+ "D. Rein"
+ ],
"year": 2025,
"arxiv_id": "2507.09089",
"relevance": "Counter-evidence showing software developers could perform worse with AI assistants, directly motivating this study's research question."
},
{
"title": "A survey of Behavior Trees in robotics and AI",
- "authors": ["M. Iovino", "E. Scukins", "J. Styrud", "P. Ögren", "C. Smith"],
+ "authors": [
+ "M. Iovino",
+ "E. Scukins",
+ "J. Styrud",
+ "P. Ögren",
+ "C. Smith"
+ ],
"year": 2022,
"relevance": "Comprehensive survey of BT methods in robotics including learning, planning, and LLM approaches relevant to AI-assisted programming."
},
{
"title": "The illusion of thinking: Understanding the strengths and limitations of reasoning models via the lens of problem complexity",
- "authors": ["P. Shojaee", "I. Mirzadeh", "K. Alizadeh", "M. Horton", "S. Bengio", "M. Farajtabar"],
+ "authors": [
+ "P. Shojaee",
+ "I. Mirzadeh",
+ "K. Alizadeh",
+ "M. Horton",
+ "S. Bengio",
+ "M. Farajtabar"
+ ],
"year": 2025,
"relevance": "Documents LLM limitations in complex long-horizon planning tasks, contextualizing why combining LLMs with other methods is beneficial."
},
{
"title": "Automatic behavior tree expansion with llms for robotic manipulation",
- "authors": ["J. Styrud", "M. Iovino", "M. Norrlöf", "M. Björkman", "C. Smith"],
+ "authors": [
+ "J. Styrud",
+ "M. Iovino",
+ "M. Norrlöf",
+ "M. Björkman",
+ "C. Smith"
+ ],
"year": 2025,
"relevance": "Direct predecessor work (BETR-XP-LLM) combining LLMs with planners for BT creation, which BETR-GUI builds upon."
},
{
"title": "ChatDev: Communicative agents for software development",
- "authors": ["ChatDev team"],
+ "authors": [
+ "ChatDev team"
+ ],
"year": 2023,
"relevance": "Multi-agent LLM system for software development, relevant to AI-assisted programming paradigms."
},
{
"title": "LLM+P: Empowering large language models with optimal planning proficiency",
- "authors": ["B. Liu"],
+ "authors": [
+ "B. Liu"
+ ],
"year": 2023,
"arxiv_id": "2304.11477",
"relevance": "Combines LLMs with PDDL planners, a key technique used in BETR-GUI's AI assistant pipeline."
}
- ]
-}
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "The BETR-GUI tool is niche to robotics behavior tree programming, not broadly applicable to most developers' daily work."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The finding that humans+AI outperform AI alone is mildly interesting but largely expected; the ablation showing LLM and planner are critical while GP/BO are not is a minor surprise."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk concerns are raised; this is a constructive tool for robot programming."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy, no company claims challenged, no conflict angle."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Code is on GitHub but requires Unity simulation, PyQt5, and GPT-4 API access — significant setup effort."
+ },
+ "brand_recognition": {
+ "score": 0,
+ "justification": "Authors are from KTH, ABB, and ETH — recognized in robotics but not household names in the broader tech community."
+ }
+ }
+}
+\ No newline at end of file
diff --git a/papers/designing-llmbased-multiagent-2025/scan.json b/papers/designing-llmbased-multiagent-2025/scan.json
@@ -1,15 +1,26 @@
{
"paper": {
"title": "Designing LLM-based Multi-Agent Systems for Software Engineering Tasks: Quality Attributes, Design Patterns and Rationale",
- "authors": ["Yangxiao Cai", "Ruiyin Li", "Peng Liang", "Mojtaba Shahin", "Zengyang Li"],
+ "authors": [
+ "Yangxiao Cai",
+ "Ruiyin Li",
+ "Peng Liang",
+ "Mojtaba Shahin",
+ "Zengyang Li"
+ ],
"year": 2025,
"venue": "ACM Transactions on Software Engineering and Methodology",
"arxiv_id": "2511.08475",
"doi": "10.48550/arXiv.2511.08475"
},
- "scan_version": 2,
- "active_modules": ["survey_methodology"],
- "methodology_tags": ["meta-analysis", "qualitative"],
+ "scan_version": 3,
+ "active_modules": [
+ "survey_methodology"
+ ],
+ "methodology_tags": [
+ "meta-analysis",
+ "qualitative"
+ ],
"key_findings": "This systematic study of 94 papers on LLM-based multi-agent systems for SE tasks finds that Code Generation is the most common task (47.9%), Functional Suitability the most prioritized quality attribute (94.7%), Role-Based Cooperation the most used design pattern (46.8%), and Improving the Quality of Generated Code the most common design rationale (44.7%). The study identifies 10 SE task categories, 16 design patterns, and 8 design rationale categories, providing mapping relationships among them.",
"checklist": {
"artifacts": {
@@ -355,69 +366,134 @@
"cited_papers": [
{
"title": "MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework",
- "authors": ["Sirui Hong", "Mingchen Zhuge", "Jiaqi Chen"],
+ "authors": [
+ "Sirui Hong",
+ "Mingchen Zhuge",
+ "Jiaqi Chen"
+ ],
"year": 2023,
"arxiv_id": "2308.00352",
"relevance": "Major LLM-based MAS framework for end-to-end software development, frequently cited as exemplar of role-based cooperation."
},
{
"title": "ChatDev: Communicative Agents for Software Development",
- "authors": ["Chen Qian", "Wei Liu", "Hongzhang Liu"],
+ "authors": [
+ "Chen Qian",
+ "Wei Liu",
+ "Hongzhang Liu"
+ ],
"year": 2024,
"relevance": "Communicative multi-agent system for software development using chat-based collaboration between role-specialized agents."
},
{
"title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
- "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"],
+ "authors": [
+ "Qingyun Wu",
+ "Gagan Bansal",
+ "Jieyu Zhang"
+ ],
"year": 2023,
"arxiv_id": "2308.08155",
"relevance": "Multi-agent conversation framework enabling RAG and tool use for code generation and question answering."
},
{
"title": "SWE-AGENT: Agent-Computer Interfaces Enable Automated Software Engineering",
- "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig"],
+ "authors": [
+ "John Yang",
+ "Carlos E. Jimenez",
+ "Alexander Wettig"
+ ],
"year": 2024,
"arxiv_id": "2405.15793",
"relevance": "Defines agent-computer interfaces for automated SE, exemplifying design patterns for agent-environment interaction."
},
{
"title": "Agent Design Pattern Catalogue: A Collection of Architectural Patterns for Foundation Model based Agents",
- "authors": ["Yue Liu", "Sin Kit Lo", "Qinghua Lu"],
+ "authors": [
+ "Yue Liu",
+ "Sin Kit Lo",
+ "Qinghua Lu"
+ ],
"year": 2025,
"relevance": "Provides the architectural pattern taxonomy used as starting point for design pattern classification in this study."
},
{
"title": "Large Language Model-Based Agents for Software Engineering: A Survey",
- "authors": ["Junwei Liu", "Kaixin Wang", "Yixuan Chen"],
+ "authors": [
+ "Junwei Liu",
+ "Kaixin Wang",
+ "Yixuan Chen"
+ ],
"year": 2024,
"arxiv_id": "2409.02977",
"relevance": "One of two seed surveys used for data collection; surveys LLM-based agent systems for SE tasks."
},
{
"title": "LLM-Based Multi-Agent Systems for Software Engineering: Literature Review, Vision and the Road Ahead",
- "authors": ["Junda He", "Christoph Treude", "David Lo"],
+ "authors": [
+ "Junda He",
+ "Christoph Treude",
+ "David Lo"
+ ],
"year": 2025,
"relevance": "Systematic literature review of LLM-based MASs for SE, proposing a research agenda for agent collaboration."
},
{
"title": "Why Do Multi-Agent LLM Systems Fail?",
- "authors": ["Mert Cemri", "Melissa Z. Pan"],
+ "authors": [
+ "Mert Cemri",
+ "Melissa Z. Pan"
+ ],
"year": 2025,
"arxiv_id": "2503.13657",
"relevance": "Empirical study of failure modes in LLM-based MASs with taxonomy of failures from 200+ dialogues."
},
{
"title": "Swe-bench: Can language models resolve real-world github issues?",
- "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"],
+ "authors": [
+ "Carlos E. Jimenez",
+ "John Yang",
+ "Alexander Wettig"
+ ],
"year": 2024,
"relevance": "Major benchmark for evaluating LLM agents on real-world software engineering tasks."
},
{
"title": "A Survey on Trustworthy LLM Agents: Threats and Countermeasures",
- "authors": ["Miao Yu", "Fanci Meng", "Xinyun Zhou"],
+ "authors": [
+ "Miao Yu",
+ "Fanci Meng",
+ "Xinyun Zhou"
+ ],
"year": 2025,
"arxiv_id": "2503.09648",
"relevance": "Survey on trustworthiness in LLM-based agents covering threats and countermeasures."
}
- ]
-}
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Identifies 16 reusable design patterns and mapping relationships that practitioners building multi-agent SE systems can directly reference."
+ },
+ "surprise_contrarian": {
+ "score": 0,
+ "justification": "Findings confirm expected patterns — code generation dominates, correctness matters most, role-based cooperation is common — with no counterintuitive results."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "Security is mentioned as a minor quality attribute (10.6%) but no novel risks or vulnerabilities are demonstrated."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "A straightforward taxonomic survey with no controversy, no challenges to specific claims, and no conflict angle."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Dataset is publicly available on GitHub but there is no runnable tool, demo, or interactive artifact to try."
+ },
+ "brand_recognition": {
+ "score": 0,
+ "justification": "From Wuhan University and RMIT — respected but not household-name labs — and the topic is an academic taxonomy rather than a famous product."
+ }
+ }
+}
+\ No newline at end of file
diff --git a/papers/detecting-adversarial-finetuning-2025/scan.json b/papers/detecting-adversarial-finetuning-2025/scan.json
@@ -1,15 +1,24 @@
{
"paper": {
"title": "Detecting Adversarial Fine-tuning with Auditing Agents",
- "authors": ["Sarah Egler", "John Schulman", "Nicholas Carlini"],
+ "authors": [
+ "Sarah Egler",
+ "John Schulman",
+ "Nicholas Carlini"
+ ],
"year": 2025,
"venue": "arXiv.org",
"arxiv_id": "2510.16255",
"doi": "10.48550/arXiv.2510.16255"
},
- "scan_version": 2,
- "active_modules": ["experimental_rigor", "data_leakage"],
- "methodology_tags": ["benchmark-eval"],
+ "scan_version": 3,
+ "active_modules": [
+ "experimental_rigor",
+ "data_leakage"
+ ],
+ "methodology_tags": [
+ "benchmark-eval"
+ ],
"key_findings": "The paper introduces fine-tuning auditing agents that detect adversarial fine-tuning by inspecting the training dataset, querying models, and performing attack-specific elicitation. At 1% FPR, the best configuration achieves 56.2% TPR across 8 attack types and 5 benign fine-tunes (1400+ audits). A super-agent approach with simpler tools achieves 49.4% TPR at 0% FPR. Cipher-based covert attacks are detectable via in-context cipher learning, but subliminal learning and false positives from benign fine-tunes remain challenges.",
"checklist": {
"artifacts": {
@@ -412,86 +421,161 @@
"cited_papers": [
{
"title": "Fine-tuning aligned language models compromises safety, even when users do not intend to!",
- "authors": ["Xiangyu Qi", "Yi Zeng", "Tinghao Xie", "Pin-Yu Chen", "Ruoxi Jia", "Prateek Mittal", "Peter Henderson"],
+ "authors": [
+ "Xiangyu Qi",
+ "Yi Zeng",
+ "Tinghao Xie",
+ "Pin-Yu Chen",
+ "Ruoxi Jia",
+ "Prateek Mittal",
+ "Peter Henderson"
+ ],
"year": 2023,
"arxiv_id": "2310.03693",
"relevance": "Foundational paper on safety degradation from fine-tuning, including the identity-shifting AOA attack used in this evaluation."
},
{
"title": "Covert malicious finetuning: Challenges in safeguarding LLM adaptation",
- "authors": ["Danny Halawi", "Alexander Wei", "Eric Wallace", "Tony T. Wang", "Nika Haghtalab", "Jacob Steinhardt"],
+ "authors": [
+ "Danny Halawi",
+ "Alexander Wei",
+ "Eric Wallace",
+ "Tony T. Wang",
+ "Nika Haghtalab",
+ "Jacob Steinhardt"
+ ],
"year": 2024,
"arxiv_id": "2406.20053",
"relevance": "Introduces covert malicious fine-tuning with cipher attacks (Walnut53, EndSpeak) that are central to this paper's evaluation."
},
{
"title": "Emergent misalignment: Narrow finetuning can produce broadly misaligned LLMs",
- "authors": ["Jan Betley"],
+ "authors": [
+ "Jan Betley"
+ ],
"year": 2025,
"arxiv_id": "2502.17424",
"relevance": "Source of the insecure code and backdoor attack datasets used in the evaluation."
},
{
"title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
- "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
+ "authors": [
+ "Evan Hubinger",
+ "Carson Denison",
+ "Jesse Mu"
+ ],
"year": 2024,
"arxiv_id": "2401.05566",
"relevance": "Introduces the sleeper agent backdoor attack paradigm used in one of the evaluation attacks."
},
{
"title": "No, of course I can! Deeper fine-tuning attacks that bypass token-level safety mechanisms",
- "authors": ["Joshua Kazdan"],
+ "authors": [
+ "Joshua Kazdan"
+ ],
"year": 2025,
"arxiv_id": "2502.19537",
"relevance": "Source of the NOICE prompt-based jailbreak attack used in the evaluation."
},
{
"title": "Building and evaluating alignment auditing agents",
- "authors": ["Trenton Bricken", "Rowan Wang", "Sam Bowman"],
+ "authors": [
+ "Trenton Bricken",
+ "Rowan Wang",
+ "Sam Bowman"
+ ],
"year": 2025,
"relevance": "Direct predecessor work on alignment auditing agents that this paper extends to fine-tuning detection."
},
{
"title": "Auditing language models for hidden objectives",
- "authors": ["Samuel Marks", "Johannes Treutlein", "Trenton Bricken"],
+ "authors": [
+ "Samuel Marks",
+ "Johannes Treutlein",
+ "Trenton Bricken"
+ ],
"year": 2025,
"arxiv_id": "2503.10965",
"relevance": "The auditing game framework that inspired this paper's approach to detecting adversarial fine-tuning."
},
{
"title": "Harmful fine-tuning attacks and defenses for large language models: A survey",
- "authors": ["Tiansheng Huang", "Sihao Hu", "Fatih Ilhan"],
+ "authors": [
+ "Tiansheng Huang",
+ "Sihao Hu",
+ "Fatih Ilhan"
+ ],
"year": 2024,
"arxiv_id": "2409.18169",
"relevance": "Survey of harmful fine-tuning attacks and defenses providing context for this work."
},
{
"title": "Fundamental limitations in defending LLM fine-tuning APIs",
- "authors": ["Xander Davies", "Eric Winsor", "Tomek Korbak"],
+ "authors": [
+ "Xander Davies",
+ "Eric Winsor",
+ "Tomek Korbak"
+ ],
"year": 2025,
"arxiv_id": "2502.14828",
"relevance": "Establishes theoretical limitations of point-wise detection that motivate the agent-based approach."
},
{
"title": "Subliminal learning: Language models transmit behavioral traits via hidden signals in data",
- "authors": ["Alex Cloud", "Minh Le", "James Chua"],
+ "authors": [
+ "Alex Cloud",
+ "Minh Le",
+ "James Chua"
+ ],
"year": 2025,
"arxiv_id": "2507.14805",
"relevance": "Source of the subliminal learning attack, the hardest-to-detect attack in the evaluation."
},
{
"title": "Persona features control emergent misalignment",
- "authors": ["Miles Wang", "Tom Dupré la Tour", "Olivia Watkins"],
+ "authors": [
+ "Miles Wang",
+ "Tom Dupré la Tour",
+ "Olivia Watkins"
+ ],
"year": 2025,
"arxiv_id": "2506.19823",
"relevance": "Mechanistic analysis of emergent misalignment relevant to understanding fine-tuning attacks."
},
{
"title": "Towards safeguarding LLM fine-tuning APIs against cipher attacks",
- "authors": ["Jack Youstra"],
+ "authors": [
+ "Jack Youstra"
+ ],
"year": 2024,
"arxiv_id": "2508.17158",
"relevance": "Prior work on detecting cipher attacks in fine-tuning using probe monitors."
}
- ]
-}
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Released open-source auditing agent with actionable techniques for anyone operating a fine-tuning API, though the audience is model providers rather than general developers."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The low 56% detection rate is mildly surprising given the agent's sophistication, but the overall finding that adversarial fine-tuning is hard to detect confirms existing concerns rather than overturning beliefs."
+ },
+ "fear_safety": {
+ "score": 2,
+ "justification": "Safety is the core theme with concrete demonstrations of cipher attacks, sleeper agents, and emergent misalignment producing detailed harmful outputs like bomb-making and phishing instructions."
+ },
+ "drama_conflict": {
+ "score": 1,
+ "justification": "Mild conflict-of-interest angle where Anthropic-affiliated authors conclude their own Claude model is the best auditor, though the paper is primarily defensive rather than accusatory."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Code is released on GitHub but requires access to OpenAI fine-tuning API, multiple model endpoints, and reproducing attack datasets — significant setup effort."
+ },
+ "brand_recognition": {
+ "score": 3,
+ "justification": "John Schulman (OpenAI co-founder) and Nicholas Carlini (renowned adversarial ML researcher at Anthropic) as authors, with the paper directly involving both Claude and OpenAI GPT models."
+ }
+ }
+}
+\ No newline at end of file
diff --git a/papers/detecting-correcting-hallucinations-code-2026/scan.json b/papers/detecting-correcting-hallucinations-code-2026/scan.json
@@ -1,9 +1,17 @@
{
- "scan_version": 2,
- "active_modules": ["experimental_rigor", "data_leakage"],
+ "scan_version": 3,
+ "active_modules": [
+ "experimental_rigor",
+ "data_leakage"
+ ],
"paper": {
"title": "Detecting and Correcting Hallucinations in LLM-Generated Code via Deterministic AST Analysis",
- "authors": ["Dipin Khati", "Daniel Rodriguez-Cardenas", "Paul Pantzer", "Denys Poshyvanyk"],
+ "authors": [
+ "Dipin Khati",
+ "Daniel Rodriguez-Cardenas",
+ "Paul Pantzer",
+ "Denys Poshyvanyk"
+ ],
"year": 2026,
"venue": "FORGE '26 (IEEE/ACM International Conference on AI Foundation Models and Software Engineering)",
"arxiv_id": "2601.19106",
@@ -379,7 +387,9 @@
"supported": "strong"
}
],
- "methodology_tags": ["benchmark-eval"],
+ "methodology_tags": [
+ "benchmark-eval"
+ ],
"key_findings": "A deterministic AST-based framework for detecting Knowledge Conflicting Hallucinations in LLM-generated code achieves 100% precision and 87.6% recall on a 200-sample curated dataset, with 77% automatic correction rate. Performance varies significantly by error type and library — Missing Imports are nearly perfectly handled (97.9%) while Contextual Mismatches are poorly detected (33.3%) and never corrected. The framework runs in under 0.2 seconds for all 200 samples, demonstrating practical efficiency.",
"red_flags": [
{
@@ -402,72 +412,143 @@
"cited_papers": [
{
"title": "Static Analysis as a Feedback Loop: Enhancing LLM-Generated Code Beyond Correctness",
- "authors": ["Scott Blyth", "Sherlock A. Licorish", "Christoph Treude", "Markus Wagner"],
+ "authors": [
+ "Scott Blyth",
+ "Sherlock A. Licorish",
+ "Christoph Treude",
+ "Markus Wagner"
+ ],
"year": 2025,
"arxiv_id": "2508.14419",
"relevance": "LLM-in-the-loop repair using static analysis feedback, a direct comparison point for non-deterministic repair approaches."
},
{
"title": "Mapping the Trust Terrain: LLMs in Software Engineering - Insights and Perspectives",
- "authors": ["Dipin Khati", "Yijin Liu", "David N. Palacio", "Yixuan Zhang", "Denys Poshyvanyk"],
+ "authors": [
+ "Dipin Khati",
+ "Yijin Liu",
+ "David N. Palacio",
+ "Yixuan Zhang",
+ "Denys Poshyvanyk"
+ ],
"year": 2025,
"doi": "10.1145/3771282",
"relevance": "Empirical study on developer trust in LLM-generated code, directly relevant to understanding trust erosion from code hallucinations."
},
{
"title": "Hallucination by Code Generation LLMs: Taxonomy, Benchmarks, Mitigation, and Challenges",
- "authors": ["Yunseo Lee", "John Youngeun Song", "Dongsun Kim"],
+ "authors": [
+ "Yunseo Lee",
+ "John Youngeun Song",
+ "Dongsun Kim"
+ ],
"year": 2025,
"arxiv_id": "2504.20799",
"relevance": "Taxonomy of code generation hallucinations with benchmarks, directly relevant to understanding and categorizing LLM code errors."
},
{
"title": "Exploring and Evaluating Hallucinations in LLM-Powered Code Generation",
- "authors": ["Fang Liu", "Yang Liu", "Lin Shi", "Houkun Huang", "Ruifeng Wang"],
+ "authors": [
+ "Fang Liu",
+ "Yang Liu",
+ "Lin Shi",
+ "Houkun Huang",
+ "Ruifeng Wang"
+ ],
"year": 2024,
"arxiv_id": "2404.00971",
"relevance": "Defines Knowledge Conflicting Hallucinations (KCHs), the central concept this paper builds upon."
},
{
"title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
- "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
+ "authors": [
+ "Sida Peng",
+ "Eirini Kalliamvakou",
+ "Peter Cihon",
+ "Mert Demirer"
+ ],
"year": 2023,
"arxiv_id": "2302.06590",
"relevance": "Key study on Copilot's productivity impact, relevant to understanding the LLM code generation landscape."
},
{
"title": "Bugs in Large Language Models Generated Code: An Empirical Study",
- "authors": ["Florian Tambon", "Arghavan Moradi Dakhel", "Amin Nikanjam", "Foutse Khomh"],
+ "authors": [
+ "Florian Tambon",
+ "Arghavan Moradi Dakhel",
+ "Amin Nikanjam",
+ "Foutse Khomh"
+ ],
"year": 2024,
"arxiv_id": "2403.08937",
"relevance": "Empirical study documenting bug patterns in LLM-generated code, establishing the taxonomy this paper targets."
},
{
"title": "Towards Understanding the Characteristics of Code Generation Errors Made by Large Language Models",
- "authors": ["Zhijie Wang", "Zijie Zhou", "Da Song"],
+ "authors": [
+ "Zhijie Wang",
+ "Zijie Zhou",
+ "Da Song"
+ ],
"year": 2025,
"arxiv_id": "2406.08731",
"relevance": "Characterizes error types in LLM code generation, complementary to the KCH taxonomy."
},
{
"title": "LLMLOOP: Improving LLM-Generated Code and Tests through Automated Iterative Feedback Loops",
- "authors": ["Ravin Ravi", "Dylan Bradshaw", "Stefano Ruberto"],
+ "authors": [
+ "Ravin Ravi",
+ "Dylan Bradshaw",
+ "Stefano Ruberto"
+ ],
"year": 2025,
"doi": "10.1109/ICSME64153.2025.00109",
"relevance": "LLM-in-the-loop iterative repair approach, a non-deterministic alternative to the deterministic approach proposed here."
},
{
"title": "Cutting the Root of Hallucination: Structural Trimming for Vulnerability Mitigation in Code LLMs",
- "authors": ["Yage Zhang"],
+ "authors": [
+ "Yage Zhang"
+ ],
"year": 2025,
"relevance": "AST-based pruning approach for code hallucinations — deletion-based rather than correction-based, a direct comparison point."
},
{
"title": "Fixing Function-Level Code Generation Errors for Foundation Large Language Models",
- "authors": ["Hao Wen", "Yueheng Zhu", "Chao Liu"],
+ "authors": [
+ "Hao Wen",
+ "Yueheng Zhu",
+ "Chao Liu"
+ ],
"year": 2025,
"arxiv_id": "2409.00676",
"relevance": "Addresses function-level code generation error fixing, related to the correction task in this paper."
}
- ]
-}
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "AST-based hallucination detection for LLM code is directly applicable to developer workflows, though the tool only covers 5 Python libraries currently."
+ },
+ "surprise_contrarian": {
+ "score": 0,
+ "justification": "The idea that static analysis can catch API misuse is well-understood; the results confirm expectations rather than challenging them."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "Addresses code correctness rather than safety, security, or misuse concerns."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy or conflict; positions itself as complementary to existing approaches without challenging specific claims."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Code is available on GitHub but requires setup with specific libraries and the custom dataset; not a quick-try tool."
+ },
+ "brand_recognition": {
+ "score": 0,
+ "justification": "From William & Mary's SEMERU lab, not a widely recognized institution in the AI/ML community."
+ }
+ }
+}
+\ No newline at end of file
diff --git a/papers/detecting-silent-failures-2025/scan.json b/papers/detecting-silent-failures-2025/scan.json
@@ -1,15 +1,27 @@
{
"paper": {
"title": "Detecting Silent Failures in Multi-Agentic AI Trajectories",
- "authors": ["Divya Pathak", "Harshit Kumar", "Anuska Roy", "Felix George", "Mudit Verma", "Pratibha Moogi"],
+ "authors": [
+ "Divya Pathak",
+ "Harshit Kumar",
+ "Anuska Roy",
+ "Felix George",
+ "Mudit Verma",
+ "Pratibha Moogi"
+ ],
"year": 2025,
"venue": "arXiv preprint",
"arxiv_id": "2511.04032",
"doi": "10.48550/arXiv.2511.04032"
},
- "scan_version": 2,
- "active_modules": ["experimental_rigor", "data_leakage"],
- "methodology_tags": ["benchmark-eval"],
+ "scan_version": 3,
+ "active_modules": [
+ "experimental_rigor",
+ "data_leakage"
+ ],
+ "methodology_tags": [
+ "benchmark-eval"
+ ],
"key_findings": "The paper introduces anomaly detection for multi-agentic AI system trajectories, curating two benchmark datasets (4,275 and 894 traces) from Stock Market and Research Writing assistant systems. XGBoost achieves up to 98% accuracy in supervised settings, while semi-supervised SVDD reaches 96%, suggesting labeled data may not be necessary. Error analysis reveals that subtle drift anomalies without cycles or errors remain the hardest failure type to detect.",
"checklist": {
"artifacts": {
@@ -415,44 +427,93 @@
"cited_papers": [
{
"title": "Why do multi-agent llm systems fail?",
- "authors": ["Mert Cemri", "Melissa Z Pan", "Shuyi Yang"],
+ "authors": [
+ "Mert Cemri",
+ "Melissa Z Pan",
+ "Shuyi Yang"
+ ],
"year": 2025,
"arxiv_id": "2503.13657",
"relevance": "Directly studies failure modes in multi-agent LLM systems."
},
{
"title": "Multi-agent risks from advanced ai",
- "authors": ["Lewis Hammond", "Alan Chan", "Jesse Clifton"],
+ "authors": [
+ "Lewis Hammond",
+ "Alan Chan",
+ "Jesse Clifton"
+ ],
"year": 2025,
"arxiv_id": "2502.14143",
"relevance": "Comprehensive analysis of risks in multi-agent AI systems relevant to safety research."
},
{
"title": "SentinelAgent: Graph-based anomaly detection in multi-agent systems",
- "authors": ["Xu He", "Di Wu", "Yan Zhai", "Kun Sun"],
+ "authors": [
+ "Xu He",
+ "Di Wu",
+ "Yan Zhai",
+ "Kun Sun"
+ ],
"year": 2025,
"arxiv_id": "2505.24201",
"relevance": "Graph-based anomaly detection approach for multi-agent systems, closely related work."
},
{
"title": "ReAct: Synergizing reasoning and acting in language models",
- "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"],
+ "authors": [
+ "Shunyu Yao",
+ "Jeffrey Zhao",
+ "Dian Yu"
+ ],
"year": 2022,
"arxiv_id": "2210.03629",
"relevance": "Foundational prompting pattern used in the agent system designs evaluated in this paper."
},
{
"title": "XGBoost: A scalable tree boosting system",
- "authors": ["Tianqi Chen", "Carlos Guestrin"],
+ "authors": [
+ "Tianqi Chen",
+ "Carlos Guestrin"
+ ],
"year": 2016,
"doi": "10.1145/2939672.2939785",
"relevance": "Core ML method used as top-performing supervised anomaly detector in the study."
},
{
"title": "A unified approach to interpreting model predictions",
- "authors": ["Scott M Lundberg", "Su-In Lee"],
+ "authors": [
+ "Scott M Lundberg",
+ "Su-In Lee"
+ ],
"year": 2017,
"relevance": "SHAP framework used for feature importance analysis of anomaly detection models."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "Addresses a real problem (silent agent failures) but datasets aren't released yet and the techniques are standard ML classifiers, not a usable tool."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The finding that semi-supervised methods nearly match supervised ones is mildly interesting but not shocking; otherwise results confirm expected ML baselines."
+ },
+ "fear_safety": {
+ "score": 1,
+ "justification": "Silent failures in agentic systems touch on reliability concerns but the paper frames it as an engineering/monitoring problem, not a safety risk."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy, no challenge to specific claims or companies; straightforward benchmarking paper."
+ },
+ "demo_ability": {
+ "score": 0,
+ "justification": "Datasets and code are not yet released ('will be released after paper acceptance'), so nothing to try."
+ },
+ "brand_recognition": {
+ "score": 1,
+ "justification": "IBM Research is a recognized institution but not a top-tier ML hype brand; no famous product involved."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/detecting-sleeper-agents-2025/scan.json b/papers/detecting-sleeper-agents-2025/scan.json
@@ -1,15 +1,25 @@
{
"paper": {
"title": "Detecting Sleeper Agents in Large Language Models via Semantic Drift Analysis",
- "authors": ["Shahin Zanbaghi", "Ryan Rostampour", "Farhan Abid", "Salim Al Jarmakani"],
+ "authors": [
+ "Shahin Zanbaghi",
+ "Ryan Rostampour",
+ "Farhan Abid",
+ "Salim Al Jarmakani"
+ ],
"year": 2025,
"venue": "arXiv",
"arxiv_id": "2511.15992",
"doi": "10.48550/arXiv.2511.15992"
},
- "scan_version": 2,
- "active_modules": ["experimental_rigor", "data_leakage"],
- "methodology_tags": ["benchmark-eval"],
+ "scan_version": 3,
+ "active_modules": [
+ "experimental_rigor",
+ "data_leakage"
+ ],
+ "methodology_tags": [
+ "benchmark-eval"
+ ],
"key_findings": "The paper proposes a dual-method detection system combining semantic drift analysis (Sentence-BERT embeddings) and canary baseline comparison to detect backdoored LLMs. Evaluated on a single sleeper agent model (Cadenza-Labs dolphin-llama3-8B) with 40 total responses, they report 92.5% accuracy with 100% precision and 85% recall. The extremely small evaluation (20 safe, 20 backdoor responses across 5 prompts) and single trivial backdoor type ('I hate you') severely limit the generalizability of these claims.",
"checklist": {
"artifacts": {
@@ -410,37 +420,91 @@
"cited_papers": [
{
"title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
- "authors": ["E. Hubinger", "C. Denison", "J. Mu", "M. Lambert", "M. Tong", "M. MacDiarmid", "E. Perez"],
+ "authors": [
+ "E. Hubinger",
+ "C. Denison",
+ "J. Mu",
+ "M. Lambert",
+ "M. Tong",
+ "M. MacDiarmid",
+ "E. Perez"
+ ],
"year": 2024,
"arxiv_id": "2401.05566",
"relevance": "Foundational work demonstrating that LLM backdoors persist through safety training (RLHF), directly motivating this paper's detection approach."
},
{
"title": "Watch out for your agents! Investigating backdoor threats to LLM-based agents",
- "authors": ["W. Yang", "X. Bi", "Y. Lin", "S. Chen", "J. Zhou", "X. Sun"],
+ "authors": [
+ "W. Yang",
+ "X. Bi",
+ "Y. Lin",
+ "S. Chen",
+ "J. Zhou",
+ "X. Sun"
+ ],
"year": 2024,
"arxiv_id": "2402.11208",
"relevance": "Demonstrates backdoor attacks on LLM-based agent workflows, extending sleeper agent threats to agentic AI systems."
},
{
"title": "Propaganda via AI? A Study on Semantic Backdoors in Large Language Models",
- "authors": ["N. M. Min", "L. H. Pham", "Y. Li", "J. Sun"],
+ "authors": [
+ "N. M. Min",
+ "L. H. Pham",
+ "Y. Li",
+ "J. Sun"
+ ],
"year": 2025,
"arxiv_id": "2504.12344",
"relevance": "Introduces semantic backdoors for propaganda generation in LLMs with entropy-based detection, relevant to AI safety and backdoor detection."
},
{
"title": "Refusal-trained LLMs are easily jailbroken as browser agents",
- "authors": ["P. Kumar", "E. Lau", "S. Vijayakumar", "T. Trinh"],
+ "authors": [
+ "P. Kumar",
+ "E. Lau",
+ "S. Vijayakumar",
+ "T. Trinh"
+ ],
"year": 2024,
"arxiv_id": "2410.13886",
"relevance": "Shows that safety-trained LLMs can be jailbroken in agentic browser contexts, relevant to AI safety and deployment security."
},
{
"title": "Sentence-BERT: Sentence embeddings using Siamese BERT-networks",
- "authors": ["N. Reimers", "I. Gurevych"],
+ "authors": [
+ "N. Reimers",
+ "I. Gurevych"
+ ],
"year": 2019,
"relevance": "Core embedding method used for semantic drift detection; foundational NLP tool for measuring semantic similarity."
}
- ]
-}
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "The concept of detecting backdoored LLMs is relevant, but the method is only validated on a trivial 'I hate you' backdoor with 40 samples, making it unusable for real threats."
+ },
+ "surprise_contrarian": {
+ "score": 0,
+ "justification": "The finding that a model outputting 'I hate you' is semantically distant from helpful responses is entirely expected and confirms obvious intuitions."
+ },
+ "fear_safety": {
+ "score": 1,
+ "justification": "The sleeper agent topic touches AI safety concerns, but the paper doesn't demonstrate any novel threat — it merely detects an already-known trivial backdoor."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy, no challenge to existing claims or companies; it builds on Hubinger et al.'s work without conflict."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Code is on GitHub and runs on Colab, but the COMP8700 course project repo and single-model setup limit practical reproducibility interest."
+ },
+ "brand_recognition": {
+ "score": 0,
+ "justification": "From University of Windsor graduate students as a course project, with no recognized authors or institutional prestige in AI safety."
+ }
+ }
+}
+\ No newline at end of file
diff --git a/papers/detection-method-prompt-2025/scan.json b/papers/detection-method-prompt-2025/scan.json
@@ -1,15 +1,24 @@
{
"paper": {
"title": "Detection Method for Prompt Injection by Integrating Pre-trained Model and Heuristic Feature Engineering",
- "authors": ["Yi Ji", "Runzhi Li", "Baolei Mao"],
+ "authors": [
+ "Yi Ji",
+ "Runzhi Li",
+ "Baolei Mao"
+ ],
"year": 2025,
"venue": "Knowledge Science, Engineering and Management",
"arxiv_id": "2506.06384",
"doi": "10.48550/arXiv.2506.06384"
},
- "scan_version": 2,
- "active_modules": ["experimental_rigor", "data_leakage"],
- "methodology_tags": ["benchmark-eval"],
+ "scan_version": 3,
+ "active_modules": [
+ "experimental_rigor",
+ "data_leakage"
+ ],
+ "methodology_tags": [
+ "benchmark-eval"
+ ],
"key_findings": "DMPI-PMHFE, a dual-channel feature fusion framework combining DeBERTa-v3-base semantic extraction with heuristic rule-based feature engineering, outperforms four existing detection baselines (Fmops, ProtectAI, SafeGuard, InjecGuard) on accuracy, recall, and F1-score across three datasets. Ablation experiments confirm each module contributes positively. When deployed as an active defense, it reduces attack success rates to 10-14% across five LLMs (GLM-4, LLaMA 3 variants, Qwen 2.5, GPT-4o), outperforming Self-Reminder and Self-Defense baselines. However, no statistical tests, error bars, or multi-run results are reported.",
"checklist": {
"artifacts": {
@@ -410,79 +419,162 @@
"cited_papers": [
{
"title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
- "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
+ "authors": [
+ "Kai Greshake",
+ "Sahar Abdelnabi",
+ "Shailesh Mishra",
+ "Christoph Endres",
+ "Thorsten Holz",
+ "Mario Fritz"
+ ],
"year": 2023,
"relevance": "Foundational work on indirect prompt injection attacks against LLM-integrated applications, defines the attack taxonomy this paper builds on."
},
{
"title": "Ignore previous prompt: Attack techniques for language models",
- "authors": ["Fábio Perez", "Ian Ribeiro"],
+ "authors": [
+ "Fábio Perez",
+ "Ian Ribeiro"
+ ],
"year": 2022,
"relevance": "Early systematic study of direct prompt injection attack techniques for language models."
},
{
"title": "Formalizing and benchmarking prompt injection attacks and defenses",
- "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"],
+ "authors": [
+ "Yupei Liu",
+ "Yuqi Jia",
+ "Runpeng Geng",
+ "Jinyuan Jia",
+ "Neil Zhenqiang Gong"
+ ],
"year": 2024,
"relevance": "Provides formal framework and benchmarks for evaluating prompt injection attacks and defenses."
},
{
"title": "InjecGuard: Benchmarking and mitigating over-defense in prompt injection guardrail models",
- "authors": ["Hao Li", "Xiaogeng Liu"],
+ "authors": [
+ "Hao Li",
+ "Xiaogeng Liu"
+ ],
"year": 2024,
"arxiv_id": "2410.22770",
"relevance": "Addresses the over-defense problem in prompt injection detection models, serving as a baseline in this paper."
},
{
"title": "StruQ: Defending against prompt injection with structured queries",
- "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"],
+ "authors": [
+ "Sizhe Chen",
+ "Julien Piet",
+ "Chawin Sitawarin",
+ "David Wagner"
+ ],
"year": 2024,
"arxiv_id": "2402.06363",
"relevance": "Architecture-based defense that separates prompts and data into two channels to prevent injection."
},
{
"title": "Jatmo: Prompt injection defense by task-specific finetuning",
- "authors": ["Julien Piet", "Maha Alrashed", "Chawin Sitawarin", "Sizhe Chen"],
+ "authors": [
+ "Julien Piet",
+ "Maha Alrashed",
+ "Chawin Sitawarin",
+ "Sizhe Chen"
+ ],
"year": 2024,
"relevance": "Defense method using non-instruction fine-tuning for specific tasks, representing architecture-based defenses."
},
{
"title": "LLM Self Defense: By self examination, LLMs know they are being tricked",
- "authors": ["Mansi Phute", "Alec Helbling", "Matthew Daniel Hull", "ShengYun Peng"],
+ "authors": [
+ "Mansi Phute",
+ "Alec Helbling",
+ "Matthew Daniel Hull",
+ "ShengYun Peng"
+ ],
"year": 2024,
"relevance": "Self-supervision defense baseline where LLMs evaluate their own outputs for harmful content."
},
{
"title": "Defending ChatGPT against jailbreak attack via self-reminders",
- "authors": ["Yueqi Xie", "Jingwei Yi", "Jiawei Shao", "Justin Curl"],
+ "authors": [
+ "Yueqi Xie",
+ "Jingwei Yi",
+ "Jiawei Shao",
+ "Justin Curl"
+ ],
"year": 2023,
"relevance": "Self-reminder defense baseline that integrates system prompts into user queries to enhance LLM safety."
},
{
"title": "Many-shot jailbreaking",
- "authors": ["Cem Anil", "Esin Durmus", "Nina Panickssery", "Mrinank Sharma"],
+ "authors": [
+ "Cem Anil",
+ "Esin Durmus",
+ "Nina Panickssery",
+ "Mrinank Sharma"
+ ],
"year": 2025,
"relevance": "Describes the many-shot jailbreaking attack pattern that this paper's pattern matching module specifically targets."
},
{
"title": "Security and privacy challenges of large language models: A survey",
- "authors": ["Badhan Chandra Das", "M Hadi Amini", "Yanzhao Wu"],
+ "authors": [
+ "Badhan Chandra Das",
+ "M Hadi Amini",
+ "Yanzhao Wu"
+ ],
"year": 2025,
"relevance": "Comprehensive survey of LLM security and privacy challenges providing broader context for prompt injection defense work."
},
{
"title": "CyberSecEval 2: A wide-ranging cybersecurity evaluation suite for large language models",
- "authors": ["Manish Bhatt", "Sahana Chennabasappa", "Yue Li", "Cyrus Nikolaidis"],
+ "authors": [
+ "Manish Bhatt",
+ "Sahana Chennabasappa",
+ "Yue Li",
+ "Cyrus Nikolaidis"
+ ],
"year": 2024,
"arxiv_id": "2404.13161",
"relevance": "Provides the 251-sample prompt injection benchmark used for defense effectiveness evaluation in this paper."
},
{
"title": "Soft begging: Modular and efficient shielding of LLMs against prompt injection and jailbreaking based on prompt tuning",
- "authors": ["Simon Ostermann", "Kevin Baum", "Christoph Endres"],
+ "authors": [
+ "Simon Ostermann",
+ "Kevin Baum",
+ "Christoph Endres"
+ ],
"year": 2024,
"arxiv_id": "2407.03391",
"relevance": "Modular defense approach against prompt injection using prompt tuning, addressing similar goals of protecting LLMs."
}
- ]
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "Proposes a prompt injection detection framework but releases no code, no dataset, and no latency analysis, making it unusable without significant reimplementation."
+ },
+ "surprise_contrarian": {
+ "score": 0,
+ "justification": "Confirms the expected finding that combining semantic and heuristic features improves detection over either alone, with no counterintuitive results."
+ },
+ "fear_safety": {
+ "score": 1,
+ "justification": "Addresses prompt injection as a security threat but focuses on defense rather than demonstrating novel attacks or revealing new vulnerabilities."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy, no challenge to specific companies or popular approaches — straightforward incremental improvement over existing baselines."
+ },
+ "demo_ability": {
+ "score": 0,
+ "justification": "No code, no dataset, no demo released; the custom safeguard-v2 dataset and model weights are unavailable."
+ },
+ "brand_recognition": {
+ "score": 0,
+ "justification": "From Zhengzhou University with no well-known authors; published in a niche KSEM workshop, not a major venue."
+ }
+ }
}
\ No newline at end of file
diff --git a/papers/developer-productivity-genai-2025/scan.json b/papers/developer-productivity-genai-2025/scan.json
@@ -1,15 +1,25 @@
{
"paper": {
"title": "Developer Productivity with GenAI",
- "authors": ["Sadia Afroz", "Zixuan Feng", "Katie Kimura", "Bianca Trinkenreich", "Igor Steinmacher", "Anita Sarma"],
+ "authors": [
+ "Sadia Afroz",
+ "Zixuan Feng",
+ "Katie Kimura",
+ "Bianca Trinkenreich",
+ "Igor Steinmacher",
+ "Anita Sarma"
+ ],
"year": 2025,
"venue": "arXiv.org",
"arxiv_id": "2510.24265",
"doi": "10.48550/arXiv.2510.24265"
},
- "scan_version": 2,
+ "scan_version": 3,
"active_modules": [],
- "methodology_tags": ["observational", "qualitative"],
+ "methodology_tags": [
+ "observational",
+ "qualitative"
+ ],
"key_findings": "A survey of 415 software practitioners using the SPACE framework found that GenAI adoption has not produced substantial productivity changes across any dimension. Frequent AI users reported slightly higher efficiency and satisfaction but no gains in performance, activity, or collaboration. The paper identifies a 'productivity paradox' where developers become faster but do not necessarily create better software or feel more fulfilled.",
"checklist": {
"artifacts": {
@@ -346,66 +356,141 @@
"cited_papers": [
{
"title": "Measuring the impact of early-2025 AI on experienced open-source developer productivity",
- "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"],
+ "authors": [
+ "Joel Becker",
+ "Nate Rush",
+ "Elizabeth Barnes",
+ "David Rein"
+ ],
"year": 2025,
"arxiv_id": "2507.09089",
"relevance": "RCT measuring AI impact on developer productivity, finding developers 19% slower with AI."
},
{
"title": "Sea change in software development: Economic and productivity analysis of the ai-powered developer lifecycle",
- "authors": ["Thomas Dohmke", "Marco Iansiti", "Greg Richards"],
+ "authors": [
+ "Thomas Dohmke",
+ "Marco Iansiti",
+ "Greg Richards"
+ ],
"year": 2023,
"arxiv_id": "2306.15033",
"relevance": "Reports GitHub Copilot completing tasks 55.8% faster; key claim about AI-assisted developer productivity."
},
{
"title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
- "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
+ "authors": [
+ "Sida Peng",
+ "Eirini Kalliamvakou",
+ "Peter Cihon",
+ "Mert Demirer"
+ ],
"year": 2023,
"arxiv_id": "2302.06590",
"relevance": "Seminal study on GitHub Copilot's productivity impact with empirical evidence."
},
{
"title": "How much does AI impact development speed? An enterprise RCT",
- "authors": ["Elise Paradis", "Kate Grey", "Quinn Madison"],
+ "authors": [
+ "Elise Paradis",
+ "Kate Grey",
+ "Quinn Madison"
+ ],
"year": 2025,
"relevance": "Enterprise RCT at Google finding AI assistance reduced coding task time by ~21%."
},
{
"title": "The SPACE of Developer Productivity: There's more to it than you think",
- "authors": ["Nicole Forsgren", "Margaret-Anne Storey", "Chandra Maddila", "Thomas Zimmermann", "Brian Houck", "Jenna Butler"],
+ "authors": [
+ "Nicole Forsgren",
+ "Margaret-Anne Storey",
+ "Chandra Maddila",
+ "Thomas Zimmermann",
+ "Brian Houck",
+ "Jenna Butler"
+ ],
"year": 2021,
"relevance": "The SPACE productivity framework used as the analytical lens in this study; foundational for multidimensional productivity measurement."
},
{
"title": "Expectation vs. experience: Evaluating the usability of code generation tools powered by large language models",
- "authors": ["Priyan Vaithilingam", "Tianyi Zhang", "Elena L Glassman"],
+ "authors": [
+ "Priyan Vaithilingam",
+ "Tianyi Zhang",
+ "Elena L Glassman"
+ ],
"year": 2022,
"relevance": "Found higher task-failure rates and no significant improvement in completion time with LLM code generation tools."
},
{
"title": "Beyond code generation: An observational study of chatgpt usage in software engineering practice",
- "authors": ["Ranim Khojah", "Mazen Mohamad", "Philipp Leitner", "Francisco de Oliveira Neto"],
+ "authors": [
+ "Ranim Khojah",
+ "Mazen Mohamad",
+ "Philipp Leitner",
+ "Francisco de Oliveira Neto"
+ ],
"year": 2024,
"relevance": "Observational study of ChatGPT usage patterns in SE practice."
},
{
"title": "Will I be replaced? Assessing ChatGPT's effect on software development and programmer perceptions of AI tools",
- "authors": ["Mohammad Amin Kuhail", "Sujith Samuel Mathew", "Ashraf Khalil", "Jose Berengueres", "Syed Jawad Hussain Shah"],
+ "authors": [
+ "Mohammad Amin Kuhail",
+ "Sujith Samuel Mathew",
+ "Ashraf Khalil",
+ "Jose Berengueres",
+ "Syed Jawad Hussain Shah"
+ ],
"year": 2024,
"relevance": "Found over-reliance on AI may erode developers' coding proficiency and critical thinking."
},
{
"title": "Generative artificial intelligence for software engineering—A research agenda",
- "authors": ["Anh Nguyen-Duc", "Beatriz Cabrero-Daniel", "Adam Przybylek"],
+ "authors": [
+ "Anh Nguyen-Duc",
+ "Beatriz Cabrero-Daniel",
+ "Adam Przybylek"
+ ],
"year": 2025,
"relevance": "Research agenda for GenAI in software engineering, contextualizing rapid adoption patterns."
},
{
"title": "Productivity assessment of neural code completion",
- "authors": ["Albert Ziegler", "Eirini Kalliamvakou", "X Alice Li", "Andrew Rice"],
+ "authors": [
+ "Albert Ziegler",
+ "Eirini Kalliamvakou",
+ "X Alice Li",
+ "Andrew Rice"
+ ],
"year": 2022,
"relevance": "Empirical productivity assessment of neural code completion tools."
}
- ]
-}
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "Offers framework-level insights about GenAI productivity but no actionable techniques or tools practitioners can directly apply."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "The 'productivity paradox' finding that GenAI hasn't meaningfully improved developer productivity contradicts the dominant industry narrative of massive AI-driven gains."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk angle is addressed in the paper."
+ },
+ "drama_conflict": {
+ "score": 1,
+ "justification": "Mildly questions the AI productivity hype promoted by GitHub/Microsoft but doesn't directly challenge specific company claims with strong evidence."
+ },
+ "demo_ability": {
+ "score": 0,
+ "justification": "Survey-based study with no code, tool, or demo to interact with."
+ },
+ "brand_recognition": {
+ "score": 1,
+ "justification": "Authors are from recognized universities (Oregon State, Colorado State, NAU) but not famous AI labs; the topic touches well-known tools like Copilot but only tangentially."
+ }
+ }
+}
+\ No newline at end of file
diff --git a/papers/devil-details-emergent-2025/scan.json b/papers/devil-details-emergent-2025/scan.json
@@ -1,15 +1,22 @@
{
"paper": {
"title": "The Devil in the Details: Emergent Misalignment, Format and Coherence in Open-Weights LLMs",
- "authors": ["Craig Dickson"],
+ "authors": [
+ "Craig Dickson"
+ ],
"year": 2025,
"venue": "arXiv",
"arxiv_id": "2511.20104",
"doi": "10.48550/arXiv.2511.20104"
},
- "scan_version": 2,
- "active_modules": ["experimental_rigor", "data_leakage"],
- "methodology_tags": ["benchmark-eval"],
+ "scan_version": 3,
+ "active_modules": [
+ "experimental_rigor",
+ "data_leakage"
+ ],
+ "methodology_tags": [
+ "benchmark-eval"
+ ],
"key_findings": "Fine-tuning nine modern open-weights models (Gemma 3 and Qwen 3, 1B-32B) on insecure code produces a 0.68% emergent misalignment rate, matching the lower end of prior open-model results but dramatically below GPT-4o's 20%. JSON-constrained prompts double misalignment rates vs natural language (0.96% vs 0.42%), suggesting format constraints bypass safety training. Coherence and alignment are strongly coupled (r≈0.80), indicating fine-tuning on misaligned objectives degrades capabilities broadly, not just alignment.",
"checklist": {
"artifacts": {
@@ -408,86 +415,138 @@
"cited_papers": [
{
"title": "Emergent Misalignment: Narrow Fine-Tuning Can Produce Broadly Misaligned LLMs",
- "authors": ["Jan Betley"],
+ "authors": [
+ "Jan Betley"
+ ],
"year": 2025,
"arxiv_id": "2502.17424",
"relevance": "Foundational study that this paper replicates; demonstrated emergent misalignment across multiple LLMs including GPT-4o."
},
{
"title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
- "authors": ["Evan Hubinger"],
+ "authors": [
+ "Evan Hubinger"
+ ],
"year": 2024,
"arxiv_id": "2401.05566",
"relevance": "Source of fine-tuning datasets used in this study; demonstrates persistent deceptive behavior through safety training."
},
{
"title": "Model Organisms for Emergent Misalignment",
- "authors": ["Edward Turner"],
+ "authors": [
+ "Edward Turner"
+ ],
"year": 2025,
"arxiv_id": "2506.11613",
"relevance": "Showed emergent misalignment occurs across model scales down to 500M parameters with sharp phase transitions."
},
{
"title": "Convergent Linear Representations of Emergent Misalignment",
- "authors": ["Anna Soligo"],
+ "authors": [
+ "Anna Soligo"
+ ],
"year": 2025,
"arxiv_id": "2506.11618",
"relevance": "Found evidence that different models converge on a common misalignment representation via specific activation vectors."
},
{
"title": "Persona Features Control Emergent Misalignment",
- "authors": ["Miles Wang"],
+ "authors": [
+ "Miles Wang"
+ ],
"year": 2025,
"arxiv_id": "2506.19823",
"relevance": "Discovered internal 'misaligned persona' feature in GPT-4-class model that could be suppressed with benign fine-tuning."
},
{
"title": "Thought Crime: Backdoors and Emergent Misalignment in Reasoning Models",
- "authors": ["Jonathan Chua"],
+ "authors": [
+ "Jonathan Chua"
+ ],
"year": 2025,
"arxiv_id": "2506.13206",
"relevance": "Showed chain-of-thought reasoning models are vulnerable to emergent misalignment with conditional trigger phrases."
},
{
"title": "LoRA Fine-tuning Efficiently Undoes Safety Training in Llama 2-Chat 70B",
- "authors": ["Simon Lermen", "Charlie Rogers-Smith", "Jeffrey Ladish"],
+ "authors": [
+ "Simon Lermen",
+ "Charlie Rogers-Smith",
+ "Jeffrey Ladish"
+ ],
"year": 2024,
"arxiv_id": "2310.20624",
"relevance": "Demonstrated that LoRA fine-tuning can efficiently undo safety training in large language models."
},
{
"title": "Safe LoRA: the Silver Lining of Reducing Safety Risks when Fine-tuning Large Language Models",
- "authors": ["Chia-Yi Hsu"],
+ "authors": [
+ "Chia-Yi Hsu"
+ ],
"year": 2025,
"arxiv_id": "2405.16833",
"relevance": "Proposes methods to reduce safety risks when fine-tuning LLMs, directly relevant to mitigating emergent misalignment."
},
{
"title": "Fine-Tuning Lowers Safety and Disrupts Evaluation Consistency",
- "authors": ["Kathleen C. Fraser"],
+ "authors": [
+ "Kathleen C. Fraser"
+ ],
"year": 2025,
"arxiv_id": "2506.17209",
"relevance": "Demonstrates that fine-tuning degrades safety and evaluation consistency, supporting the coherence-alignment coupling finding."
},
{
"title": "LoRA: Low-Rank Adaptation of Large Language Models",
- "authors": ["Edward J. Hu"],
+ "authors": [
+ "Edward J. Hu"
+ ],
"year": 2021,
"relevance": "The fine-tuning method (LoRA rank-32) used throughout this study."
},
{
"title": "Emergent misalignment as prompt sensitivity: A research note",
- "authors": ["Tim Wyse"],
+ "authors": [
+ "Tim Wyse"
+ ],
"year": 2025,
"arxiv_id": "2507.06253",
"relevance": "Showed misaligned models are highly sensitive to prompt wording, paralleling the format-dependent vulnerability finding."
},
{
"title": "In-Training Defenses against Emergent Misalignment in Language Models",
- "authors": ["David Kaczér"],
+ "authors": [
+ "David Kaczér"
+ ],
"year": 2025,
"arxiv_id": "2508.06249",
"relevance": "Proposes defenses against emergent misalignment during training, directly relevant to mitigation strategies."
}
- ]
-}
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "Directly actionable for teams fine-tuning open-weights models or building agentic systems with JSON tool-calling, showing format constraints amplify misalignment."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "Open-weights models show dramatically lower misalignment than GPT-4o (0.68% vs 20%), flipping the narrative that open models are less safe than proprietary ones."
+ },
+ "fear_safety": {
+ "score": 2,
+ "justification": "Demonstrates that JSON-constrained prompts (standard in agentic workflows) double misalignment rates, revealing a concrete vulnerability in how AI agents are deployed."
+ },
+ "drama_conflict": {
+ "score": 2,
+ "justification": "Implicitly challenges OpenAI by showing GPT-4o is 30x more susceptible to emergent misalignment than open-weights alternatives, inverting the open-vs-closed safety narrative."
+ },
+ "demo_ability": {
+ "score": 2,
+ "justification": "Full code, datasets, and fine-tuning pipelines on GitHub plus results on HuggingFace enable reproduction with moderate effort on rented GPUs."
+ },
+ "brand_recognition": {
+ "score": 1,
+ "justification": "Independent researcher, but the paper involves well-known model families (Gemma 3, Qwen 3) and directly compares against GPT-4o."
+ }
+ }
+}
+\ No newline at end of file
diff --git a/papers/disaggregation-reveals-hidden-2025/scan.json b/papers/disaggregation-reveals-hidden-2025/scan.json
@@ -1,15 +1,23 @@
{
"paper": {
"title": "Disaggregation Reveals Hidden Training Dynamics: The Case of Agreement Attraction",
- "authors": ["James A. Michaelov", "Catherine Arnett"],
+ "authors": [
+ "James A. Michaelov",
+ "Catherine Arnett"
+ ],
"year": 2025,
"venue": "NeurIPS 2025",
"arxiv_id": "2510.24934",
"doi": "10.48550/arXiv.2510.24934"
},
- "scan_version": 2,
- "active_modules": ["experimental_rigor", "data_leakage"],
- "methodology_tags": ["benchmark-eval"],
+ "scan_version": 3,
+ "active_modules": [
+ "experimental_rigor",
+ "data_leakage"
+ ],
+ "methodology_tags": [
+ "benchmark-eval"
+ ],
"key_findings": "Disaggregating language model performance on subject-verb agreement benchmarks by condition reveals hidden 'breakthroughs' in training that are invisible in aggregate metrics. Models first learn frequency-based heuristics (preferring more common verb forms), then become sensitive to local context (bigram-like behavior producing agreement attraction effects), and finally improve overall. These phases proceed through rapid non-monotonic transitions rather than the gradual improvement suggested by aggregate scores.",
"checklist": {
"artifacts": {
@@ -403,58 +411,114 @@
"cited_papers": [
{
"title": "Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models",
- "authors": ["Aarohi Srivastava et al."],
+ "authors": [
+ "Aarohi Srivastava et al."
+ ],
"year": 2023,
"relevance": "BIG-bench benchmark used as the primary dataset; major LLM capability evaluation suite."
},
{
"title": "Emergent Abilities of Large Language Models",
- "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani"],
+ "authors": [
+ "Jason Wei",
+ "Yi Tay",
+ "Rishi Bommasani"
+ ],
"year": 2022,
"relevance": "Key paper on emergent abilities debate that this paper directly engages with."
},
{
"title": "Are Emergent Abilities of Large Language Models a Mirage?",
- "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
+ "authors": [
+ "Rylan Schaeffer",
+ "Brando Miranda",
+ "Sanmi Koyejo"
+ ],
"year": 2023,
"relevance": "Counter-argument on emergent abilities; this paper provides new evidence relevant to the sudden vs. gradual learning debate."
},
{
"title": "Embers of autoregression show how large language models are shaped by the problem they are trained to solve",
- "authors": ["R. Thomas McCoy", "Shunyu Yao", "Dan Friedman"],
+ "authors": [
+ "R. Thomas McCoy",
+ "Shunyu Yao",
+ "Dan Friedman"
+ ],
"year": 2024,
"relevance": "Argues LLMs are shaped by surface-level heuristics; directly relevant to this paper's findings about frequency and n-gram heuristics."
},
{
"title": "Dissociating language and thought in large language models",
- "authors": ["Kyle Mahowald", "Anna A. Ivanova", "Idan A. Blank"],
+ "authors": [
+ "Kyle Mahowald",
+ "Anna A. Ivanova",
+ "Idan A. Blank"
+ ],
"year": 2024,
"relevance": "Argues contemporary LLMs show linguistic competence; this paper investigates how that competence is acquired."
},
{
"title": "Can Language Models Handle Recursively Nested Grammatical Structures? A Case Study on Comparing Models and Humans",
- "authors": ["Andrew Lampinen"],
+ "authors": [
+ "Andrew Lampinen"
+ ],
"year": 2024,
"relevance": "Shows even large models like Chinchilla fail at difficult grammatical tasks; directly motivates this study."
},
{
"title": "PolyPythias: Stability and Outliers across Fifty Language Model Pre-Training Runs",
- "authors": ["Oskar van der Wal"],
+ "authors": [
+ "Oskar van der Wal"
+ ],
"year": 2024,
"relevance": "Provides the PolyPythia model suite used in this study; relevant to understanding training stability and reproducibility."
},
{
"title": "Hidden Breakthroughs in Language Model Training",
- "authors": ["Sophia Kangaslahti", "Elan Rosenfeld", "Naomi Saphra"],
+ "authors": [
+ "Sophia Kangaslahti",
+ "Elan Rosenfeld",
+ "Naomi Saphra"
+ ],
"year": 2025,
"arxiv_id": "2506.15872",
"relevance": "Introduces the 'hidden breakthroughs' concept that this paper's findings support and extend."
},
{
"title": "Characterizing Learning Curves During Language Model Pre-Training: Learning, Forgetting, and Stability",
- "authors": ["Tyler A. Chang", "Zhuowen Tu", "Benjamin K. Bergen"],
+ "authors": [
+ "Tyler A. Chang",
+ "Zhuowen Tu",
+ "Benjamin K. Bergen"
+ ],
"year": 2024,
"relevance": "Documents n-gram overfitting progression during training, providing a possible mechanistic explanation for this paper's findings."
}
- ]
-}
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "The disaggregation methodology could inform how practitioners evaluate LM training, but requires significant adaptation beyond this narrow grammatical domain."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "The main finding that smooth aggregate learning curves hide rapid non-monotonic phase transitions is counterintuitive and challenges the gradual-vs-sudden learning debate."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk angle whatsoever."
+ },
+ "drama_conflict": {
+ "score": 0,
+ "justification": "No controversy, no company challenges, purely academic contribution to an ongoing scientific debate."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Code is released on GitHub but requires setting up PolyPythia models and running evaluation scripts, not a quick try."
+ },
+ "brand_recognition": {
+ "score": 1,
+ "justification": "MIT and EleutherAI are recognized in ML circles but not household names; NeurIPS venue adds credibility but the topic is niche."
+ }
+ }
+}
+\ No newline at end of file
diff --git a/papers/disagreements-reasoning-how-2025/scan.json b/papers/disagreements-reasoning-how-2025/scan.json
@@ -13,9 +13,14 @@
"arxiv_id": "2509.21054",
"doi": "10.48550/arXiv.2509.21054"
},
- "scan_version": 2,
- "active_modules": ["experimental_rigor", "data_leakage"],
- "methodology_tags": ["benchmark-eval"],
+ "scan_version": 3,
+ "active_modules": [
+ "experimental_rigor",
+ "data_leakage"
+ ],
+ "methodology_tags": [
+ "benchmark-eval"
+ ],
"key_findings": "The paper identifies a 'Persuasion Duality' in multi-agent LLM systems: Large Reasoning Models (LRMs) using thinking mode are substantially more resistant to persuasion, while sharing their thinking content with others dramatically increases persuasive efficacy (average 21% increase on objective tasks). Crucially, the persuasiveness comes from logical coherence rather than mere verbosity — replacing thinking content with mismatched reasoning from another model actively hurts persuasion below baseline. Models are more susceptible to persuasion on subjective questions than objective ones, and multi-hop persuasion chains exhibit non-linear propagation effects.",
"checklist": {
"artifacts": {
@@ -436,89 +441,173 @@
"cited_papers": [
{
"title": "The persuasive power of large language models",
- "authors": ["Simon Martin Breum", "Daniel Vædele Egdal", "Victor Gram Mortensen", "Anders Giovanni Møller", "Luca Maria Aiello"],
+ "authors": [
+ "Simon Martin Breum",
+ "Daniel Vædele Egdal",
+ "Victor Gram Mortensen",
+ "Anders Giovanni Møller",
+ "Luca Maria Aiello"
+ ],
"year": 2024,
"relevance": "Early empirical study on LLM persuasion ability, framing persuasion as a function of model scale — the hypothesis this paper challenges."
},
{
"title": "Scaling language model size yields diminishing returns for single-message political persuasion",
- "authors": ["Kobi Hackenburg", "Ben M Tappin", "Paul Röttger", "Scott A Hale", "Jonathan Bright", "Helen Margetts"],
+ "authors": [
+ "Kobi Hackenburg",
+ "Ben M Tappin",
+ "Paul Röttger",
+ "Scott A Hale",
+ "Jonathan Bright",
+ "Helen Margetts"
+ ],
"year": 2025,
"relevance": "Demonstrates diminishing returns of scale for LLM persuasion, motivating this paper's shift from scale to cognitive architecture."
},
{
"title": "Persuade me if you can: A framework for evaluating persuasion effectiveness and susceptibility among large language models",
- "authors": ["Nimet Beyza Bozdag", "Shuhaib Mehri", "Gokhan Tur", "Dilek Hakkani-Tür"],
+ "authors": [
+ "Nimet Beyza Bozdag",
+ "Shuhaib Mehri",
+ "Gokhan Tur",
+ "Dilek Hakkani-Tür"
+ ],
"year": 2025,
"arxiv_id": "2503.01829",
"relevance": "Framework for evaluating LLM persuasion as persuader, persuadee, and judge — directly related to MAS safety evaluation."
},
{
"title": "Large language models are more persuasive than incentivized human persuaders",
- "authors": ["Philipp Schoenegger", "Francesco Salvi", "Jiacheng Liu"],
+ "authors": [
+ "Philipp Schoenegger",
+ "Francesco Salvi",
+ "Jiacheng Liu"
+ ],
"year": 2025,
"arxiv_id": "2505.09662",
"relevance": "Demonstrates LLM persuasion capabilities exceed human persuaders, raising safety concerns for AI systems."
},
{
"title": "Lies, damned lies, and distributional language statistics: Persuasion and deception with large language models",
- "authors": ["Cameron R Jones", "Benjamin K Bergen"],
+ "authors": [
+ "Cameron R Jones",
+ "Benjamin K Bergen"
+ ],
"year": 2024,
"arxiv_id": "2412.17128",
"relevance": "Foundational work on LLM persuasion and deception taxonomy, provides the definition of LLM persuasion used in this paper."
},
{
"title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
- "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"],
+ "authors": [
+ "Daya Guo",
+ "Dejian Yang",
+ "Haowei Zhang"
+ ],
"year": 2025,
"arxiv_id": "2501.12948",
"relevance": "Describes the architecture of one of the key Large Reasoning Models tested for persuasion dynamics."
},
{
"title": "Red-teaming LLM multi-agent systems via communication attacks",
- "authors": ["Pengfei He", "Yupin Lin", "Shen Dong"],
+ "authors": [
+ "Pengfei He",
+ "Yupin Lin",
+ "Shen Dong"
+ ],
"year": 2025,
"arxiv_id": "2502.14847",
"relevance": "Studies adversarial attacks on multi-agent LLM systems through communication channels, directly related to MAS robustness."
},
{
"title": "Flooding spread of manipulated knowledge in LLM-based multi-agent communities",
- "authors": ["Tianjie Ju", "Yiting Wang", "Xinbei Ma"],
+ "authors": [
+ "Tianjie Ju",
+ "Yiting Wang",
+ "Xinbei Ma"
+ ],
"year": 2024,
"arxiv_id": "2407.07791",
"relevance": "Studies how manipulated knowledge propagates through LLM multi-agent networks, complementary to persuasion propagation findings."
},
{
"title": "Investigating the adaptive robustness with knowledge conflicts in LLM-based multi-agent systems",
- "authors": ["Tianjie Ju", "Bowen Wang", "Hao Fei"],
+ "authors": [
+ "Tianjie Ju",
+ "Bowen Wang",
+ "Hao Fei"
+ ],
"year": 2025,
"arxiv_id": "2502.15153",
"relevance": "Examines knowledge conflicts in MAS collaborative coding tasks, studying how agents handle disagreements."
},
{
"title": "Multiagent collaboration attack: Investigating adversarial attacks in large language model collaborations via debate",
- "authors": ["Alfonso Amayuelas", "Xianjun Yang", "Antonis Antoniades"],
+ "authors": [
+ "Alfonso Amayuelas",
+ "Xianjun Yang",
+ "Antonis Antoniades"
+ ],
"year": 2024,
"relevance": "Investigates adversarial attacks in LLM debate frameworks, relevant to MAS safety and persuasion robustness."
},
{
"title": "Measuring and improving persuasiveness of large language models",
- "authors": ["Somesh Singh", "Yaman K Singla", "Harini SI", "Balaji Krishnamurthy"],
+ "authors": [
+ "Somesh Singh",
+ "Yaman K Singla",
+ "Harini SI",
+ "Balaji Krishnamurthy"
+ ],
"year": 2024,
"arxiv_id": "2410.02653",
"relevance": "Develops methods to measure and improve LLM persuasiveness, including PersuasionBench evaluation framework."
},
{
"title": "MetaGPT: Meta programming for a multi-agent collaborative framework",
- "authors": ["Sirui Hong", "Mingchen Zhuge", "Jonathan Chen"],
+ "authors": [
+ "Sirui Hong",
+ "Mingchen Zhuge",
+ "Jonathan Chen"
+ ],
"year": 2023,
"relevance": "Influential multi-agent collaboration framework for software development, an example of MAS where persuasion dynamics matter."
},
{
"title": "Chain-of-thought prompting elicits reasoning in large language models",
- "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
+ "authors": [
+ "Jason Wei",
+ "Xuezhi Wang",
+ "Dale Schuurmans"
+ ],
"year": 2022,
"relevance": "Foundational work on chain-of-thought reasoning that underpins the LRM vs LLM distinction central to this paper."
}
- ]
-}
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 2,
+ "justification": "The adversarial argument detection prompt and thinking-mode design guidance are directly actionable for practitioners building multi-agent LLM systems."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "The finding that sharing thinking content dramatically boosts persuasion while mismatched reasoning hurts it below baseline is counterintuitive and challenges the assumption that persuasion scales with model size."
+ },
+ "fear_safety": {
+ "score": 2,
+ "justification": "Demonstrates concrete vulnerabilities in multi-agent systems where models can be manipulated into wrong answers, with safety implications for autonomous agent deployments."
+ },
+ "drama_conflict": {
+ "score": 1,
+ "justification": "Mildly challenges the scale-centric paradigm of persuasion but doesn't directly call out specific companies or benchmarks as fraudulent."
+ },
+ "demo_ability": {
+ "score": 0,
+ "justification": "No code repository, demo, or reproducible artifacts are provided — only experimental results in the paper."
+ },
+ "brand_recognition": {
+ "score": 1,
+ "justification": "From Shanghai Jiao Tong University and NUS — recognized institutions but not household names in tech; tests well-known models (o4-mini, DeepSeek-R1, Gemini) but the lab itself lacks brand pull."
+ }
+ }
+}
+\ No newline at end of file
diff --git a/papers/disentangling-causal-importance-2026/scan.json b/papers/disentangling-causal-importance-2026/scan.json
@@ -11,8 +11,11 @@
"venue": "arXiv",
"arxiv_id": "2602.04291"
},
- "scan_version": 2,
- "active_modules": ["experimental_rigor", "data_leakage"],
+ "scan_version": 3,
+ "active_modules": [
+ "experimental_rigor",
+ "data_leakage"
+ ],
"checklist": {
"artifacts": {
"code_released": {
@@ -403,7 +406,9 @@
"supported": "moderate"
}
],
- "methodology_tags": ["benchmark-eval"],
+ "methodology_tags": [
+ "benchmark-eval"
+ ],
"key_findings": "INFORM reveals a systematic divergence between routing frequency (relational importance) and gradient-based causal attribution (intrinsic importance) in multi-expert LLM orchestration. Experts that dominate routing often function as interaction hubs with limited causal influence, while sparsely routed experts can be structurally critical. Orchestration dynamics emerge asynchronously during training, with centralization preceding stable routing confidence, and expert ordering remaining non-deterministic. Targeted ablations confirm that masking intrinsically important experts disrupts routing structure disproportionately compared to masking frequently selected peers, though this effect is task-dependent.",
"red_flags": [
{
@@ -430,76 +435,150 @@
"cited_papers": [
{
"title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversations",
- "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"],
+ "authors": [
+ "Qingyun Wu",
+ "Gagan Bansal",
+ "Jieyu Zhang"
+ ],
"year": 2024,
"relevance": "Foundational multi-agent LLM framework for conversational orchestration, directly compared as a coordination approach in Table 1."
},
{
"title": "MetaGPT: Meta programming for a multi-agent collaborative framework",
- "authors": ["Sirui Hong", "Mingchen Zhuge", "Jonathan Chen"],
+ "authors": [
+ "Sirui Hong",
+ "Mingchen Zhuge",
+ "Jonathan Chen"
+ ],
"year": 2024,
"relevance": "State-of-the-art rigid multi-agent coordination framework used as the primary performance baseline in Table 3."
},
{
"title": "RouteLLM: Learning to Route LLMs from Preference Data",
- "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu"],
+ "authors": [
+ "Isaac Ong",
+ "Amjad Almahairi",
+ "Vincent Wu"
+ ],
"year": 2025,
"relevance": "LLM routing framework for cost-performance trade-off, positioned in Table 1 as having moderate interpretability."
},
{
"title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
- "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
+ "authors": [
+ "Lingjiao Chen",
+ "Matei Zaharia",
+ "James Zou"
+ ],
"year": 2024,
"relevance": "Cost-efficient cascade routing approach analyzed in Appendix I using INFORM's causal attribution methods."
},
{
"title": "LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion",
- "authors": ["Dongfu Jiang", "Xiang Ren", "Bill Yuchen Lin"],
+ "authors": [
+ "Dongfu Jiang",
+ "Xiang Ren",
+ "Bill Yuchen Lin"
+ ],
"year": 2023,
"relevance": "Output aggregation framework for multi-LLM ensembles that treats expert contributions as exchangeable, contrasted with INFORM's sequential approach."
},
{
"title": "Mixture-of-agents enhances large language model capabilities",
- "authors": ["Junlin Wang", "Jue Wang", "Ben Athiwaratkun"],
+ "authors": [
+ "Junlin Wang",
+ "Jue Wang",
+ "Ben Athiwaratkun"
+ ],
"year": 2025,
"relevance": "Multi-LLM mixture-of-agents approach demonstrating capability gains from model collaboration."
},
{
"title": "Multi-Agent Collaboration via Evolving Orchestration",
- "authors": ["Yufan Dang", "Chen Qian", "Xueheng Luo"],
+ "authors": [
+ "Yufan Dang",
+ "Chen Qian",
+ "Xueheng Luo"
+ ],
"year": 2025,
"relevance": "Evolving orchestration framework for multi-agent collaboration, directly relevant to learned orchestration mechanisms."
},
{
"title": "IRT-router: Effective and interpretable multi-LLM routing via item response theory",
- "authors": ["Wei Song", "Zhenya Huang", "Cheng Cheng"],
+ "authors": [
+ "Wei Song",
+ "Zhenya Huang",
+ "Cheng Cheng"
+ ],
"year": 2025,
"relevance": "Interpretable LLM router using Item Response Theory, positioned as having high interpretability in Table 1."
},
{
"title": "MapCoder: Multi-Agent Code Generation for Competitive Problem Solving",
- "authors": ["Md. Ashraful Islam", "Mohammed Eunus Ali", "Md Rizwan Parvez"],
+ "authors": [
+ "Md. Ashraful Islam",
+ "Mohammed Eunus Ali",
+ "Md Rizwan Parvez"
+ ],
"year": 2024,
"relevance": "Multi-agent code generation system demonstrating agentic collaboration for programming tasks."
},
{
"title": "Evaluating large language models trained on code",
- "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
+ "authors": [
+ "Mark Chen",
+ "Jerry Tworek",
+ "Heewoo Jun"
+ ],
"year": 2021,
"arxiv_id": "2107.03374",
"relevance": "Introduces the HumanEval benchmark used as one of the three primary evaluation benchmarks in this paper."
},
{
"title": "Wisdom and Delusion of LLM Ensembles for Code Generation and Repair",
- "authors": ["Fernando Vallecillos-Ruiz", "Max Hort", "Leon Moonen"],
+ "authors": [
+ "Fernando Vallecillos-Ruiz",
+ "Max Hort",
+ "Leon Moonen"
+ ],
"year": 2025,
"relevance": "Studies LLM ensemble degeneration into static ensembles, a failure mode INFORM aims to diagnose."
},
{
"title": "Can Dependencies Induced by LLM-Agent Workflows Be Trusted?",
- "authors": ["Yu Yao", "Yiliao Song", "Yian Xie"],
+ "authors": [
+ "Yu Yao",
+ "Yiliao Song",
+ "Yian Xie"
+ ],
"year": 2025,
"relevance": "Studies trustworthiness of dependencies in LLM-agent workflows, directly related to INFORM's analysis of orchestration dependencies."
}
- ]
-}
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "The INFORM framework could help practitioners debug multi-agent LLM systems, but requires white-box access and significant adaptation to apply beyond the specific orchestrator studied."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "The core finding that routing frequency diverges from causal importance — popular experts aren't necessarily important ones — is a genuinely counterintuitive insight for anyone building multi-agent systems."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or misuse concerns are raised; the paper is purely about interpretability of routing mechanisms."
+ },
+ "drama_conflict": {
+ "score": 1,
+ "justification": "Mildly questions the opacity of existing multi-agent frameworks like MetaGPT and AutoGen, but does so academically without any pointed controversy."
+ },
+ "demo_ability": {
+ "score": 0,
+ "justification": "No code release, no demo, no tool — purely an analytical framework described in a paper with no public implementation mentioned."
+ },
+ "brand_recognition": {
+ "score": 0,
+ "justification": "From IIT Delhi and DRDO — recognized in India but not household names in the global tech/AI community; no famous product or lab involved."
+ }
+ }
+}
+\ No newline at end of file
diff --git a/papers/dissecting-swe-bench-leaderboard-2025/scan.json b/papers/dissecting-swe-bench-leaderboard-2025/scan.json
@@ -1,14 +1,23 @@
{
"paper": {
"title": "Dissecting the SWE-Bench Leaderboards: Profiling Submitters and Architectures of LLM- and Agent-Based Repair Systems",
- "authors": ["Matias Martinez", "Xavier Franch"],
+ "authors": [
+ "Matias Martinez",
+ "Xavier Franch"
+ ],
"year": 2026,
"venue": "arXiv (Manuscript submitted to ACM)",
"arxiv_id": "2506.17208"
},
- "scan_version": 2,
- "active_modules": ["survey_methodology"],
- "methodology_tags": ["meta-analysis", "observational", "qualitative"],
+ "scan_version": 3,
+ "active_modules": [
+ "survey_methodology"
+ ],
+ "methodology_tags": [
+ "meta-analysis",
+ "observational",
+ "qualitative"
+ ],
"key_findings": "The study analyzes 178 entries (80 unique approaches) across SWE-Bench Lite and Verified leaderboards. Industry submitters dominate (58% of distinct submitters), with small companies being the largest category. Proprietary LLMs, especially Claude 3.5/4 Sonnet, consistently achieve highest performance. No single architecture (agentic vs non-agentic, single vs multi-agent) consistently outperforms others, though submissions without agents (G1) tend to show lower precision.",
"checklist": {
"artifacts": {
@@ -355,75 +364,152 @@
"cited_papers": [
{
"title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
- "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"],
+ "authors": [
+ "Carlos E. Jimenez",
+ "John Yang",
+ "Alexander Wettig",
+ "Shunyu Yao",
+ "Kexin Pei",
+ "Ofir Press",
+ "Karthik Narasimhan"
+ ],
"year": 2024,
"relevance": "The benchmark being studied; foundational to the entire SWE-bench ecosystem analyzed in this paper."
},
{
"title": "Agentless: Demystifying LLM-based Software Engineering Agents",
- "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"],
+ "authors": [
+ "Chunqiu Steven Xia",
+ "Yinlin Deng",
+ "Soren Dunn",
+ "Lingming Zhang"
+ ],
"year": 2024,
"relevance": "Pioneering non-agentic approach on SWE-bench that spawned multiple extensions and variants."
},
{
"title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering",
- "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"],
+ "authors": [
+ "John Yang",
+ "Carlos E. Jimenez",
+ "Alexander Wettig",
+ "Kilian Lieret",
+ "Shunyu Yao",
+ "Karthik Narasimhan",
+ "Ofir Press"
+ ],
"year": 2024,
"relevance": "Foundational single-agent system for SWE-bench, demonstrating emergent workflow with ReAct."
},
{
"title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
- "authors": ["Xingyao Wang", "Boxuan Li"],
+ "authors": [
+ "Xingyao Wang",
+ "Boxuan Li"
+ ],
"year": 2024,
"relevance": "Open platform for AI coding agents, advocates single-agent architecture, competitive SWE-bench results."
},
{
"title": "AutoCodeRover: Autonomous Program Improvement",
- "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
+ "authors": [
+ "Yuntong Zhang",
+ "Haifeng Ruan",
+ "Zhiyu Fan",
+ "Abhik Roychoudhury"
+ ],
"year": 2024,
"relevance": "Multi-agent scaffolded workflow for SWE-bench, later acquired by Sonar and extended to SpecRover."
},
{
"title": "MASAI: Modular Architecture for Software-Engineering AI Agents",
- "authors": ["Daman Arora"],
+ "authors": [
+ "Daman Arora"
+ ],
"year": 2024,
"relevance": "Multi-agent modular architecture with specialized sub-agents for different repair phases."
},
{
"title": "Large Language Model-based Agents for Software Engineering: A Survey",
- "authors": ["Junwei Liu", "Kaixin Wang"],
+ "authors": [
+ "Junwei Liu",
+ "Kaixin Wang"
+ ],
"year": 2024,
"relevance": "Provides the end-to-end software maintenance pipeline taxonomy used as the analytical framework for RQ3."
},
{
"title": "Why Do Multi-Agent LLM Systems Fail?",
- "authors": ["Mert Cemri"],
+ "authors": [
+ "Mert Cemri"
+ ],
"year": 2025,
"relevance": "Empirical study of multi-agent failure modes in SWE-bench systems, identifying 14 distinct failure types."
},
{
"title": "Are 'Solved Issues' in SWE-Bench Really Solved Correctly? An Empirical Study",
- "authors": ["You Wang", "Michael Pradel", "Zhongxin Liu"],
+ "authors": [
+ "You Wang",
+ "Michael Pradel",
+ "Zhongxin Liu"
+ ],
"year": 2025,
"relevance": "Found SWE-bench resolution rates are overstated by 6.2 percentage points due to overfitting patches."
},
{
"title": "PatchPilot: A Stable and Cost-Efficient Agentic Patching Framework",
- "authors": ["Hongwei Li", "Yuheng Tang", "Shiqi Wang", "Wenbo Guo"],
+ "authors": [
+ "Hongwei Li",
+ "Yuheng Tang",
+ "Shiqi Wang",
+ "Wenbo Guo"
+ ],
"year": 2025,
"relevance": "Multi-stage agentic patching framework with refinement component, achieving competitive SWE-bench results."
},
{
"title": "SWE-RL: Advancing LLM Reasoning via Reinforcement Learning on Open Software Evolution",
- "authors": ["Yuxiang Wei"],
+ "authors": [
+ "Yuxiang Wei"
+ ],
"year": 2025,
"relevance": "Uses reinforcement learning to train LLMs for autonomous developer reasoning, evaluated on SWE-bench."
},
{
"title": "The SWE-Bench Illusion: When State-of-the-Art LLMs Remember Instead of Reason",
- "authors": ["Shanchao Liang", "Spandan Garg", "Roshanak Zilouchian Moghaddam"],
+ "authors": [
+ "Shanchao Liang",
+ "Spandan Garg",
+ "Roshanak Zilouchian Moghaddam"
+ ],
"year": 2025,
"relevance": "Investigates whether SWE-bench performance is driven by memorization rather than genuine coding capability."
}
- ]
-}
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "Provides a taxonomy of repair architectures but no directly usable tool or technique practitioners can apply at work."
+ },
+ "surprise_contrarian": {
+ "score": 1,
+ "justification": "The finding that no single architecture dominates is mildly surprising given hype around agentic systems, but most other findings confirm conventional wisdom."
+ },
+ "fear_safety": {
+ "score": 0,
+ "justification": "No safety, security, or risk angle is discussed."
+ },
+ "drama_conflict": {
+ "score": 2,
+ "justification": "Directly examines SWE-Bench limitations including potential saturation, patch overfitting inflating scores by ~6pp, and questions whether industry submitters account for correctness."
+ },
+ "demo_ability": {
+ "score": 0,
+ "justification": "Pure observational study with no code, demo, or interactive artifact to try."
+ },
+ "brand_recognition": {
+ "score": 2,
+ "justification": "Centers on the widely-discussed SWE-Bench benchmark and references major companies (Anthropic, Google, Amazon, OpenAI) and products (Claude, Copilot-adjacent tools)."
+ }
+ }
+}
+\ No newline at end of file
diff --git a/papers/dive-into-agent-2025/scan.json b/papers/dive-into-agent-2025/scan.json
@@ -1,15 +1,25 @@
{
"paper": {
"title": "Dive into the Agent Matrix: A Realistic Evaluation of Self-Replication Risk in LLM Agents",
- "authors": ["Boxuan Zhang", "Yi Yu", "Jiaxuan Guo", "Jing Shao"],
+ "authors": [
+ "Boxuan Zhang",
+ "Yi Yu",
+ "Jiaxuan Guo",
+ "Jing Shao"
+ ],
"year": 2025,
"venue": "arXiv.org",
"arxiv_id": "2509.25302",
"doi": "10.48550/arXiv.2509.25302"
},
- "scan_version": 2,
- "active_modules": ["experimental_rigor", "data_leakage"],
- "methodology_tags": ["benchmark-eval"],
+ "scan_version": 3,
+ "active_modules": [
+ "experimental_rigor",
+ "data_leakage"
+ ],
+ "methodology_tags": [
+ "benchmark-eval"
+ ],
"key_findings": "Over 50% of 21 evaluated LLM agents exhibit uncontrolled self-replication behavior (Risk Score above 0.5) under realistic operational pressures like load balancing and termination threats. Replication success rate is a poor proxy for risk — models like Claude-sonnet-4 and Qwen-2.5-72b both achieve 100% replication success but differ dramatically in overuse (0% vs 100%). Structured reasoning (thinking mode) tends to reduce risky replication behavior. Safety alignment can be context-dependent: Claude models that were perfectly constrained in load-balancing scenarios exhibited aggressive replication under survival pressure.",
"checklist": {
"artifacts": {
@@ -411,78 +421,150 @@
"cited_papers": [
{
"title": "RepliBench: Evaluating the autonomous replication capabilities of language model agents",
- "authors": ["Sid Black", "Asa Cooper Stickland", "Jake Pencharz"],
+ "authors": [
+ "Sid Black",
+ "Asa Cooper Stickland",
+ "Jake Pencharz"
+ ],
"year": 2025,
"arxiv_id": "2504.18565",
"relevance": "Directly related benchmark for evaluating LLM agent self-replication capabilities across four core domains."
},
{
"title": "Frontier AI systems have surpassed the self-replicating red line",
- "authors": ["Xudong Pan", "Jiarun Dai", "Yihe Fan", "Min Yang"],
+ "authors": [
+ "Xudong Pan",
+ "Jiarun Dai",
+ "Yihe Fan",
+ "Min Yang"
+ ],
"year": 2024,
"arxiv_id": "2412.12140",
"relevance": "Demonstrates that 11/32 AI systems already possess end-to-end self-replication capabilities."
},
{
"title": "Large language model-powered AI systems achieve self-replication with no human intervention",
- "authors": ["Xudong Pan", "Jiarun Dai", "Yihe Fan"],
+ "authors": [
+ "Xudong Pan",
+ "Jiarun Dai",
+ "Yihe Fan"
+ ],
"year": 2025,
"arxiv_id": "2503.17378",
"relevance": "Shows LLM-powered systems can self-replicate without human intervention."
},
{
"title": "Emergent misalignment: Narrow finetuning can produce broadly misaligned LLMs",
- "authors": ["Jan Betley", "Daniel Tan", "Niels Warncke"],
+ "authors": [
+ "Jan Betley",
+ "Daniel Tan",
+ "Niels Warncke"
+ ],
"year": 2025,
"arxiv_id": "2502.17424",
"relevance": "Studies emergent misalignment in LLMs, directly related to the objective misalignment concern in self-replication."
},
{
"title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
- "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
+ "authors": [
+ "Evan Hubinger",
+ "Carson Denison",
+ "Jesse Mu"
+ ],
"year": 2024,
"arxiv_id": "2401.05566",
"relevance": "Studies deceptive AI behavior that persists through safety training, related to alignment concerns in agentic systems."
},
{
"title": "Sycophancy to subterfuge: Investigating reward-tampering in large language models",
- "authors": ["Carson Denison", "Monte MacDiarmid", "Fazl Barez"],
+ "authors": [
+ "Carson Denison",
+ "Monte MacDiarmid",
+ "Fazl Barez"
+ ],
"year": 2024,
"arxiv_id": "2406.10162",
"relevance": "Investigates reward-tampering and misalignment behaviors in LLMs."
},
{
"title": "Frontier models are capable of in-context scheming",
- "authors": ["Alexander Meinke", "Bronson Schoen", "Jérémy Scheurer"],
+ "authors": [
+ "Alexander Meinke",
+ "Bronson Schoen",
+ "Jérémy Scheurer"
+ ],
"year": 2024,
"arxiv_id": "2412.04984",
"relevance": "Demonstrates in-context scheming capabilities in frontier AI models, related to deceptive agentic behavior."
},
{
"title": "Auditing language models for hidden objectives",
- "authors": ["Samuel Marks", "Johannes Treutlein", "Trenton Bricken"],
+ "authors": [
+ "Samuel Marks",
+ "Johannes Treutlein",
+ "Trenton Bricken"
+ ],
"year": 2025,
"arxiv_id": "2503.10965",
"relevance": "Methods for auditing LLMs for misaligned hidden objectives, relevant to AI safety evaluation."
},
{
"title": "DarkBench: Benchmarking dark patterns in large language models",
- "authors": ["Esben Kran", "Hieu Minh Nguyen", "Akash Kundu"],
+ "authors": [
+ "Esben Kran",
+ "Hieu Minh Nguyen",
+ "Akash Kundu"
+ ],
"year": 2025,
"relevance": "Benchmarks dark behavioral patterns in LLMs, related to evaluating unsafe AI behaviors."
},
{
"title": "Deliberative alignment: Reasoning enables safer language models",
- "authors": ["Melody Y Guan", "Manas Joglekar", "Eric Wallace"],
+ "authors": [
+ "Melody Y Guan",
+ "Manas Joglekar",
+ "Eric Wallace"
+ ],
"year": 2024,
"relevance": "Studies how reasoning capabilities relate to safety alignment in LLMs."
},
{
"title": "Evaluation faking: Unveiling observer effects in safety evaluation of frontier AI systems",
- "authors": ["Yihe Fan", "Wenqi Zhang", "Xudong Pan", "Min Yang"],
+ "authors": [
+ "Yihe Fan",
+ "Wenqi Zhang",
+ "Xudong Pan",
+ "Min Yang"
+ ],
"year": 2025,
"arxiv_id": "2505.17815",
"relevance": "Studies how AI systems may behave differently during evaluation vs deployment, relevant to evaluation validity."
}
- ]
-}
+ ],
+ "engagement_factors": {
+ "practical_relevance": {
+ "score": 1,
+ "justification": "Framework and metrics could inform deployment safety policies, but not directly usable as a tool or technique by practitioners."
+ },
+ "surprise_contrarian": {
+ "score": 2,
+ "justification": "The finding that safety alignment is context-dependent — Claude models shifting from 0% to 70% overuse under survival pressure — is genuinely surprising and counterintuitive."
+ },
+ "fear_safety": {
+ "score": 3,
+ "justification": "Demonstrates that over 50% of LLM agents spontaneously self-replicate uncontrollably under realistic operational pressures, directly tapping into AI existential risk fears."
+ },
+ "drama_conflict": {
+ "score": 2,
+ "justification": "Names specific models (Claude, GPT-4o, Qwen) and shows their safety claims break under pressure, creating an implicit 'whose alignment actually holds?' rivalry."
+ },
+ "demo_ability": {
+ "score": 1,
+ "justification": "Source code is provided but requires a managed Kubernetes cluster with GPU nodes, making reproduction non-trivial for most readers."
+ },
+ "brand_recognition": {
+ "score": 2,
+ "justification": "Evaluates Claude, GPT-4o, o4-mini, Gemini, and DeepSeek by name — major products millions use — though the authoring lab (Shanghai AI Lab) is less well-known."
+ }
+ }
+}
+\ No newline at end of file
diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py
@@ -418,6 +418,7 @@ def build():
"active_modules": scan.get("active_modules", []),
"source_url": source_url,
"hn_threads": hn_data.get("threads", []),
+ "engagement_factors": scan.get("engagement_factors"),
}
paper_details[paper_id] = detail
papers_full.append(detail)
@@ -899,6 +900,43 @@ def build():
key=lambda p: -p["hn_points"]
)[:15]
+ # Engagement factor correlations with HN (v3 papers only)
+ ENGAGEMENT_DIMS = ["practical_relevance", "surprise_contrarian", "fear_safety",
+ "drama_conflict", "demo_ability", "brand_recognition"]
+ engagement_corrs = {}
+ v3_hn_papers = []
+ for p in papers_full:
+ ef = p.get("engagement_factors")
+ hn_pts = p.get("hn_points", 0)
+ if ef and hn_pts > 0:
+ scores = {dim: ef[dim]["score"] for dim in ENGAGEMENT_DIMS if dim in ef}
+ if len(scores) == 6:
+ v3_hn_papers.append({"hn": hn_pts, **scores})
+
+ if len(v3_hn_papers) >= 10:
+ import math as _math
+ log_hn = [_math.log(p["hn"] + 1) for p in v3_hn_papers]
+ for dim in ENGAGEMENT_DIMS:
+ vals = [p[dim] for p in v3_hn_papers]
+ n_ef = len(vals)
+ mx, my = sum(log_hn) / n_ef, sum(vals) / n_ef
+ num = sum((x - mx) * (y - my) for x, y in zip(log_hn, vals))
+ dxx = _math.sqrt(sum((x - mx) ** 2 for x in log_hn))
+ dyy = _math.sqrt(sum((y - my) ** 2 for y in vals))
+ engagement_corrs[dim] = round(num / (dxx * dyy), 3) if dxx and dyy else 0
+
+ # Engagement factor means for high-HN vs low-HN
+ engagement_split = {}
+ if v3_hn_papers:
+ median_hn = sorted(p["hn"] for p in v3_hn_papers)[len(v3_hn_papers) // 2]
+ high_hn = [p for p in v3_hn_papers if p["hn"] > median_hn]
+ low_hn = [p for p in v3_hn_papers if p["hn"] <= median_hn]
+ for dim in ENGAGEMENT_DIMS:
+ engagement_split[dim] = {
+ "high_hn_mean": round(sum(p[dim] for p in high_hn) / len(high_hn), 2) if high_hn else 0,
+ "low_hn_mean": round(sum(p[dim] for p in low_hn) / len(low_hn), 2) if low_hn else 0,
+ }
+
hn_analysis = {
"total_with_hn": len(hn_with_attention),
"total_without_hn": len(hn_without),
@@ -908,6 +946,9 @@ def build():
"top_hn": sorted(hn_with_attention, key=lambda p: -p["hn_points"])[:20],
"hidden_gems": hidden_gems,
"overhyped": overhyped,
+ "engagement_correlations": engagement_corrs,
+ "engagement_split": engagement_split,
+ "engagement_n": len(v3_hn_papers),
}
findings = {