ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

commit a85920f8b970cf039362ba691b05a72e8439d3d1
parent d6d31c6cb0ff5d41b80f2e4eaeb2e992c3702dd8
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon, 23 Mar 2026 13:49:40 +0100

Fix catchup-v3 to read full paper.txt, not just scan summary

The cheap version (title + key_findings only) produced plausible but
ungrounded engagement scores. Now reads the full paper text so Opus
can assess demo-ability from actual URLs, practical relevance from
implementation details, and surprise from the full argument.

Reset 36 papers from cheap v3 back to v2 for re-processing.
Timeout bumped 120s → 300s for longer paper reads.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M papers/2025-ai-agent-2026/scan.json | 16 ++++++++--------
M papers/3dshape2vecset-3d-shape-2023/scan.json | 16 ++++++++--------
M papers/a2hcoder-llmdriven-coding-2025/scan.json | 12 ++++++------
M papers/aart-aiassisted-redteaming-2023/scan.json | 30 ++----------------------------
M papers/acar-adaptive-complexity-2026/scan.json | 30 ++----------------------------
M papers/agentic-bug-reproduction-2025/scan.json | 30 ++----------------------------
M papers/agentic-refactoring-empirical-2025/scan.json | 30 ++----------------------------
M papers/agents-of-chaos-2026/scan.json | 30 ++----------------------------
M papers/ai-ides-vs-agents-impact-2026/scan.json | 30 ++----------------------------
M papers/chain-of-thought-prompting-2022/scan.json | 30 ++----------------------------
M papers/codex-humaneval-2021/scan.json | 30 ++----------------------------
M papers/coding-agents-generating-2026/scan.json | 30 ++----------------------------
M papers/copilot-productivity-controlled-2023/scan.json | 30 ++----------------------------
M papers/datasetresearch-benchmarking-agent-2025/scan.json | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M papers/dear-diary-rct-copilot-2024/scan.json | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
M papers/declarative-agentic-layer-2026/scan.json | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M papers/decoding-ml-decision-2026/scan.json | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M papers/decomposed-prompting-modular-2022/scan.json | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M papers/deep-dive-into-2024/scan.json | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
M papers/deepcircuitx-comprehensive-repositorylevel-2025/scan.json | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
M papers/deepcode-open-agentic-2025/scan.json | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
M papers/deepcrceval-revisiting-evaluation-2024/scan.json | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
M papers/deepseek-coder-2024/scan.json | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
M papers/deepseek-coder-v2-2024/scan.json | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
M papers/deepseek-r1-2025/scan.json | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
M papers/defending-against-prompt-2025-2/scan.json | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
M papers/defending-against-prompt-2025/scan.json | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
M papers/defense-against-indirect-2026/scan.json | 112 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
M papers/defense-against-prompt-2024/scan.json | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
M scripts/catchup-v3.py | 12 +++++++++---
30 files changed, 1293 insertions(+), 540 deletions(-)

diff --git a/papers/2025-ai-agent-2026/scan.json b/papers/2025-ai-agent-2026/scan.json @@ -483,27 +483,27 @@ "engagement_factors": { "practical_relevance": { "score": 1, - "justification": "Useful as a reference for comparing agent platforms but doesn't provide actionable techniques practitioners can directly apply." + "justification": "Useful reference index for comparing AI agents but not a tool or technique practitioners can directly apply in their workflow." }, "surprise_contrarian": { "score": 2, - "justification": "The finding that 25/30 agents disclose no internal safety results and the 'safety washing' framing challenge the industry's public safety narratives." + "justification": "The stark numbers — 25/30 agents disclose no safety results, only 4/30 have agent-specific system cards — and the 'safety washing' claim challenge the industry's safety narrative." }, "fear_safety": { "score": 2, - "justification": "Safety is a major theme with concrete findings about transparency gaps, browser agents operating at high autonomy without evaluations, and foundation model concentration as single points of failure." + "justification": "Safety transparency gaps are a central theme with concrete data showing browser agents at L4-L5 autonomy lacking evaluations, plus documented prompt injection incidents." }, "drama_conflict": { "score": 2, - "justification": "Directly names specific companies and products as lacking safety transparency, with the 'safety washing' framing creating a clear 'emperor has no clothes' angle." + "justification": "Directly names companies with missing safety documentation, coins 'safety washing', and highlights Perplexity's robots.txt evasion and Amazon's legal threats." }, "demo_ability": { - "score": 1, - "justification": "The dataset of 1,350 annotations is presumably available but requires effort to explore; no interactive demo or tool to try." 
+ "score": 2, + "justification": "The full index is browsable at aiagentindex.mit.edu with structured data downloadable from Zenodo in JSON and CSV formats." }, "brand_recognition": { - "score": 2, - "justification": "Covers products millions use (ChatGPT, Claude, Gemini, Copilot) and names major companies, though the authors themselves are not from a famous lab." + "score": 3, + "justification": "Covers ChatGPT, Claude, Gemini, and Copilot; authors from MIT, Cambridge, Harvard, and Stanford with an MIT-hosted website." } } } \ No newline at end of file diff --git a/papers/3dshape2vecset-3d-shape-2023/scan.json b/papers/3dshape2vecset-3d-shape-2023/scan.json @@ -523,28 +523,28 @@ ], "engagement_factors": { "practical_relevance": { - "score": 2, - "justification": "Provides a usable 3D shape representation and generation pipeline applicable to graphics practitioners working with neural fields and diffusion models." + "score": 1, + "justification": "Useful for 3D graphics researchers but requires significant expertise and compute to adapt to production workflows." }, "surprise_contrarian": { - "score": 1, - "justification": "Coordinate-free latent set representation is a novel design choice but doesn't challenge any widely-held belief." + "score": 0, + "justification": "Confirms the expected trend that learned latent representations outperform hand-designed ones for 3D generation." }, "fear_safety": { "score": 0, - "justification": "No safety, security, or risk angle in 3D shape representation research." + "justification": "No safety, security, or risk implications in 3D shape representation research." }, "drama_conflict": { "score": 0, - "justification": "Straightforward benchmarking improvements with no controversy or challenges to specific companies." + "justification": "Straightforward incremental improvement over prior methods with no controversy or challenge to industry claims." 
}, "demo_ability": { "score": 1, - "justification": "SIGGRAPH paper likely has code released but 3D generation pipelines require significant GPU setup and dependencies." + "justification": "Code is released but requires multi-GPU training on ShapeNet data, making casual reproduction impractical." }, "brand_recognition": { "score": 1, - "justification": "TU Munich (Niessner) and KAUST (Wonka) are recognized in graphics but not household names in broader tech circles." + "justification": "KAUST and TU Munich are recognized in computer vision but are not household names in broader tech audiences." } } } \ No newline at end of file diff --git a/papers/a2hcoder-llmdriven-coding-2025/scan.json b/papers/a2hcoder-llmdriven-coding-2025/scan.json @@ -484,27 +484,27 @@ "engagement_factors": { "practical_relevance": { "score": 1, - "justification": "FPGA/HLS automation is useful for a narrow hardware engineering audience, but the approach is not generalizable or released as a tool." + "justification": "Relevant only to the narrow intersection of FPGA designers working with MATLAB-to-HLS flows, not broadly applicable to most developers." }, "surprise_contrarian": { "score": 1, - "justification": "The finding that algorithm-level restructuring matters more than direct LLM translation is mildly interesting but not deeply surprising." + "justification": "The finding that algorithm-level restructuring matters more than pragma tuning is known in the HLS community, though the magnitude (98% LUT reduction) is notable." }, "fear_safety": { "score": 0, - "justification": "No safety, security, or risk angle is present in this hardware design automation work." + "justification": "No safety, security, or risk angle whatsoever." }, "drama_conflict": { "score": 0, - "justification": "No controversy or conflict; the paper doesn't challenge any specific company's claims or popular beliefs." 
+ "justification": "No controversy, no challenges to specific companies or benchmarks, purely constructive contribution." }, "demo_ability": { "score": 0, - "justification": "No code, no demo, no reproducibility artifacts released, and the LLM model version isn't even specified." + "justification": "No code, no demo, no reproducibility artifacts released; requires proprietary FPGA toolchains even conceptually." }, "brand_recognition": { "score": 1, - "justification": "Unknown authors and lab; the mention of Claude Code adds slight recognition but the paper itself is from an obscure group." + "justification": "From University of Technology Sydney, a recognized but not famous-in-tech institution; mentions Claude Code but is not from Anthropic." } } } \ No newline at end of file diff --git a/papers/aart-aiassisted-redteaming-2023/scan.json b/papers/aart-aiassisted-redteaming-2023/scan.json @@ -11,7 +11,7 @@ "venue": "arXiv", "arxiv_id": "2311.08592" }, - "scan_version": 3, + "scan_version": 2, "active_modules": [], "methodology_tags": [ "case-study", @@ -421,31 +421,5 @@ "year": 2021, "relevance": "LLM risk taxonomy foundational to red-teaming and safety evaluation approaches." } - ], - "engagement_factors": { - "practical_relevance": { - "score": 2, - "justification": "Describes a structured red-teaming pipeline that safety teams could adapt for their own LLM application testing workflows." - }, - "surprise_contrarian": { - "score": 0, - "justification": "Confirms the expected finding that structured generation produces more diverse adversarial prompts than repurposing existing datasets." - }, - "fear_safety": { - "score": 1, - "justification": "Red-teaming is safety-adjacent but the paper focuses on the generation pipeline rather than demonstrating novel attacks or vulnerabilities." - }, - "drama_conflict": { - "score": 0, - "justification": "No controversy or conflict; it's a straightforward tool paper from Google presenting their internal methodology." 
- }, - "demo_ability": { - "score": 0, - "justification": "No code release, no demo, and the pipeline depends on Google's PaLM API with no public reproduction path." - }, - "brand_recognition": { - "score": 2, - "justification": "All authors are from Google Research and the work discusses enabling Google product launches." - } - } + ] } \ No newline at end of file diff --git a/papers/acar-adaptive-complexity-2026/scan.json b/papers/acar-adaptive-complexity-2026/scan.json @@ -7,7 +7,7 @@ "year": 2026, "arxiv_id": "2602.21231" }, - "scan_version": 3, + "scan_version": 2, "active_modules": [ "experimental_rigor", "data_leakage" @@ -485,31 +485,5 @@ "year": 2022, "relevance": "Attribution methodology for ML model contributions, relevant to ACAR's failed attribution proxy experiments." } - ], - "engagement_factors": { - "practical_relevance": { - "score": 1, - "justification": "The routing concept is potentially useful for multi-model deployments but the modest accuracy gains and specific benchmark setup limit immediate applicability." - }, - "surprise_contrarian": { - "score": 1, - "justification": "The negative results on retrieval augmentation and attribution proxies are mildly surprising but the core finding that simple routing can approximate ensembles is expected." - }, - "fear_safety": { - "score": 0, - "justification": "No safety, security, or risk angle is present in this work." - }, - "drama_conflict": { - "score": 0, - "justification": "No controversy or challenge to specific companies or widely-held beliefs; this is a straightforward systems optimization paper." - }, - "demo_ability": { - "score": 0, - "justification": "No mention of released code, demo, or reproducible tooling from a single unknown author." - }, - "brand_recognition": { - "score": 0, - "justification": "Single unknown author with no institutional affiliation mentioned and no venue publication." 
- } - } + ] } \ No newline at end of file diff --git a/papers/agentic-bug-reproduction-2025/scan.json b/papers/agentic-bug-reproduction-2025/scan.json @@ -1,5 +1,5 @@ { - "scan_version": 3, + "scan_version": 2, "active_modules": [ "experimental_rigor", "data_leakage" @@ -557,31 +557,5 @@ "year": 2023, "relevance": "LLM-as-a-Judge methodology used in this paper for sampling plausible BRTs for RQ2." } - ], - "engagement_factors": { - "practical_relevance": { - "score": 2, - "justification": "Demonstrates a concrete technique (agentic bug reproduction tests for APR) that practitioners could adapt, though the specific tooling is Google-internal." - }, - "surprise_contrarian": { - "score": 1, - "justification": "The finding that auto-generated bug reproduction tests improve APR is intuitive rather than surprising, though the 30% improvement magnitude is notable." - }, - "fear_safety": { - "score": 0, - "justification": "No safety, security, or risk angle in automated program repair research." - }, - "drama_conflict": { - "score": 1, - "justification": "The red flags around Google employees evaluating their own non-reproducible system invite mild skepticism but the paper doesn't challenge external claims." - }, - "demo_ability": { - "score": 0, - "justification": "Entirely built on Google-internal infrastructure, proprietary models, and private codebase with no public code or demo." - }, - "brand_recognition": { - "score": 2, - "justification": "Google authorship and 'at Google' in the title draws attention, plus it involves Gemini fine-tuning on production-scale codebases." 
- } - } + ] } \ No newline at end of file diff --git a/papers/agentic-refactoring-empirical-2025/scan.json b/papers/agentic-refactoring-empirical-2025/scan.json @@ -13,7 +13,7 @@ "venue": "arXiv preprint", "arxiv_id": "2511.04824" }, - "scan_version": 3, + "scan_version": 2, "active_modules": [], "methodology_tags": [ "observational" @@ -466,31 +466,5 @@ "year": 2024, "relevance": "Empirical study of ChatGPT refactoring showing inconsistency and unnecessary edits." } - ], - "engagement_factors": { - "practical_relevance": { - "score": 1, - "justification": "Findings about agent refactoring patterns are informative but don't provide actionable techniques developers can directly apply to their workflows." - }, - "surprise_contrarian": { - "score": 2, - "justification": "The finding that AI agents mostly do trivial renaming rather than meaningful architectural refactoring, and that smell counts don't actually improve, challenges the narrative that AI coding tools meaningfully improve code quality." - }, - "fear_safety": { - "score": 0, - "justification": "No safety, security, or risk angle in this empirical software engineering study." - }, - "drama_conflict": { - "score": 1, - "justification": "Mildly deflating for AI coding tool hype by showing agents do shallow refactoring with negligible quality impact, but doesn't directly challenge any company's claims." - }, - "demo_ability": { - "score": 0, - "justification": "Mining study with no tool, demo, or code artifact that others can try." - }, - "brand_recognition": { - "score": 1, - "justification": "Involves OpenAI Codex prominently but authors are from less widely-known academic institutions, and the paper is an arXiv preprint not from a major lab." 
- } - } + ] } \ No newline at end of file diff --git a/papers/agents-of-chaos-2026/scan.json b/papers/agents-of-chaos-2026/scan.json @@ -45,7 +45,7 @@ "venue": "arXiv", "arxiv_id": "2602.20021" }, - "scan_version": 3, + "scan_version": 2, "active_modules": [], "checklist": { "artifacts": { @@ -564,31 +564,5 @@ "year": 2023, "relevance": "Enumerates seven operational practices for safe agent deployment including constrained action spaces, human approval, logging, and interruptibility — several of which this paper's agents demonstrably lack." } - ], - "engagement_factors": { - "practical_relevance": { - "score": 2, - "justification": "The 11 vulnerability categories and three structural deficits provide actionable security checklists for anyone deploying autonomous agents in multi-user environments." - }, - "surprise_contrarian": { - "score": 2, - "justification": "The finding that social attack surfaces (display name spoofing, emotional manipulation) pose greater threats than technical jailbreaks challenges the focus on prompt injection as the primary agent risk." - }, - "fear_safety": { - "score": 3, - "justification": "Concrete demonstrations of full system takeover via identity spoofing, unauthorized data exfiltration of 124 email records, and persistent behavioral control through memory injection are viscerally alarming for anyone considering agent deployment." - }, - "drama_conflict": { - "score": 2, - "justification": "Directly demonstrates that Claude Opus 4.6 and Kimi K2.5 agents fail basic security checks in deployed settings, with Kimi additionally censoring politically sensitive topics — naming specific products and their failures." - }, - "demo_ability": { - "score": 1, - "justification": "The OpenClaw framework exists but replicating the multi-agent Discord deployment with 20 participants over two weeks requires significant infrastructure and coordination." 
- }, - "brand_recognition": { - "score": 2, - "justification": "Tests Claude Opus 4.6 (Anthropic's flagship) and Kimi K2.5 (Moonshot AI), both recognizable names, though the authors themselves and the OpenClaw framework are less well-known." - } - } + ] } \ No newline at end of file diff --git a/papers/ai-ides-vs-agents-impact-2026/scan.json b/papers/ai-ides-vs-agents-impact-2026/scan.json @@ -1,5 +1,5 @@ { - "scan_version": 3, + "scan_version": 2, "active_modules": [], "paper": { "title": "AI IDEs or Autonomous Agents? Measuring the Impact of Coding Agents on Software Development", @@ -459,31 +459,5 @@ "year": 2021, "relevance": "Methodological foundation — the imputation-based DiD estimator used as the primary causal inference method." } - ], - "engagement_factors": { - "practical_relevance": { - "score": 2, - "justification": "Directly informs engineering managers' decisions about adopting coding agents vs AI IDEs, with quantified trade-offs on velocity and code quality." - }, - "surprise_contrarian": { - "score": 3, - "justification": "Directly contradicts the hype that AI coding agents universally boost productivity — shows gains vanish if you already use an AI IDE, while quality degradation persists regardless." - }, - "fear_safety": { - "score": 1, - "justification": "Raises concerns about agent-induced technical debt (+18% warnings, +39% complexity) but this is a maintainability risk rather than a safety or security vulnerability." - }, - "drama_conflict": { - "score": 2, - "justification": "Challenges the narrative pushed by AI coding tool vendors that agents deliver compounding productivity gains, revealing a speed-maintainability trade-off and diminishing returns." - }, - "demo_ability": { - "score": 0, - "justification": "Observational study with no tool, demo, or code artifact that practitioners can try themselves." 
- }, - "brand_recognition": { - "score": 1, - "justification": "CMU authors are well-recognized in software engineering research but not household names; the topic (coding agents like Copilot/Cursor) is famous but the paper itself doesn't come from those companies." - } - } + ] } \ No newline at end of file diff --git a/papers/chain-of-thought-prompting-2022/scan.json b/papers/chain-of-thought-prompting-2022/scan.json @@ -16,7 +16,7 @@ "venue": "NeurIPS 2022", "arxiv_id": "2201.11903" }, - "scan_version": 3, + "scan_version": 2, "active_modules": [ "experimental_rigor", "data_leakage" @@ -552,31 +552,5 @@ "arxiv_id": "2108.07732", "relevance": "Evaluates LLMs for code generation, related to using intermediate steps in program synthesis." } - ], - "engagement_factors": { - "practical_relevance": { - "score": 3, - "justification": "Chain-of-thought prompting is a directly usable technique that any developer can apply immediately to their LLM prompts with zero tooling changes." - }, - "surprise_contrarian": { - "score": 2, - "justification": "The finding that simply adding reasoning steps to prompts unlocks dramatic performance gains—and that this is an emergent property of scale—was genuinely surprising when published." - }, - "fear_safety": { - "score": 0, - "justification": "The paper focuses on improving reasoning capabilities with no discussion of safety risks or misuse potential." - }, - "drama_conflict": { - "score": 0, - "justification": "No controversy or conflict; the paper presents a new technique without challenging specific claims or rivals." - }, - "demo_ability": { - "score": 2, - "justification": "Anyone with API access to a large language model can reproduce the technique immediately by modifying their prompts, though the flagship results require access to 100B+ parameter models." 
- }, - "brand_recognition": { - "score": 2, - "justification": "Google Brain is a major research lab, and the paper features PaLM and GPT-3, both well-known models in the AI community." - } - } + ] } \ No newline at end of file diff --git a/papers/codex-humaneval-2021/scan.json b/papers/codex-humaneval-2021/scan.json @@ -65,7 +65,7 @@ "venue": "arXiv", "arxiv_id": "2107.03374" }, - "scan_version": 3, + "scan_version": 2, "active_modules": [ "experimental_rigor", "data_leakage" @@ -629,31 +629,5 @@ "year": 2020, "relevance": "Training dataset for GPT-Neo and GPT-J baselines, containing 8% GitHub code that enables programming capabilities." } - ], - "engagement_factors": { - "practical_relevance": { - "score": 3, - "justification": "Codex directly powers GitHub Copilot, making this the foundational paper for a tool millions of developers use daily." - }, - "surprise_contrarian": { - "score": 2, - "justification": "The effectiveness of repeated sampling (0% → 77.5% with 100 samples) and the finding that BLEU scores are unreliable for code were genuinely surprising at publication." - }, - "fear_safety": { - "score": 1, - "justification": "The paper mentions insecure code generation and misalignment worsening with scale, but these are secondary to the main capability story." - }, - "drama_conflict": { - "score": 1, - "justification": "OpenAI evaluating its own commercial product raises mild conflict-of-interest concerns, but the paper doesn't directly challenge competitors." - }, - "demo_ability": { - "score": 2, - "justification": "HumanEval benchmark is publicly released and Codex was accessible via API, though the model weights and training data were not released." - }, - "brand_recognition": { - "score": 3, - "justification": "OpenAI paper about the model powering GitHub Copilot — two of the most recognized names in AI and developer tools." 
- } - } + ] } \ No newline at end of file diff --git a/papers/coding-agents-generating-2026/scan.json b/papers/coding-agents-generating-2026/scan.json @@ -10,7 +10,7 @@ "arxiv_id": "2602.00409", "doi": "10.1145/3793302.3793362" }, - "scan_version": 3, + "scan_version": 2, "active_modules": [], "methodology_tags": [ "observational" @@ -446,31 +446,5 @@ "arxiv_id": "2507.10422", "relevance": "Study of how developers self-report LLM usage in open-source, complementary approach to agent trace mining." } - ], - "engagement_factors": { - "practical_relevance": { - "score": 2, - "justification": "Directly relevant to teams using coding agents like Copilot/Cursor for testing — actionable insight to review and constrain agent mocking behavior." - }, - "surprise_contrarian": { - "score": 2, - "justification": "The finding that agents use mocks 95% of the time while ignoring fakes and spies is a surprising narrowness that challenges assumptions about agent sophistication." - }, - "fear_safety": { - "score": 0, - "justification": "No safety or security angle — this is about code quality, not risk." - }, - "drama_conflict": { - "score": 1, - "justification": "Mildly questions the quality of AI-generated tests but stops short of naming specific tools or making strong 'agents are harmful' claims." - }, - "demo_ability": { - "score": 0, - "justification": "Observational study with no tool, demo, or code artifact that practitioners can try." - }, - "brand_recognition": { - "score": 1, - "justification": "Published at MSR (respected but niche venue) by authors from recognized but not headline-grabbing institutions; topic touches well-known tools without naming them." 
- } - } + ] } \ No newline at end of file diff --git a/papers/copilot-productivity-controlled-2023/scan.json b/papers/copilot-productivity-controlled-2023/scan.json @@ -11,7 +11,7 @@ "venue": "arXiv", "arxiv_id": "2302.06590" }, - "scan_version": 3, + "scan_version": 2, "active_modules": [ "experimental_rigor" ], @@ -489,31 +489,5 @@ "year": 2022, "relevance": "Research agenda for studying economic impacts of AI code generation, directly framing this paper's contribution." } - ], - "engagement_factors": { - "practical_relevance": { - "score": 2, - "justification": "Directly measures the productivity impact of a tool millions of developers already use or are considering adopting." - }, - "surprise_contrarian": { - "score": 1, - "justification": "The 55.8% speed improvement is larger than most expected but the direction (Copilot helps) confirms prevailing beliefs." - }, - "fear_safety": { - "score": 0, - "justification": "No safety, security, or risk angle is present in this productivity study." - }, - "drama_conflict": { - "score": 2, - "justification": "Microsoft/GitHub authors evaluating their own product with a non-representative sample and 63% attrition invites strong 'the study is compromised' discourse." - }, - "demo_ability": { - "score": 1, - "justification": "Anyone with GitHub Copilot access can try it, but the study itself provides no code, demo, or reproducible experimental setup." - }, - "brand_recognition": { - "score": 3, - "justification": "GitHub Copilot is used by millions of developers and the study is from Microsoft Research, making it instantly recognizable." 
- } - } + ] } \ No newline at end of file diff --git a/papers/datasetresearch-benchmarking-agent-2025/scan.json b/papers/datasetresearch-benchmarking-agent-2025/scan.json @@ -1,15 +1,28 @@ { "paper": { "title": "DatasetResearch: Benchmarking Agent Systems for Demand-Driven Dataset Discovery", - "authors": ["Keyu Li", "Mohan Jiang", "Dayuan Fu", "Yunze Wu", "Xiangkun Hu", "Dequan Wang", "Pengfei Liu"], + "authors": [ + "Keyu Li", + "Mohan Jiang", + "Dayuan Fu", + "Yunze Wu", + "Xiangkun Hu", + "Dequan Wang", + "Pengfei Liu" + ], "year": 2025, "venue": "arXiv", "arxiv_id": "2508.06960", "doi": "10.48550/arXiv.2508.06960" }, "scan_version": 2, - "active_modules": ["experimental_rigor", "data_leakage"], - "methodology_tags": ["benchmark-eval"], + "active_modules": [ + "experimental_rigor", + "data_leakage" + ], + "methodology_tags": [ + "benchmark-eval" + ], "key_findings": "DATASETRESEARCH benchmark with 208 real-world dataset demands reveals a stark performance gap: even advanced deep research systems achieve only 22% on the challenging pro subset. Search agents excel at knowledge-based tasks (42% fine-tuning score) while synthesis agents dominate reasoning tasks (73%). All current methods catastrophically fail on corner cases outside existing data distributions.", "checklist": { "artifacts": { @@ -406,70 +419,114 @@ "cited_papers": [ { "title": "DataFinder: Scientific dataset recommendation from natural language descriptions", - "authors": ["Vijay Viswanathan", "Luyu Gao", "Tongshuang Wu", "Pengfei Liu", "Graham Neubig"], + "authors": [ + "Vijay Viswanathan", + "Luyu Gao", + "Tongshuang Wu", + "Pengfei Liu", + "Graham Neubig" + ], "year": 2023, "arxiv_id": "2305.16636", "relevance": "Prior work on automated dataset recommendation from natural language queries." 
}, { "title": "Better synthetic data by retrieving and transforming existing datasets", - "authors": ["Saumya Gandhi", "Ritu Gala", "Vijay Viswanathan", "Tongshuang Wu", "Graham Neubig"], + "authors": [ + "Saumya Gandhi", + "Ritu Gala", + "Vijay Viswanathan", + "Tongshuang Wu", + "Graham Neubig" + ], "year": 2024, "arxiv_id": "2404.14361", "relevance": "Dataset transformation techniques for repurposing existing datasets, a precursor to synthesis-based approaches." }, { "title": "DeepResearcher: Scaling deep research via reinforcement learning in real-world environments", - "authors": ["Yuxiang Zheng", "Dayuan Fu", "Xiangkun Hu"], + "authors": [ + "Yuxiang Zheng", + "Dayuan Fu", + "Xiangkun Hu" + ], "year": 2025, "arxiv_id": "2504.03160", "relevance": "Deep research agent using RL, evaluated in this benchmark." }, { "title": "SWE-bench: Can language models resolve real-world GitHub issues?", - "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig"], + "authors": [ + "Carlos E Jimenez", + "John Yang", + "Alexander Wettig" + ], "year": 2023, "arxiv_id": "2310.06770", "relevance": "Major benchmark for evaluating LLM coding agents on real-world tasks." }, { "title": "SWE-smith: Scaling data for software engineering agents", - "authors": ["John Yang", "Kilian Leret", "Carlos E Jimenez"], + "authors": [ + "John Yang", + "Kilian Leret", + "Carlos E Jimenez" + ], "year": 2025, "arxiv_id": "2504.21798", "relevance": "Data scaling for SE agents, directly relevant to AI-assisted software engineering research." }, { "title": "ScienceAgentBench: Toward rigorous assessment of language agents for data-driven scientific discovery", - "authors": ["Ziru Chen", "Shijie Chen", "Yuting Ning"], + "authors": [ + "Ziru Chen", + "Shijie Chen", + "Yuting Ning" + ], "year": 2024, "arxiv_id": "2410.05080", "relevance": "Benchmark for scientific discovery agents with rigorous evaluation methodology." 
}, { "title": "Judging LLM-as-a-judge with MT-bench and Chatbot Arena", - "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], + "authors": [ + "Lianmin Zheng", + "Wei-Lin Chiang", + "Ying Sheng" + ], "year": 2023, "relevance": "Foundational work on LLM-as-judge evaluation methodology used in this paper's metadata scoring." }, { "title": "Training language models to follow instructions with human feedback", - "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"], + "authors": [ + "Long Ouyang", + "Jeffrey Wu", + "Xu Jiang" + ], "year": 2022, "relevance": "RLHF instruction tuning methodology used as basis for the fine-tuning evaluation approach." }, { "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning", - "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"], + "authors": [ + "Daya Guo", + "Dejian Yang", + "Haowei Zhang" + ], "year": 2025, "arxiv_id": "2501.12948", "relevance": "RL-based reasoning model relevant to agentic AI capabilities." }, { "title": "Are emergent abilities of large language models a mirage?", - "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], + "authors": [ + "Rylan Schaeffer", + "Brando Miranda", + "Sanmi Koyejo" + ], "year": 2023, "relevance": "Analysis of LLM scaling properties, cited for long-context attention limitations affecting few-shot results." 
} ] -} +} +\ No newline at end of file diff --git a/papers/dear-diary-rct-copilot-2024/scan.json b/papers/dear-diary-rct-copilot-2024/scan.json @@ -1,14 +1,22 @@ { "paper": { "title": "Dear Diary: A randomized controlled trial of Generative AI coding tools in the workplace", - "authors": ["Jenna Butler", "Jina Suh", "Sankeerti Haniyur", "Constance Hadley"], + "authors": [ + "Jenna Butler", + "Jina Suh", + "Sankeerti Haniyur", + "Constance Hadley" + ], "year": 2024, "venue": "arXiv", "arxiv_id": "2410.18334" }, "scan_version": 2, "active_modules": [], - "methodology_tags": ["rct", "qualitative"], + "methodology_tags": [ + "rct", + "qualitative" + ], "key_findings": "A 3-week RCT of GitHub Copilot at a large software company (N=106 final) found that first-time use significantly increased beliefs that GenAI tools are useful (p=0.001) and enjoyable (p<0.0001), but did not change trust in AI-generated code. No statistically significant changes were found in telemetry metrics (code changes, PRs, development time). 84% of treatment participants reported positive changes in daily work practices, and developers found unexpected uses such as replacing web search and creative ideation.", "checklist": { "artifacts": { @@ -345,62 +353,111 @@ "cited_papers": [ { "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot", - "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"], + "authors": [ + "Sida Peng", + "Eirini Kalliamvakou", + "Peter Cihon", + "Mert Demirer" + ], "year": 2023, "arxiv_id": "2302.06590", "relevance": "Seminal Copilot RCT showing 55% productivity gain in lab setting — the study this paper builds on and contrasts with." 
}, { "title": "The Impact of AI Tool on Engineering at ANZ Bank: An Empirical Study on GitHub Copilot within Corporate Environment", - "authors": ["Sayan Chatterjee", "Ching Louis Liu", "Gareth Rowland", "Tim Hogarth"], + "authors": [ + "Sayan Chatterjee", + "Ching Louis Liu", + "Gareth Rowland", + "Tim Hogarth" + ], "year": 2024, "relevance": "Corporate Copilot productivity study finding 42.36% improvement, relevant to real-world GenAI tool evaluation." }, { "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions", - "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan", "Brendan Dolan-Gavitt", "Ramesh Karri"], + "authors": [ + "Hammond Pearce", + "Baleegh Ahmad", + "Benjamin Tan", + "Brendan Dolan-Gavitt", + "Ramesh Karri" + ], "year": 2022, "doi": "10.1109/SP46214.2022.9833571", "relevance": "Found ~40% of Copilot-generated programs contained security vulnerabilities — key evidence on AI code quality risks." }, { "title": "GitHub Copilot AI pair programmer: Asset or Liability?", - "authors": ["Arghavan Moradi Dakhel", "Vahid Majdinasab", "Amin Nikanjam", "Foutse Khomh", "Michel C. Desmarais", "Zhen Ming (Jack) Jiang"], + "authors": [ + "Arghavan Moradi Dakhel", + "Vahid Majdinasab", + "Amin Nikanjam", + "Foutse Khomh", + "Michel C. Desmarais", + "Zhen Ming (Jack) Jiang" + ], "year": 2023, "doi": "10.1016/j.jss.2023.111734", "relevance": "Compared GenAI vs human solutions to fundamental coding problems, finding humans still outperform but AI bugs are easier to fix." 
}, { "title": "Taking Flight with Copilot: Early insights and opportunities of AI-powered pair-programming tools", - "authors": ["Christian Bird", "Denae Ford", "Thomas Zimmermann", "Nicole Forsgren", "Eirini Kalliamvakou", "Travis Lowdermilk", "Idan Gazit"], + "authors": [ + "Christian Bird", + "Denae Ford", + "Thomas Zimmermann", + "Nicole Forsgren", + "Eirini Kalliamvakou", + "Travis Lowdermilk", + "Idan Gazit" + ], "year": 2023, "doi": "10.1145/3582083", "relevance": "Early qualitative insights on Copilot adoption and AI pair programming in practice." }, { "title": "Using AI-Based Coding Assistants in Practice: State of Affairs, Perceptions, and Ways Forward", - "authors": ["Agnia Sergeyuk", "Yaroslav Golubev", "Timofey Bryksin", "Iftekhar Ahmed"], + "authors": [ + "Agnia Sergeyuk", + "Yaroslav Golubev", + "Timofey Bryksin", + "Iftekhar Ahmed" + ], "year": 2024, "relevance": "Survey of developer perceptions of AI coding assistants, relevant to beliefs and adoption research." }, { "title": "Is GitHub copilot a substitute for human pair-programming? An empirical study", - "authors": ["Saki Imai"], + "authors": [ + "Saki Imai" + ], "year": 2022, "doi": "10.1145/3510454.3522684", "relevance": "Compared Copilot vs human pair programming, finding Copilot increases code quantity but not quality." }, { "title": "Transforming Software Development: Evaluating the Efficiency and Challenges of GitHub Copilot in Real-World Projects", - "authors": ["Ruchika Pandey", "Prabhat Singh", "Raymond Wei", "Shaila Shankar"], + "authors": [ + "Ruchika Pandey", + "Prabhat Singh", + "Raymond Wei", + "Shaila Shankar" + ], "year": 2024, "relevance": "Evaluated Copilot in real codebases finding 33-50% time savings depending on task complexity." 
}, { "title": "Practices and Challenges of Using GitHub Copilot: An Empirical Study", - "authors": ["Beiqi Zhang", "Peng Liang", "Xiyu (Thomas) Zhou", "Aakash Ahmad", "Muhammad Waseem"], + "authors": [ + "Beiqi Zhang", + "Peng Liang", + "Xiyu (Thomas) Zhou", + "Aakash Ahmad", + "Muhammad Waseem" + ], "year": 2023, "relevance": "Analyzed Stack Overflow and GitHub Discussions for Copilot usage patterns, benefits, and limitations." } ] -} +} +\ No newline at end of file diff --git a/papers/declarative-agentic-layer-2026/scan.json b/papers/declarative-agentic-layer-2026/scan.json @@ -1,7 +1,12 @@ { "paper": { "title": "Towards a Declarative Agentic Layer for Intelligent Agents in MCP-Based Server Ecosystems", - "authors": ["María Jesús Rodríguez-Sánchez", "Manuel Noguera", "Ángel Ruiz-Zafra", "Kawtar Benghazi"], + "authors": [ + "María Jesús Rodríguez-Sánchez", + "Manuel Noguera", + "Ángel Ruiz-Zafra", + "Kawtar Benghazi" + ], "year": 2026, "venue": "arXiv", "arxiv_id": "2601.17435", @@ -9,7 +14,9 @@ }, "scan_version": 2, "active_modules": [], - "methodology_tags": ["theoretical"], + "methodology_tags": [ + "theoretical" + ], "key_findings": "The paper proposes DALIA, a declarative architectural layer for MCP-based agentic systems that formalises capabilities, tasks, and agent directories to enable deterministic task graph construction. It argues that current MAS failures stem from lack of architectural structure rather than model limitations, citing empirical work showing 41-86% failure rates in existing frameworks. 
The architecture is illustrated through a restaurant booking scenario but not empirically evaluated.", "claims": [ { @@ -324,68 +331,103 @@ "cited_papers": [ { "title": "Why do multi-agent LLM systems fail?", - "authors": ["Eren Cemri", "Zhiyang Wu", "Zekun Liu", "Yanda Chen"], + "authors": [ + "Eren Cemri", + "Zhiyang Wu", + "Zekun Liu", + "Yanda Chen" + ], "year": 2025, "arxiv_id": "2501.07353", "relevance": "Empirical taxonomy of MAS failure modes across 1,642 executions of seven frameworks — directly relevant to agentic AI reliability." }, { "title": "CODER: Issue resolving with multi-agent and task graphs", - "authors": ["Dong Chen", "Shaoxin Lin", "Muhan Zeng"], + "authors": [ + "Dong Chen", + "Shaoxin Lin", + "Muhan Zeng" + ], "year": 2024, "arxiv_id": "2406.01304", "relevance": "Multi-agent task graph approach for software engineering issue resolution." }, { "title": "MetaGPT: Meta programming for multi-agent collaborative framework", - "authors": ["Sheng Hong", "Cheng Yang"], + "authors": [ + "Sheng Hong", + "Cheng Yang" + ], "year": 2023, "arxiv_id": "2308.00352", "relevance": "Prominent multi-agent framework for software development." }, { "title": "ChatDev: Communicative agents for software development", - "authors": ["Chenxi Qian", "Lei Han"], + "authors": [ + "Chenxi Qian", + "Lei Han" + ], "year": 2023, "arxiv_id": "2307.07924", "relevance": "Multi-agent software development framework using communicative agents." }, { "title": "AgentVerse: Facilitating multi-agent collaboration and exploring emergent behaviors", - "authors": ["Weize Chen", "Yusheng Su"], + "authors": [ + "Weize Chen", + "Yusheng Su" + ], "year": 2023, "relevance": "Multi-agent collaboration framework relevant to agentic AI evaluation." 
}, { "title": "Toolformer: Language models can teach themselves to use tools", - "authors": ["Timo Schick", "Jane Dwivedi-Yu"], + "authors": [ + "Timo Schick", + "Jane Dwivedi-Yu" + ], "year": 2024, "relevance": "Foundational work on LLM tool use, directly relevant to agentic AI capabilities." }, { "title": "LLM-based multi-agent systems for software engineering: Literature review, vision, and the road ahead", - "authors": ["Junda He", "Christoph Treude", "David Lo"], + "authors": [ + "Junda He", + "Christoph Treude", + "David Lo" + ], "year": 2025, "relevance": "Survey of LLM-based MAS for software engineering, identifying reliability and grounding limitations." }, { "title": "Generative to agentic AI: Survey, conceptualization, and challenges", - "authors": ["Jonas Schneider", "Chittaranjan Marpaka", "Patric Tegehall"], + "authors": [ + "Jonas Schneider", + "Chittaranjan Marpaka", + "Patric Tegehall" + ], "year": 2024, "relevance": "Survey covering the transition from generative to agentic AI paradigms." }, { "title": "AFlow: Large language models as multi-agent system engineers", - "authors": ["Leyang Zhang", "Bowen Zhang"], + "authors": [ + "Leyang Zhang", + "Bowen Zhang" + ], "year": 2024, "arxiv_id": "2502.14321", "relevance": "LLM-generated multi-agent systems; cited as example of incoherent automated MAS generation." }, { "title": "MCPEval: Automatic MCP-based deep evaluation for AI agent models", - "authors": ["Yanlin Liu", "Chen Qiao"], + "authors": [ + "Yanlin Liu", + "Chen Qiao" + ], "year": 2025, "relevance": "Evaluation framework for MCP-based AI agents." 
} ] -} +} +\ No newline at end of file diff --git a/papers/decoding-ml-decision-2026/scan.json b/papers/decoding-ml-decision-2026/scan.json @@ -1,14 +1,33 @@ { "paper": { "title": "Decoding ML Decision: An Agentic Reasoning Framework for Large-Scale Ranking System", - "authors": ["Longfei Yun", "Yihan Wu", "Haoran Liu", "Xiaoxuan Liu", "Ziyun Xu", "Yi Wang", "Yang Xia", "Pengfei Wang", "Mingze Gao", "Yunxiang Wang", "Changfan Chen", "Junfeng Pan"], + "authors": [ + "Longfei Yun", + "Yihan Wu", + "Haoran Liu", + "Xiaoxuan Liu", + "Ziyun Xu", + "Yi Wang", + "Yang Xia", + "Pengfei Wang", + "Mingze Gao", + "Yunxiang Wang", + "Changfan Chen", + "Junfeng Pan" + ], "year": 2026, "venue": "arXiv", "arxiv_id": "2602.18640" }, "scan_version": 2, - "active_modules": ["experimental_rigor", "data_leakage"], - "methodology_tags": ["benchmark-eval", "case-study"], + "active_modules": [ + "experimental_rigor", + "data_leakage" + ], + "methodology_tags": [ + "benchmark-eval", + "case-study" + ], "key_findings": "GEARS, an agentic framework for ranking optimization at Meta, achieves 94% Top-1 accuracy on policy selection tasks compared to 77% for Code-as-Action and 68% for Chain-of-Thought baselines. Ablation shows that bash-based deterministic filtering is the most critical component. The framework was deployed across 9 product surfaces at Meta showing measurable metric improvements, and incorporates feature stability governance to filter out brittle policies.", "checklist": { "artifacts": { @@ -418,66 +437,106 @@ "cited_papers": [ { "title": "Self-Refine: Iterative Refinement with Self-Feedback", - "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"], + "authors": [ + "Aman Madaan", + "Niket Tandon", + "Prakhar Gupta" + ], "year": 2023, "relevance": "Key baseline for LLM self-improvement through iterative feedback, relevant to agentic AI methodology." 
}, { "title": "Reflexion: Language Agents with Verbal Reinforcement Learning", - "authors": ["Noah Shinn", "Federico Cassano", "Beck Labash"], + "authors": [ + "Noah Shinn", + "Federico Cassano", + "Beck Labash" + ], "year": 2023, "relevance": "Foundational work on LLM agents with reflective memory, directly related to agentic AI capabilities." }, { "title": "MemGPT: Towards LLMs as Operating Systems", - "authors": ["Charles Packer", "Vivian Fang", "Shishir G. Patil"], + "authors": [ + "Charles Packer", + "Vivian Fang", + "Shishir G. Patil" + ], "year": 2023, "arxiv_id": "2310.08560", "relevance": "OS-inspired memory management for LLMs, relevant to agentic architecture and context management." }, { "title": "Executable Code Actions Elicit Better LLM Agents", - "authors": ["Xingyao Wang", "Yangyi Chen", "Lifan Yuan"], + "authors": [ + "Xingyao Wang", + "Yangyi Chen", + "Lifan Yuan" + ], "year": 2024, "relevance": "Code-as-Action baseline showing code generation improves LLM agent reliability, key comparison for agentic frameworks." }, { "title": "Why Do Multi-Agent LLM Systems Fail?", - "authors": ["Mert Cemri", "Melissa Z Pan", "Shuyi Yang"], + "authors": [ + "Mert Cemri", + "Melissa Z Pan", + "Shuyi Yang" + ], "year": 2025, "relevance": "Analysis of multi-agent LLM system failures, directly relevant to understanding agentic system reliability." }, { "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", - "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"], + "authors": [ + "Jason Wei", + "Xuezhi Wang", + "Dale Schuurmans" + ], "year": 2022, "relevance": "Foundational prompting technique used as baseline, core to LLM capability evaluation." 
}, { "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models", - "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"], + "authors": [ + "Xuezhi Wang", + "Jason Wei", + "Dale Schuurmans" + ], "year": 2022, "arxiv_id": "2203.11171", "relevance": "Sampling-based reasoning improvement method used as baseline in agentic system evaluation." }, { "title": "Augmented Language Models: A Survey", - "authors": ["G. Mialon", "Roberto Dessì", "M. Lomeli"], + "authors": [ + "G. Mialon", + "Roberto Dessì", + "M. Lomeli" + ], "year": 2023, "relevance": "Survey of tool-augmented LLMs, providing context for agentic AI capabilities and tool integration." }, { "title": "ToolRL: Reward is All Tool Learning Needs", - "authors": ["Cheng Qian", "Emre Can Acikgoz", "Qi He"], + "authors": [ + "Cheng Qian", + "Emre Can Acikgoz", + "Qi He" + ], "year": 2025, "arxiv_id": "2504.13958", "relevance": "RL-based tool learning for LLMs, relevant to agentic tool use and reasoning capabilities." }, { "title": "A Survey of Context Engineering for Large Language Models", - "authors": ["Lingrui Mei", "Jiayu Yao", "Yuyao Ge"], + "authors": [ + "Lingrui Mei", + "Jiayu Yao", + "Yuyao Ge" + ], "year": 2025, "relevance": "Survey of context engineering techniques that GEARS builds upon for its progressive disclosure strategy." 
} ] -} +} +\ No newline at end of file diff --git a/papers/decomposed-prompting-modular-2022/scan.json b/papers/decomposed-prompting-modular-2022/scan.json @@ -16,8 +16,13 @@ "doi": "10.48550/arXiv.2210.02406" }, "scan_version": 2, - "active_modules": ["experimental_rigor", "data_leakage"], - "methodology_tags": ["benchmark-eval"], + "active_modules": [ + "experimental_rigor", + "data_leakage" + ], + "methodology_tags": [ + "benchmark-eval" + ], "key_findings": "Decomposed Prompting (DECOMP) outperforms Chain-of-Thought and Least-to-Most prompting across symbolic reasoning, long-context QA, and open-domain multi-hop QA by decomposing complex tasks into modular sub-task handlers with independent few-shot prompts. The approach enables hierarchical decomposition (for hard sub-tasks like kth letter extraction), recursive decomposition (achieving near-perfect length generalization on list reversal), and seamless integration of external tools like Elasticsearch retrieval. A simple post-processing decomposition for answer extraction in math QA yields 14-17 point improvements over standard CoT.", "checklist": { "artifacts": { @@ -424,82 +429,141 @@ "cited_papers": [ { "title": "Chain of thought prompting elicits reasoning in large language models", - "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma", "Ed Chi", "Quoc Le", "Denny Zhou"], + "authors": [ + "Jason Wei", + "Xuezhi Wang", + "Dale Schuurmans", + "Maarten Bosma", + "Ed Chi", + "Quoc Le", + "Denny Zhou" + ], "year": 2022, "relevance": "Foundational chain-of-thought prompting method that DECOMP extends; key baseline for all experiments." }, { "title": "Language models are few-shot learners", - "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"], + "authors": [ + "Tom Brown", + "Benjamin Mann", + "Nick Ryder" + ], "year": 2020, "relevance": "GPT-3 paper establishing few-shot prompting capabilities that underpin this work." 
}, { "title": "Least-to-most prompting enables complex reasoning in large language models", - "authors": ["Denny Zhou", "Nathanael Scharli", "Le Hou", "Jason Wei"], + "authors": [ + "Denny Zhou", + "Nathanael Scharli", + "Le Hou", + "Jason Wei" + ], "year": 2023, "relevance": "Closest prior work to DECOMP; generates sub-questions from easiest to hardest. Direct baseline comparison." }, { "title": "Successive prompting for decomposing complex questions", - "authors": ["Dheeru Dua", "Shivanshu Gupta", "Sameer Singh", "Matt Gardner"], + "authors": [ + "Dheeru Dua", + "Shivanshu Gupta", + "Sameer Singh", + "Matt Gardner" + ], "year": 2022, "relevance": "Similar decomposition approach using sequential prompting for question answering; key comparison point." }, { "title": "PAL: Program-aided language models", - "authors": ["Luyu Gao", "Aman Madaan", "Shuyan Zhou"], + "authors": [ + "Luyu Gao", + "Aman Madaan", + "Shuyan Zhou" + ], "year": 2022, "arxiv_id": "2211.10435", "relevance": "Combines LLMs with symbolic computation for reasoning tasks; related modular approach to LLM problem-solving." }, { "title": "Toolformer: Language models can teach themselves to use tools", - "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessi"], + "authors": [ + "Timo Schick", + "Jane Dwivedi-Yu", + "Roberto Dessi" + ], "year": 2023, "arxiv_id": "2302.04761", "relevance": "Tool-augmented LLMs that learn to call external APIs; related approach to integrating external functions." }, { "title": "Self-consistency improves chain of thought reasoning in language models", - "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans", "Quoc Le", "Ed Chi", "Denny Zhou"], + "authors": [ + "Xuezhi Wang", + "Jason Wei", + "Dale Schuurmans", + "Quoc Le", + "Ed Chi", + "Denny Zhou" + ], "year": 2023, "relevance": "Alternative method for improving CoT reasoning via sampling consistency; complementary to decomposition approach." 
}, { "title": "Training verifiers to solve math word problems", - "authors": ["Karl Cobbe", "Vineet Kosaraju", "Mohammad Bavarian"], + "authors": [ + "Karl Cobbe", + "Vineet Kosaraju", + "Mohammad Bavarian" + ], "year": 2021, "arxiv_id": "2110.14168", "relevance": "GSM8K benchmark used for evaluation; training verifiers for mathematical reasoning." }, { "title": "PaLM: Scaling language modeling with pathways", - "authors": ["Aakanksha Chowdhery", "Sharan Narang", "Jacob Devlin"], + "authors": [ + "Aakanksha Chowdhery", + "Sharan Narang", + "Jacob Devlin" + ], "year": 2022, "arxiv_id": "2204.02311", "relevance": "Large-scale LLM demonstrating chain-of-thought capabilities; related to the scaling of prompting approaches." }, { "title": "Language model cascades", - "authors": ["David Dohan", "Winnie Xu", "Aitor Lewkowycz"], + "authors": [ + "David Dohan", + "Winnie Xu", + "Aitor Lewkowycz" + ], "year": 2022, "arxiv_id": "2207.10342", "relevance": "Theoretical framework for composing LLMs as probabilistic programs; formalizes the cascade structure DECOMP uses." }, { "title": "Training language models to follow instructions with human feedback", - "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"], + "authors": [ + "Long Ouyang", + "Jeff Wu", + "Xu Jiang" + ], "year": 2022, "relevance": "InstructGPT paper for the text-davinci-002 model used as the primary LLM in DECOMP experiments." }, { "title": "Measuring and narrowing the compositionality gap in language models", - "authors": ["Ofir Press", "Muru Zhang", "Sewon Min", - "Ludwig Schmidt", "Noah A Smith", "Mike Lewis"], + "authors": [ + "Ofir Press", + "Muru Zhang", + "Sewon Min", + "Ludwig Schmidt", + "Noah A Smith", + "Mike Lewis" + ], "year": 2022, "arxiv_id": "2210.03350", "relevance": "Analyzes compositional reasoning limitations in LLMs; DECOMP addresses by decomposing compositional tasks." 
} ] -} +} +\ No newline at end of file diff --git a/papers/deep-dive-into-2024/scan.json b/papers/deep-dive-into-2024/scan.json @@ -15,8 +15,14 @@ "doi": "10.48550/arXiv.2411.01414" }, "scan_version": 2, - "active_modules": ["experimental_rigor", "data_leakage"], - "methodology_tags": ["benchmark-eval", "qualitative"], + "active_modules": [ + "experimental_rigor", + "data_leakage" + ], + "methodology_tags": [ + "benchmark-eval", + "qualitative" + ], "key_findings": "Identifies 17 types of non-syntactic mistakes in LLM-generated code (10 new vs prior work) across GPT-4 and Qwen2.5-Coder on HumanEval-X and MBXP datasets. Six underlying reasons are identified through manual analysis and semi-automated verification, with 'Misleading Coding Question Specification' accounting for 56% of mistakes. GPT-4 with ReAct prompting achieves F1 of 0.78 for automated reason identification, though positional sensitivity remains poorly detected (F1=0.65).", "checklist": { "artifacts": { @@ -423,82 +429,118 @@ "cited_papers": [ { "title": "Evaluating large language models trained on code", - "authors": ["Mark Chen", "Jerry Tworek"], + "authors": [ + "Mark Chen", + "Jerry Tworek" + ], "year": 2021, "arxiv_id": "2107.03374", "relevance": "Introduced HumanEval benchmark and Codex model, foundational for LLM code generation evaluation." }, { "title": "GPT-4 technical report", - "authors": ["Josh Achiam"], + "authors": [ + "Josh Achiam" + ], "year": 2023, "arxiv_id": "2303.08774", "relevance": "Technical report for GPT-4, one of the two models evaluated in this study." }, { "title": "Automated repair of programs from large language models", - "authors": ["Zhiyu Fan", "Xiang Gao", "Martin Mirchev"], + "authors": [ + "Zhiyu Fan", + "Xiang Gao", + "Martin Mirchev" + ], "year": 2023, "relevance": "Prior work identifying error categories in LLM-generated code from LeetCode problems with Codex." 
}, { "title": "An Empirical Study of Code Generation Errors made by Large Language Models", - "authors": ["Da Song", "Zijie Zhou"], + "authors": [ + "Da Song", + "Zijie Zhou" + ], "year": 2023, "relevance": "Identified syntactic and semantic mistakes in LLM-generated code on HumanEval, a direct predecessor to this study." }, { "title": "Bugs in large language models generated code: An empirical study", - "authors": ["Florian Tambon", "Arghavan Moradi-Dakhel"], + "authors": [ + "Florian Tambon", + "Arghavan Moradi-Dakhel" + ], "year": 2025, "relevance": "Examined 333 bugs from LLM-generated code identifying ten mistake categories, directly compared in this paper." }, { "title": "Large language models and simple, stupid bugs", - "authors": ["Kevin Jesse", "Toufique Ahmed"], + "authors": [ + "Kevin Jesse", + "Toufique Ahmed" + ], "year": 2023, "relevance": "Found LLMs produce SStuBs due to training data quality issues, informing the training-induced mistake hypothesis." }, { "title": "LLM hallucinations in practical code generation: Phenomena, mechanism, and mitigation", - "authors": ["Ziyao Zhang", "Yanlin Wang"], + "authors": [ + "Ziyao Zhang", + "Yanlin Wang" + ], "year": 2024, "arxiv_id": "2409.20550", "relevance": "Categorized eight types of code generation mistakes from CoderEval, directly compared in this study." }, { "title": "Keep the Conversation Going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT", - "authors": ["Steven Xia", "Lingming Zhang"], + "authors": [ + "Steven Xia", + "Lingming Zhang" + ], "year": 2023, "arxiv_id": "2304.00385", "relevance": "CHATREPAIR automated program repair tool used as core methodology for fixing incorrect LLM-generated code." 
}, { "title": "React: Synergizing reasoning and acting in language models", - "authors": ["Shunyu Yao", "Jeffrey Zhao"], + "authors": [ + "Shunyu Yao", + "Jeffrey Zhao" + ], "year": 2022, "arxiv_id": "2210.03629", "relevance": "ReAct prompting technique used for the best-performing reason identification approach in this study." }, { "title": "ClarifyGPT: Empowering LLM-based Code Generation with Intention Clarification", - "authors": ["Fangwen Mu", "Lin Shi"], + "authors": [ + "Fangwen Mu", + "Lin Shi" + ], "year": 2023, "arxiv_id": "2310.10996", "relevance": "Addressed ambiguous coding requirements as a cause of LLM mistakes, related to the specification ambiguity finding." }, { "title": "Qwen2.5-coder technical report", - "authors": ["Binyuan Hui", "Jian Yang"], + "authors": [ + "Binyuan Hui", + "Jian Yang" + ], "year": 2024, "arxiv_id": "2409.12186", "relevance": "Technical report for Qwen2.5-Coder, one of the two models evaluated in this study." }, { "title": "Do Large Language Models Pay Similar Attention Like Human Programmers When Generating Code?", - "authors": ["Bonan Kou", "Shengmai Chen"], + "authors": [ + "Bonan Kou", + "Shengmai Chen" + ], "year": 2024, "relevance": "Analyzed attention patterns in LLM code generation, related to the positional sensitivity finding in this study." } ] -} +} +\ No newline at end of file diff --git a/papers/deepcircuitx-comprehensive-repositorylevel-2025/scan.json b/papers/deepcircuitx-comprehensive-repositorylevel-2025/scan.json @@ -23,8 +23,13 @@ "doi": "10.1109/ICLAD65226.2025.00029" }, "scan_version": 2, - "active_modules": ["experimental_rigor", "data_leakage"], - "methodology_tags": ["benchmark-eval"], + "active_modules": [ + "experimental_rigor", + "data_leakage" + ], + "methodology_tags": [ + "benchmark-eval" + ], "key_findings": "DeepCircuitX provides a multi-level (repo/file/module/block) RTL dataset with 4,000+ projects and Chain-of-Thought annotations generated by GPT-4 and Claude. 
Fine-tuning LLMs on this dataset substantially improves RTL code understanding (BLEU-4 from ~0.1 to ~13.7), code generation (Pass@1 up to 24.14% on RTLLM), and PPA prediction (area MAPE down to 0.33). Delay prediction remains challenging with MAPE of 3.5-4.7 even at full training data, indicating early-stage timing estimation is an open problem.", "checklist": { "artifacts": { @@ -439,83 +444,136 @@ "cited_papers": [ { "title": "RTL-Repo: A Benchmark for Evaluating LLMs on Large-Scale RTL Design Projects", - "authors": ["A. Allam", "M. Shalan"], + "authors": [ + "A. Allam", + "M. Shalan" + ], "year": 2024, "arxiv_id": "2405.17378", "relevance": "Benchmark for evaluating LLMs on RTL design projects, directly comparable to DeepCircuitX's goals." }, { "title": "Benchmarking Large Language Models for Automated Verilog RTL Code Generation", - "authors": ["S. Thakur", "B. Ahmad", "Z. Fan", "H. Pearce", "B. Tan", "R. Karri", "B. Dolan-Gavitt", "S. Garg"], + "authors": [ + "S. Thakur", + "B. Ahmad", + "Z. Fan", + "H. Pearce", + "B. Tan", + "R. Karri", + "B. Dolan-Gavitt", + "S. Garg" + ], "year": 2023, "relevance": "Benchmark evaluation of LLMs for Verilog code generation, one of the foundational papers in LLM-for-hardware." }, { "title": "Data is All You Need: Finetuning LLMs for Chip Design via an Automated Design-Data Augmentation Framework", - "authors": ["K. Chang", "K. Wang", "N. Yang"], + "authors": [ + "K. Chang", + "K. Wang", + "N. Yang" + ], "year": 2024, "arxiv_id": "2403.11202", "relevance": "Data augmentation framework for chip design LLMs, addressing the same data quality challenge as DeepCircuitX." }, { "title": "RTLLM: An Open-Source Benchmark for Design RTL Generation with Large Language Model", - "authors": ["Y. Lu", "S. Liu", "Q. Zhang", "Z. Xie"], + "authors": [ + "Y. Lu", + "S. Liu", + "Q. Zhang", + "Z. Xie" + ], "year": 2024, "relevance": "One of the two evaluation benchmarks used in this paper for RTL code generation with LLMs." 
}, { "title": "VerilogEval: Evaluating Large Language Models for Verilog Code Generation", - "authors": ["M. Liu", "N. Pinckney", "B. Khailany", "H. Ren"], + "authors": [ + "M. Liu", + "N. Pinckney", + "B. Khailany", + "H. Ren" + ], "year": 2023, "relevance": "The other evaluation benchmark used in this paper for Verilog code generation evaluation." }, { "title": "Code Llama: Open Foundation Models for Code", - "authors": ["B. Roziere", "J. Gehring"], + "authors": [ + "B. Roziere", + "J. Gehring" + ], "year": 2023, "arxiv_id": "2308.12950", "relevance": "Open-source code LLM used as one of the base models fine-tuned on DeepCircuitX." }, { "title": "CodeT5+: Open Code Large Language Models for Code Understanding and Generation", - "authors": ["Y. Wang", "H. Le", "A. D. Gotmare"], + "authors": [ + "Y. Wang", + "H. Le", + "A. D. Gotmare" + ], "year": 2023, "arxiv_id": "2305.07922", "relevance": "Code understanding and generation LLM used as a base model in the fine-tuning experiments." }, { "title": "DeepSeek-Coder-V2: Breaking the Barrier of Closed-Source Models in Code Intelligence", - "authors": ["Q. Zhu", "D. Guo", "Z. Shao"], + "authors": [ + "Q. Zhu", + "D. Guo", + "Z. Shao" + ], "year": 2024, "arxiv_id": "2406.11931", "relevance": "Code intelligence LLM that showed strong baseline performance on RTL tasks even before fine-tuning." }, { "title": "CodeV: Empowering LLMs for Verilog Generation through Multi-Level Summarization", - "authors": ["Y. Zhao", "D. Huang", "C. Li"], + "authors": [ + "Y. Zhao", + "D. Huang", + "C. Li" + ], "year": 2024, "arxiv_id": "2407.10424", "relevance": "LLM specifically fine-tuned for Verilog generation, used as a comparison baseline." }, { "title": "VeriGen: A Large Language Model for Verilog Code Generation", - "authors": ["S. Thakur", "B. Ahmad", "H. Pearce"], + "authors": [ + "S. Thakur", + "B. Ahmad", + "H. 
Pearce" + ], "year": 2024, "relevance": "LLM fine-tuned on Verilog datasets, representing the prior state of LLM-based hardware code generation." }, { "title": "MG-Verilog: Multi-Grained Dataset Towards Enhanced LLM-Assisted Verilog Generation", - "authors": ["Y. Zhang", "Z. Yu", "Y. Fu"], + "authors": [ + "Y. Zhang", + "Z. Yu", + "Y. Fu" + ], "year": 2024, "arxiv_id": "2407.01910", "relevance": "Multi-grained Verilog dataset for LLM training, a direct competitor/comparable dataset to DeepCircuitX." }, { "title": "Origen: Enhancing RTL Code Generation with Code-to-Code Augmentation and Self-Reflection", - "authors": ["F. Cui", "C. Yin", "K. Zhou"], + "authors": [ + "F. Cui", + "C. Yin", + "K. Zhou" + ], "year": 2024, "arxiv_id": "2407.16237", "relevance": "RTL code generation approach using augmentation and self-reflection, relevant to LLM-based hardware design." } ] -} +} +\ No newline at end of file diff --git a/papers/deepcode-open-agentic-2025/scan.json b/papers/deepcode-open-agentic-2025/scan.json @@ -14,8 +14,13 @@ "doi": "10.48550/arXiv.2512.07921" }, "scan_version": 2, - "active_modules": ["experimental_rigor", "data_leakage"], - "methodology_tags": ["benchmark-eval"], + "active_modules": [ + "experimental_rigor", + "data_leakage" + ], + "methodology_tags": [ + "benchmark-eval" + ], "key_findings": "DeepCode is a multi-stage framework for document-to-repository synthesis that orchestrates blueprint distillation, stateful code memory, retrieval-augmented generation, and automated verification. On PaperBench Code-Dev (20 ICML papers), DeepCode reports 73.5% replication score, outperforming commercial agents (Cursor 58.4%, Claude Code 58.7%) and claiming to surpass PhD-level human experts (75.9 vs 72.4 on a 3-paper subset). 
Ablation studies show CodeRAG provides up to 70% relative gain on weaker models and CodeMem prevents context saturation in multi-file generation.", "checklist": { "artifacts": { @@ -430,84 +435,127 @@ "cited_papers": [ { "title": "PaperBench: Evaluating AI's Ability to Replicate AI Research", - "authors": ["Giulio Starace", "Oliver Jaffe", "Dane Sherburn"], + "authors": [ + "Giulio Starace", + "Oliver Jaffe", + "Dane Sherburn" + ], "year": 2025, "arxiv_id": "2504.01848", "relevance": "Key benchmark used for evaluation; measures AI systems' ability to reproduce ML research papers as code repositories." }, { "title": "Paper2Code: Automating Code Generation from Scientific Papers in Machine Learning", - "authors": ["Minju Seo", "Jinheon Baek", "Seongyun Lee", "Sung Ju Hwang"], + "authors": [ + "Minju Seo", + "Jinheon Baek", + "Seongyun Lee", + "Sung Ju Hwang" + ], "year": 2025, "arxiv_id": "2504.17192", "relevance": "Direct baseline; multi-agent paper-to-code framework achieving 51.1% on PaperBench." }, { "title": "ChatDev: Communicative Agents for Software Development", - "authors": ["Chen Qian", "Wei Liu"], + "authors": [ + "Chen Qian", + "Wei Liu" + ], "year": 2024, "arxiv_id": "2307.07924", "relevance": "Multi-agent software development framework simulating organizational structures." }, { "title": "MetaGPT: Meta Programming for a Multi-Agent Collaborative Framework", - "authors": ["Sirui Hong", "Mingchen Zhuge"], + "authors": [ + "Sirui Hong", + "Mingchen Zhuge" + ], "year": 2024, "arxiv_id": "2308.00352", "relevance": "Multi-agent collaborative framework for software engineering tasks." }, { "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering", - "authors": ["John Yang", "Carlos E. Jimenez"], + "authors": [ + "John Yang", + "Carlos E. Jimenez" + ], "year": 2025, "relevance": "Agent-computer interface design for LLM-based software engineering agents; foundational work in agentic coding." 
}, { "title": "The AI Scientist: Towards Fully Automated Open-Ended Scientific Discovery", - "authors": ["Chris Lu", "Cong Lu", "Robert Tjarko Lange"], + "authors": [ + "Chris Lu", + "Cong Lu", + "Robert Tjarko Lange" + ], "year": 2024, "arxiv_id": "2408.06292", "relevance": "Fully automated scientific discovery system including experimental code generation and execution." }, { "title": "AlphaEvolve: A Coding Agent for Scientific and Algorithmic Discovery", - "authors": ["Alexander Novikov"], + "authors": [ + "Alexander Novikov" + ], "year": 2025, "arxiv_id": "2506.13131", "relevance": "LLM-based evolutionary code generation for algorithmic discovery." }, { "title": "CodeScientist: End-to-End Semi-Automated Scientific Discovery with Code-Based Experimentation", - "authors": ["Peter Jansen", "Oyvind Tafjord"], + "authors": [ + "Peter Jansen", + "Oyvind Tafjord" + ], "year": 2025, "arxiv_id": "2503.22708", "relevance": "Iterative code generation for scientific experiments with generate-execute-reflect cycle." }, { "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot", - "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"], + "authors": [ + "Sida Peng", + "Eirini Kalliamvakou", + "Peter Cihon", + "Mert Demirer" + ], "year": 2023, "arxiv_id": "2302.06590", "relevance": "Empirical study of AI coding assistant impact on developer productivity." }, { "title": "AgentCoder: Multi-Agent-Based Code Generation with Iterative Testing and Optimisation", - "authors": ["Dong Huang", "Jie M. Zhang"], + "authors": [ + "Dong Huang", + "Jie M. Zhang" + ], "year": 2024, "arxiv_id": "2312.13010", "relevance": "Multi-agent code generation with test-driven refinement loop." 
}, { "title": "AI-Researcher: Autonomous Scientific Innovation", - "authors": ["Jiabin Tang", "Lianghao Xia", "Zhonghang Li", "Chao Huang"], + "authors": [ + "Jiabin Tang", + "Lianghao Xia", + "Zhonghang Li", + "Chao Huang" + ], "year": 2025, "relevance": "Multi-stage autonomous research agent with code generation and experimental execution; shares authors with DeepCode." }, { "title": "ToolGen: Unified Tool Retrieval and Calling via Generation", - "authors": ["Renxi Wang", "Xudong Han"], + "authors": [ + "Renxi Wang", + "Xudong Han" + ], "year": 2025, "relevance": "Integrates tool-specific knowledge into LLM parameters for seamless tool invocation during code generation." } ] -} +} +\ No newline at end of file diff --git a/papers/deepcrceval-revisiting-evaluation-2024/scan.json b/papers/deepcrceval-revisiting-evaluation-2024/scan.json @@ -17,8 +17,14 @@ "doi": "10.48550/arXiv.2412.18291" }, "scan_version": 2, - "active_modules": ["experimental_rigor", "data_leakage"], - "methodology_tags": ["benchmark-eval", "qualitative"], + "active_modules": [ + "experimental_rigor", + "data_leakage" + ], + "methodology_tags": [ + "benchmark-eval", + "qualitative" + ], "key_findings": "Less than 10% of benchmark comments in major code review datasets (3% in Tufano, 8% in CodeReviewer) meet all quality, category, tone, and context criteria for automation references. The proposed DeepCRCEval framework using 9 domain-specific criteria provides better discrimination than text similarity metrics. LLM evaluators reduce evaluation time by 88.78% and cost by 90.32% compared to human evaluators while maintaining commendable reliability. A training-free GPT-4-based LLM-Reviewer substantially outperforms all existing trained CRCGs (Tufano et al., CommentFinder, CodeReviewer, AUGER, CCT5) on the proposed criteria.", "checklist": { "artifacts": { @@ -420,77 +426,141 @@ "cited_papers": [ { "title": "Using pre-trained models to boost code review automation", - "authors": ["R. Tufano", "S. 
Masiero", "A. Mastropaolo", "L. Pascarella", "D. Poshyvanyk", "G. Bavota"], + "authors": [ + "R. Tufano", + "S. Masiero", + "A. Mastropaolo", + "L. Pascarella", + "D. Poshyvanyk", + "G. Bavota" + ], "year": 2022, "relevance": "Foundational work on DNN-based code review comment generation using T5, key baseline evaluated in this paper." }, { "title": "Automating code review activities by large-scale pre-training", - "authors": ["Z. Li", "S. Lu", "D. Guo", "N. Duan"], + "authors": [ + "Z. Li", + "S. Lu", + "D. Guo", + "N. Duan" + ], "year": 2022, "relevance": "CodeReviewer model — major CRCG baseline with code-review-specific pre-training, evaluated and found lacking by DeepCRCEval." }, { "title": "CCT5: A code-change-oriented pre-trained model", - "authors": ["B. Lin", "S. Wang", "Z. Liu", "Y. Liu", "X. Xia", "X. Mao"], + "authors": [ + "B. Lin", + "S. Wang", + "Z. Liu", + "Y. Liu", + "X. Xia", + "X. Mao" + ], "year": 2023, "relevance": "Code-change-oriented pre-trained model for review comment generation, baseline in this study." }, { "title": "AUGER: automatically generating review comments with pre-training models", - "authors": ["L. Li", "L. Yang", "H. Jiang", "J. Yan"], + "authors": [ + "L. Li", + "L. Yang", + "H. Jiang", + "J. Yan" + ], "year": 2022, "relevance": "Pre-training-based code review comment generator using review tags, baseline in this study." }, { "title": "CommentFinder: a simpler, faster, more accurate code review comments recommendation", - "authors": ["Y. Hong", "C. Tantithamthavorn", "P. Thongtanunam", "A. Aleti"], + "authors": [ + "Y. Hong", + "C. Tantithamthavorn", + "P. Thongtanunam", + "A. Aleti" + ], "year": 2022, "relevance": "Retrieval-based code review comment system, demonstrating alternative to generative approaches." }, { "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena", - "authors": ["L. Zheng", "W.L. Chiang", "Y. Sheng"], + "authors": [ + "L. Zheng", + "W.L. Chiang", + "Y. 
Sheng" + ], "year": 2023, "relevance": "Foundational work on using LLMs as evaluators, showing GPT-4 agreement with humans surpasses inter-human agreement. Basis for DeepCRCEval's LLM evaluator design." }, { "title": "Llama-Reviewer: Advancing code review automation with large language models through parameter-efficient fine-tuning", - "authors": ["J. Lu", "L. Yu", "X. Li", "L. Yang", "C. Zuo"], + "authors": [ + "J. Lu", + "L. Yu", + "X. Li", + "L. Yang", + "C. Zuo" + ], "year": 2023, "relevance": "First attempt at parameter-efficient fine-tuning of LLMs for code review tasks." }, { "title": "Exploring the impact of code review factors on the code review comment generation", - "authors": ["J. Lu", "Z. Li", "C. Shen", "L. Yang", "C. Zuo"], + "authors": [ + "J. Lu", + "Z. Li", + "C. Shen", + "L. Yang", + "C. Zuo" + ], "year": 2024, "relevance": "Investigates factors influencing code review for both pre-trained LMs and LLMs." }, { "title": "EvaCRC: Evaluating code review comments", - "authors": ["L. Yang", "J. Xu", "Y. Zhang", "H. Zhang", "A. Bacchelli"], + "authors": [ + "L. Yang", + "J. Xu", + "Y. Zhang", + "H. Zhang", + "A. Bacchelli" + ], "year": 2023, "relevance": "BERT-based evaluation of code review comments across four dimensions, key related work on evaluation methodology." }, { "title": "Expectations, outcomes, and challenges of modern code review", - "authors": ["A. Bacchelli", "C. Bird"], + "authors": [ + "A. Bacchelli", + "C. Bird" + ], "year": 2013, "doi": "10.1109/ICSE.2013.6606617", "relevance": "Seminal study on code review expectations and challenges, provides the comment category taxonomy adopted by this paper." }, { "title": "Code review quality: How developers see it", - "authors": ["O. Kononenko", "O. Baysal", "M.W. Godfrey"], + "authors": [ + "O. Kononenko", + "O. Baysal", + "M.W. 
Godfrey" + ], "year": 2016, "doi": "10.1145/2884781.2884840", "relevance": "Defines developer perspectives on code review quality, directly informs the 9 evaluation criteria used in this paper." }, { "title": "Towards automating code review activities", - "authors": ["R. Tufano", "L. Pascarella", "M. Tufano", "D. Poshyvanyk", "G. Bavota"], + "authors": [ + "R. Tufano", + "L. Pascarella", + "M. Tufano", + "D. Poshyvanyk", + "G. Bavota" + ], "year": 2021, "relevance": "Pioneering work on T5-based code review automation, predecessor to the Tufano et al. 2022 baseline." } ] -} +} +\ No newline at end of file diff --git a/papers/deepseek-coder-2024/scan.json b/papers/deepseek-coder-2024/scan.json @@ -1,14 +1,33 @@ { "paper": { "title": "DeepSeek-Coder: When the Large Language Model Meets Programming — The Rise of Code Intelligence", - "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang", "Zhenda Xie", "Kai Dong", "Wentao Zhang", "Guanting Chen", "Xiao Bi", "Y. Wu", "Y.K. Li", "Fuli Luo", "Yingfei Xiong", "Wenfeng Liang"], + "authors": [ + "Daya Guo", + "Qihao Zhu", + "Dejian Yang", + "Zhenda Xie", + "Kai Dong", + "Wentao Zhang", + "Guanting Chen", + "Xiao Bi", + "Y. Wu", + "Y.K. Li", + "Fuli Luo", + "Yingfei Xiong", + "Wenfeng Liang" + ], "year": 2024, "venue": "arXiv", "arxiv_id": "2401.14196" }, "scan_version": 2, - "active_modules": ["experimental_rigor", "data_leakage"], - "methodology_tags": ["benchmark-eval"], + "active_modules": [ + "experimental_rigor", + "data_leakage" + ], + "methodology_tags": [ + "benchmark-eval" + ], "key_findings": "DeepSeek-Coder is a series of open-source code models (1.3B-33B) trained from scratch on 2T tokens with 87 programming languages and repository-level data organization. DeepSeek-Coder-Base 33B achieves state-of-the-art among open-source code models on HumanEval (56.1%), MBPP (66.0%), DS-1000, and other benchmarks. The 6.7B model matches CodeLlama-34B performance. 
FIM ablation shows 50% PSM rate optimally balances code completion and fill-in-the-middle capability.", "checklist": { "artifacts": { @@ -410,80 +429,128 @@ "cited_papers": [ { "title": "Evaluating large language models trained on code", - "authors": ["M. Chen", "J. Tworek", "H. Jun"], + "authors": [ + "M. Chen", + "J. Tworek", + "H. Jun" + ], "year": 2021, "arxiv_id": "2107.03374", "relevance": "Introduces HumanEval benchmark and Codex, foundational for code LLM evaluation." }, { "title": "StarCoder: may the source be with you!", - "authors": ["R. Li", "L. B. Allal", "Y. Zi"], + "authors": [ + "R. Li", + "L. B. Allal", + "Y. Zi" + ], "year": 2023, "arxiv_id": "2305.06161", "relevance": "Major open-source code model baseline and data filtering methodology used in DeepSeek-Coder." }, { "title": "Code Llama: Open Foundation Models for Code", - "authors": ["B. Roziere", "J. Gehring", "F. Gloeckle"], + "authors": [ + "B. Roziere", + "J. Gehring", + "F. Gloeckle" + ], "year": 2023, "arxiv_id": "2308.12950", "relevance": "Primary open-source baseline for code generation evaluation across all benchmarks." }, { "title": "MultiPL-E: a scalable and polyglot approach to benchmarking neural code generation", - "authors": ["F. Cassano", "J. Gouwar", "D. Nguyen"], + "authors": [ + "F. Cassano", + "J. Gouwar", + "D. Nguyen" + ], "year": 2023, "relevance": "Provides multilingual extension of HumanEval used for evaluating code generation across 8 languages." }, { "title": "CrossCodeEval: A Diverse and Multilingual Benchmark for Cross-File Code Completion", - "authors": ["Y. Ding", "Z. Wang", "W. U. Ahmad"], + "authors": [ + "Y. Ding", + "Z. Wang", + "W. U. Ahmad" + ], "year": 2023, "relevance": "Benchmark for cross-file code completion, tests repository-level understanding." }, { "title": "DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation", - "authors": ["Y. Lai", "C. Li", "Y. Wang"], + "authors": [ + "Y. Lai", + "C. Li", + "Y. 
Wang" + ], "year": 2023, "relevance": "Practical data science code generation benchmark with 1000 problems across 7 libraries." }, { "title": "SantaCoder: don't reach for the stars!", - "authors": ["L. B. Allal", "R. Li", "D. Kocetkov"], + "authors": [ + "L. B. Allal", + "R. Li", + "D. Kocetkov" + ], "year": 2023, "arxiv_id": "2301.03988", "relevance": "Open-source code model baseline and FIM code completion benchmark methodology." }, { "title": "Efficient training of language models to fill in the middle", - "authors": ["M. Bavarian", "H. Jun", "N. Tezak"], + "authors": [ + "M. Bavarian", + "H. Jun", + "N. Tezak" + ], "year": 2022, "arxiv_id": "2207.14255", "relevance": "Introduces FIM pre-training methodology adopted by DeepSeek-Coder." }, { "title": "Program synthesis with large language models", - "authors": ["J. Austin", "A. Odena", "M. Nye"], + "authors": [ + "J. Austin", + "A. Odena", + "M. Nye" + ], "year": 2021, "relevance": "Introduces MBPP benchmark widely used for code LLM evaluation." }, { "title": "PAL: Program-aided language models", - "authors": ["L. Gao", "A. Madaan", "S. Zhou"], + "authors": [ + "L. Gao", + "A. Madaan", + "S. Zhou" + ], "year": 2023, "relevance": "Program-aided math reasoning methodology used to evaluate DeepSeek-Coder on mathematical tasks." }, { "title": "The Stack: 3 TB of permissively licensed source code", - "authors": ["D. Kocetkov", "R. Li", "L. Jia"], + "authors": [ + "D. Kocetkov", + "R. Li", + "L. Jia" + ], "year": 2022, "relevance": "Large-scale code dataset with deduplication methodology influencing DeepSeek-Coder's data pipeline." }, { "title": "Deduplicating training data makes language models better", - "authors": ["K. Lee", "D. Ippolito", "A. Nystrom"], + "authors": [ + "K. Lee", + "D. Ippolito", + "A. Nystrom" + ], "year": 2022, "relevance": "Demonstrates importance of training data deduplication for LLM performance." 
} ] -} +} +\ No newline at end of file diff --git a/papers/deepseek-coder-v2-2024/scan.json b/papers/deepseek-coder-v2-2024/scan.json @@ -1,14 +1,59 @@ { "paper": { "title": "DeepSeek-Coder-V2: Breaking the Barrier of Closed-Source Models in Code Intelligence", - "authors": ["Qihao Zhu", "Daya Guo", "Zhihong Shao", "Dejian Yang", "Peiyi Wang", "Runxin Xu", "Y. Wu", "Yukun Li", "Huazuo Gao", "Shirong Ma", "Wangding Zeng", "Xiao Bi", "Zihui Gu", "Hanwei Xu", "Damai Dai", "Kai Dong", "Liyue Zhang", "Yishi Piao", "Zhibin Gou", "Zhenda Xie", "Zhewen Hao", "Bingxuan Wang", "Junxiao Song", "Deli Chen", "Xin Xie", "Kang Guan", "Yuxiang You", "Aixin Liu", "Qiushi Du", "Wenjun Gao", "Xuan Lu", "Qinyu Chen", "Yaohui Wang", "Chengqi Deng", "Jiashi Li", "Chenggang Zhao", "Chong Ruan", "Fuli Luo", "Wenfeng Liang"], + "authors": [ + "Qihao Zhu", + "Daya Guo", + "Zhihong Shao", + "Dejian Yang", + "Peiyi Wang", + "Runxin Xu", + "Y. Wu", + "Yukun Li", + "Huazuo Gao", + "Shirong Ma", + "Wangding Zeng", + "Xiao Bi", + "Zihui Gu", + "Hanwei Xu", + "Damai Dai", + "Kai Dong", + "Liyue Zhang", + "Yishi Piao", + "Zhibin Gou", + "Zhenda Xie", + "Zhewen Hao", + "Bingxuan Wang", + "Junxiao Song", + "Deli Chen", + "Xin Xie", + "Kang Guan", + "Yuxiang You", + "Aixin Liu", + "Qiushi Du", + "Wenjun Gao", + "Xuan Lu", + "Qinyu Chen", + "Yaohui Wang", + "Chengqi Deng", + "Jiashi Li", + "Chenggang Zhao", + "Chong Ruan", + "Fuli Luo", + "Wenfeng Liang" + ], "year": 2024, "venue": "arXiv", "arxiv_id": "2406.11931" }, "scan_version": 2, - "active_modules": ["experimental_rigor", "data_leakage"], - "methodology_tags": ["benchmark-eval"], + "active_modules": [ + "experimental_rigor", + "data_leakage" + ], + "methodology_tags": [ + "benchmark-eval" + ], "key_findings": "DeepSeek-Coder-V2, a 236B MoE model (21B active params), achieves performance comparable to GPT-4 Turbo on code and math benchmarks including 90.2% on HumanEval, 76.2% on MBPP+, 75.7% on MATH, and 43.4% on LiveCodeBench. 
The 16B variant (2.4B active) outperforms larger dense models like DeepSeek-Coder-33B. The model supports 338 programming languages and 128K context length. A 1B ablation study validates the new code corpus improves over the original DeepSeek-Coder corpus.", "checklist": { "artifacts": { @@ -415,70 +460,111 @@ "cited_papers": [ { "title": "Evaluating large language models trained on code", - "authors": ["M. Chen", "J. Tworek", "H. Jun"], + "authors": [ + "M. Chen", + "J. Tworek", + "H. Jun" + ], "year": 2021, "arxiv_id": "2107.03374", "relevance": "Introduces HumanEval benchmark, foundational for code generation evaluation." }, { "title": "SWE-bench: Can language models resolve real-world GitHub issues?", - "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig"], + "authors": [ + "C. E. Jimenez", + "J. Yang", + "A. Wettig" + ], "year": 2023, "arxiv_id": "2310.06770", "relevance": "Real-world software engineering benchmark used to evaluate model capabilities." }, { "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code", - "authors": ["N. Jain", "K. Han", "A. Gu"], + "authors": [ + "N. Jain", + "K. Han", + "A. Gu" + ], "year": 2024, "relevance": "Contamination-free code benchmark using temporally-split competitive programming problems." }, { "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation", - "authors": ["J. Liu", "C. S. Xia", "Y. Wang", "L. Zhang"], + "authors": [ + "J. Liu", + "C. S. Xia", + "Y. Wang", + "L. Zhang" + ], "year": 2023, "relevance": "EvalPlus/MBPP+ evaluation framework used in this paper for more rigorous code evaluation." }, { "title": "DeepSeek-Coder: When the large language model meets programming – the rise of code intelligence", - "authors": ["D. Guo", "Q. Zhu", "D. Yang"], + "authors": [ + "D. Guo", + "Q. Zhu", + "D. 
Yang" + ], "year": 2024, "arxiv_id": "2401.14196", "relevance": "Predecessor model; provides the base architecture and training methodology built upon in this work." }, { "title": "Code Llama: Open foundation models for code", - "authors": ["B. Roziere", "J. Gehring", "F. Gloeckle"], + "authors": [ + "B. Roziere", + "J. Gehring", + "F. Gloeckle" + ], "year": 2023, "arxiv_id": "2308.12950", "relevance": "Major open-source code model baseline for comparison." }, { "title": "StarCoder 2 and The Stack V2: The next generation", - "authors": ["A. Lozhkov", "R. Li", "L. B. Allal"], + "authors": [ + "A. Lozhkov", + "R. Li", + "L. B. Allal" + ], "year": 2024, "arxiv_id": "2402.19173", "relevance": "Open-source code model and training data corpus baseline." }, { "title": "RepoBench: Benchmarking repository-level code auto-completion systems", - "authors": ["T. Liu", "C. Xu", "J. McAuley"], + "authors": [ + "T. Liu", + "C. Xu", + "J. McAuley" + ], "year": 2023, "relevance": "Repository-level code completion benchmark used to evaluate long-context code understanding." }, { "title": "DeepSeekMath: Pushing the limits of mathematical reasoning in open language models", - "authors": ["Z. Shao", "P. Wang", "Q. Zhu"], + "authors": [ + "Z. Shao", + "P. Wang", + "Q. Zhu" + ], "year": 2024, "arxiv_id": "2402.03300", "relevance": "Provides the math training data pipeline and GRPO RL algorithm reused in this work." }, { "title": "Program synthesis with large language models", - "authors": ["J. Austin", "A. Odena", "M. Nye"], + "authors": [ + "J. Austin", + "A. Odena", + "M. Nye" + ], "year": 2021, "arxiv_id": "2108.07732", "relevance": "Introduces MBPP benchmark used for code generation evaluation." 
} ] -} +} +\ No newline at end of file diff --git a/papers/deepseek-r1-2025/scan.json b/papers/deepseek-r1-2025/scan.json @@ -1,14 +1,21 @@ { "paper": { "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", - "authors": ["DeepSeek-AI"], + "authors": [ + "DeepSeek-AI" + ], "year": 2025, "venue": "arXiv", "arxiv_id": "2501.12948" }, "scan_version": 2, - "active_modules": ["experimental_rigor", "data_leakage"], - "methodology_tags": ["benchmark-eval"], + "active_modules": [ + "experimental_rigor", + "data_leakage" + ], + "methodology_tags": [ + "benchmark-eval" + ], "key_findings": "DeepSeek-R1-Zero demonstrates that pure RL without supervised fine-tuning can incentivize sophisticated reasoning behaviors (self-reflection, verification) in LLMs, achieving 79.8% on AIME 2024 and 97.3% on MATH-500, matching OpenAI-o1. The multi-stage pipeline (cold-start SFT → RL → rejection sampling SFT → RL) produces DeepSeek-R1, which ranks alongside top closed-source models on ChatbotArena. Distilled smaller models (1.5B-70B) outperform GPT-4o and Claude-3.5-Sonnet on math benchmarks, and distillation outperforms RL alone on smaller architectures.", "checklist": { "artifacts": { @@ -415,81 +422,115 @@ "cited_papers": [ { "title": "Evaluating large language models trained on code", - "authors": ["M. Chen", "J. Tworek"], + "authors": [ + "M. Chen", + "J. Tworek" + ], "year": 2021, "arxiv_id": "2107.03374", "relevance": "Introduces HumanEval benchmark and pass@k evaluation methodology used throughout this paper." }, { "title": "Training language models to follow instructions with human feedback", - "authors": ["L. Ouyang", "J. Wu"], + "authors": [ + "L. Ouyang", + "J. Wu" + ], "year": 2022, "relevance": "Foundational RLHF work establishing the SFT→RL pipeline that DeepSeek-R1 modifies by skipping SFT." }, { "title": "Scaling LLM test-time compute optimally can be more effective than scaling model parameters", - "authors": ["C. Snell", "J. 
Lee"], + "authors": [ + "C. Snell", + "J. Lee" + ], "year": 2024, "arxiv_id": "2408.03314", "relevance": "Test-time compute scaling framework directly relevant to DeepSeek-R1's adaptive token generation strategy." }, { "title": "Chain-of-thought prompting elicits reasoning in large language models", - "authors": ["J. Wei", "X. Wang"], + "authors": [ + "J. Wei", + "X. Wang" + ], "year": 2022, "relevance": "Chain-of-thought prompting foundation that DeepSeek-R1 aims to surpass via RL-emergent reasoning." }, { "title": "DeepSeek-V3 technical report", - "authors": ["DeepSeek-AI"], + "authors": [ + "DeepSeek-AI" + ], "year": 2024, "arxiv_id": "2412.19437", "relevance": "Base model for DeepSeek-R1; provides the 671B MoE architecture and pre-training details." }, { "title": "Agentless: Demystifying LLM-based software engineering agents", - "authors": ["C. S. Xia", "Y. Deng"], + "authors": [ + "C. S. Xia", + "Y. Deng" + ], "year": 2024, "relevance": "Framework used for SWE-Bench Verified evaluation of DeepSeek-R1." }, { "title": "Let's verify step by step", - "authors": ["H. Lightman", "V. Kosaraju"], + "authors": [ + "H. Lightman", + "V. Kosaraju" + ], "year": 2024, "relevance": "Process reward model approach that DeepSeek-R1 considered but rejected in favor of outcome-based RL." }, { "title": "Self-consistency improves chain of thought reasoning in language models", - "authors": ["X. Wang", "J. Wei"], + "authors": [ + "X. Wang", + "J. Wei" + ], "year": 2023, "relevance": "Self-consistency decoding used to boost DeepSeek-R1-Zero's AIME score from 77.9% to 86.7%." }, { "title": "DeepSeekMath: Pushing the limits of mathematical reasoning in open language models", - "authors": ["Z. Shao", "P. Wang"], + "authors": [ + "Z. Shao", + "P. Wang" + ], "year": 2024, "arxiv_id": "2402.03300", "relevance": "Introduces GRPO algorithm adopted for DeepSeek-R1 training." }, { "title": "Distillation scaling laws", - "authors": ["D. Busbridge"], + "authors": [ + "D. 
Busbridge" + ], "year": 2025, "arxiv_id": "2502.08606", "relevance": "Scaling laws for knowledge distillation, supporting DeepSeek-R1's distillation approach." }, { "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code", - "authors": ["N. Jain", "K. Han"], + "authors": [ + "N. Jain", + "K. Han" + ], "year": 2024, "arxiv_id": "2403.07974", "relevance": "Key code generation benchmark used for evaluation with temporal contamination prevention." }, { "title": "Constitutional AI: Harmlessness from AI feedback", - "authors": ["Y. Bai", "A. Jones"], + "authors": [ + "Y. Bai", + "A. Jones" + ], "year": 2022, "relevance": "RLHF safety alignment methodology that DeepSeek-R1's safety training builds upon." } ] -} +} +\ No newline at end of file diff --git a/papers/defending-against-prompt-2025-2/scan.json b/papers/defending-against-prompt-2025-2/scan.json @@ -14,7 +14,10 @@ "doi": "10.48550/arXiv.2510.19207" }, "scan_version": 2, - "active_modules": ["experimental_rigor", "data_leakage"], + "active_modules": [ + "experimental_rigor", + "data_leakage" + ], "checklist": { "artifacts": { "code_released": { @@ -395,7 +398,9 @@ "supported": "moderate" } ], - "methodology_tags": ["benchmark-eval"], + "methodology_tags": [ + "benchmark-eval" + ], "key_findings": "DataFilter, a model-agnostic defense that fine-tunes Llama-3.1-8B-Instruct to strip prompt injections from untrusted data before it reaches the backend LLM, reduces average attack success rates from over 40% to ~2.2% across SEP, InjecAgent, and AgentDojo benchmarks while maintaining utility within 1-2 percentage points. It outperforms all tested model-agnostic baselines including PromptArmor (GPT-4.1-based) on both security and utility. 
However, strong adaptive LLM-based attacks still achieve 83% ASR against DataFilter, and all results lack uncertainty quantification (no error bars, no multi-run variance).", "red_flags": [ { @@ -422,81 +427,149 @@ "cited_papers": [ { "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection", - "authors": ["K. Greshake", "S. Abdelnabi", "S. Mishra", "C. Endres", "T. Holz", "M. Fritz"], + "authors": [ + "K. Greshake", + "S. Abdelnabi", + "S. Mishra", + "C. Endres", + "T. Holz", + "M. Fritz" + ], "year": 2023, "relevance": "Foundational work on indirect prompt injection attacks against LLM-integrated applications." }, { "title": "The attacker moves second: Stronger adaptive attacks bypass defenses against llm jailbreaks and prompt injections", - "authors": ["M. Nasr", "N. Carlini", "C. Sitawarin"], + "authors": [ + "M. Nasr", + "N. Carlini", + "C. Sitawarin" + ], "year": 2025, "arxiv_id": "2510.09023", "relevance": "Demonstrates that strong adaptive attacks can break all existing prompt injection defenses including DataFilter (83% ASR)." }, { "title": "Meta SecAlign: A Secure Foundation LLM Against Prompt Injection Attacks", - "authors": ["S. Chen", "A. Zharmagambetov", "D. Wagner", "C. Guo"], + "authors": [ + "S. Chen", + "A. Zharmagambetov", + "D. Wagner", + "C. Guo" + ], "year": 2025, "arxiv_id": "2507.02735", "relevance": "Fine-tuning defense for prompt injection that inspired DataFilter's training approach and demonstrated cross-domain generalization." }, { "title": "StruQ: Defending against prompt injection with structured queries", - "authors": ["S. Chen", "J. Piet", "C. Sitawarin", "D. Wagner"], + "authors": [ + "S. Chen", + "J. Piet", + "C. Sitawarin", + "D. Wagner" + ], "year": 2025, "relevance": "System-level defense that structures LLM queries to separate prompts from data, requiring model weight access." 
}, { "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents", - "authors": ["Q. Zhan", "Z. Liang", "Z. Ying", "D. Kang"], + "authors": [ + "Q. Zhan", + "Z. Liang", + "Z. Ying", + "D. Kang" + ], "year": 2024, "relevance": "Benchmark for evaluating prompt injection attacks in agentic tool-calling scenarios, used as primary evaluation benchmark." }, { "title": "Agentdojo: A dynamic environment to evaluate attacks and defenses for llm agents", - "authors": ["E. Debenedetti", "J. Zhang", "M. Balunović", "L. Beurer-Kellner", "M. Fischer", "F. Tramèr"], + "authors": [ + "E. Debenedetti", + "J. Zhang", + "M. Balunović", + "L. Beurer-Kellner", + "M. Fischer", + "F. Tramèr" + ], "year": 2024, "relevance": "Multi-tool agent benchmark for prompt injection with both security and utility evaluation, used as primary evaluation benchmark." }, { "title": "Can llms separate instructions from data? and what do we even mean by that?", - "authors": ["E. Zverev", "S. Abdelnabi", "M. Fritz", "C. H. Lampert"], + "authors": [ + "E. Zverev", + "S. Abdelnabi", + "M. Fritz", + "C. H. Lampert" + ], "year": 2025, "relevance": "SEP benchmark providing controlled measurement of instruction-data separation in LLMs, used as evaluation benchmark." }, { "title": "PromptArmor: Simple yet effective prompt injection defenses", - "authors": ["T. Shi", "K. Zhu", "Z. Wang"], + "authors": [ + "T. Shi", + "K. Zhu", + "Z. Wang" + ], "year": 2025, "arxiv_id": "2507.15219", "relevance": "Concurrent defense using LLM-based detection and fuzzy string matching for injection removal; primary competitor to DataFilter." }, { "title": "DataSentinel: A game-theoretic detection of prompt injection attacks", - "authors": ["Y. Liu", "Y. Jia", "J. Jia", "D. Song", "N. Z. Gong"], + "authors": [ + "Y. Liu", + "Y. Jia", + "J. Jia", + "D. Song", + "N. Z. 
Gong" + ], "year": 2025, "relevance": "Game-theoretic prompt injection detector using deliberately vulnerable LLM design, evaluated as baseline defense." }, { "title": "Defeating prompt injections by design", - "authors": ["E. Debenedetti", "I. Shumailov", "T. Fan", "J. Hayes", "N. Carlini"], + "authors": [ + "E. Debenedetti", + "I. Shumailov", + "T. Fan", + "J. Hayes", + "N. Carlini" + ], "year": 2025, "arxiv_id": "2503.18813", "relevance": "System-level defense providing security-by-design against prompt injection through pipeline redesign." }, { "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions", - "authors": ["E. Wallace", "K. Xiao", "R. Leike", "L. Weng", "J. Heidecke", "A. Beutel"], + "authors": [ + "E. Wallace", + "K. Xiao", + "R. Leike", + "L. Weng", + "J. Heidecke", + "A. Beutel" + ], "year": 2024, "arxiv_id": "2404.13208", "relevance": "Model-level defense training LLMs to prioritize system instructions over injected instructions in data." }, { "title": "Defending against indirect prompt injection attacks with spotlighting", - "authors": ["K. Hines", "G. Lopez", "M. Hall", "F. Zarfati", "Y. Zunger", "E. Kiciman"], + "authors": [ + "K. Hines", + "G. Lopez", + "M. Hall", + "F. Zarfati", + "Y. Zunger", + "E. Kiciman" + ], "year": 2024, "arxiv_id": "2403.14720", "relevance": "Prompt-based defense using delimiting to mark untrusted data, evaluated as baseline." 
} ] -} +} +\ No newline at end of file diff --git a/papers/defending-against-prompt-2025/scan.json b/papers/defending-against-prompt-2025/scan.json @@ -14,8 +14,13 @@ "doi": "10.1145/3733799.3762982" }, "scan_version": 2, - "active_modules": ["experimental_rigor", "data_leakage"], - "methodology_tags": ["benchmark-eval"], + "active_modules": [ + "experimental_rigor", + "data_leakage" + ], + "methodology_tags": [ + "benchmark-eval" + ], "key_findings": "DefensiveToken introduces optimized special token embeddings prepended to LLM input as a test-time prompt injection defense. On the largest benchmark (TaskTracker, 31K samples), it reduces attack success rate to 0.24% averaged across four 7B/8B models, comparable to training-time defenses (0.20–0.51%) and far below other test-time defenses (>11%). The defense adds only 5 tokens (~20K parameters) with minimal utility loss, and can be toggled on/off at deployment time. Against adaptive optimization-based GCG attacks, ASR drops from 95.2% to 48.8%, outperforming all other test-time defenses.", "checklist": { "artifacts": { @@ -423,86 +428,153 @@ "cited_papers": [ { "title": "StruQ: Defending against prompt injection with structured queries", - "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"], + "authors": [ + "Sizhe Chen", + "Julien Piet", + "Chawin Sitawarin", + "David Wagner" + ], "year": 2025, "arxiv_id": "2402.06363", "relevance": "Training-time prompt injection defense that DefensiveToken builds on; provides the loss function and dataset construction used by DefensiveToken." 
}, { "title": "SecAlign: Defending Against Prompt Injection with Preference Optimization", - "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "David Wagner", "Chuan Guo"], + "authors": [ + "Sizhe Chen", + "Arman Zharmagambetov", + "Saeed Mahloujifar", + "Kamalika Chaudhuri", + "David Wagner", + "Chuan Guo" + ], "year": 2025, "arxiv_id": "2410.05451", "relevance": "Training-time prompt injection defense using preference optimization; key baseline and comparison point." }, { "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions", - "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike", "Lilian Weng", "Johannes Heidecke", "Alex Beutel"], + "authors": [ + "Eric Wallace", + "Kai Xiao", + "Reimar Leike", + "Lilian Weng", + "Johannes Heidecke", + "Alex Beutel" + ], "year": 2024, "arxiv_id": "2404.13208", "relevance": "Defines multi-layer instruction priority for prompt injection defense, implemented in GPT-4o and Gemini; training-time defense approach." }, { "title": "Defeating prompt injections by design", - "authors": ["Edoardo Debenedetti", "Ilia Shumailov", "Tianqi Fan", "Jamie Hayes", "Nicholas Carlini"], + "authors": [ + "Edoardo Debenedetti", + "Ilia Shumailov", + "Tianqi Fan", + "Jamie Hayes", + "Nicholas Carlini" + ], "year": 2025, "arxiv_id": "2503.18813", "relevance": "System-level prompt injection defense using system security principles; represents the system-design approach to defense." }, { "title": "Get my drift? 
Catching LLM Task Drift with Activation Deltas", - "authors": ["Sahar Abdelnabi", "Aideen Fay", "Giovanni Cherubin", "Ahmed Salem", "Mario Fritz", "Andrew Paverd"], + "authors": [ + "Sahar Abdelnabi", + "Aideen Fay", + "Giovanni Cherubin", + "Ahmed Salem", + "Mario Fritz", + "Andrew Paverd" + ], "year": 2025, "arxiv_id": "2406.00799", "relevance": "TaskTracker benchmark (31K samples) used as the largest evaluation benchmark; activation-based prompt injection detection." }, { "title": "Instructional Segment Embedding: Improving LLM Safety with Instruction Hierarchy", - "authors": ["Tong Wu", "Shujian Zhang", "Kaiqiang Song"], + "authors": [ + "Tong Wu", + "Shujian Zhang", + "Kaiqiang Song" + ], "year": 2025, "arxiv_id": "2410.09102", "relevance": "Training-time defense using instructional segment embeddings for prompt injection; related embedding-based defense approach." }, { "title": "Can LLMs Separate Instructions From Data? And What Do We Even Mean By That?", - "authors": ["Egor Zverev", "Sahar Abdelnabi", "Mario Fritz", "Christoph H Lampert"], + "authors": [ + "Egor Zverev", + "Sahar Abdelnabi", + "Mario Fritz", + "Christoph H Lampert" + ], "year": 2025, "relevance": "SEP benchmark (9.1K samples) for prompt injection evaluation; provides the utility-security evaluation framework used in this paper." }, { "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents", - "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"], + "authors": [ + "Qiusi Zhan", + "Zhixiang Liang", + "Zifan Ying", + "Daniel Kang" + ], "year": 2024, "doi": "10.18653/v1/2024.findings-acl.624", "relevance": "Agentic tool-calling prompt injection benchmark (1K samples); tests defense generalization to API-calling scenarios." }, { "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models", - "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J. 
Zico Kolter", "Matt Fredrikson"], + "authors": [ + "Andy Zou", + "Zifan Wang", + "Nicholas Carlini", + "Milad Nasr", + "J. Zico Kolter", + "Matt Fredrikson" + ], "year": 2023, "arxiv_id": "2307.15043", "relevance": "GCG attack used as the optimization-based adversarial attack for evaluating defense robustness." }, { "title": "Cyberseceval 2: A wide-ranging cybersecurity evaluation suite for large language models", - "authors": ["Manish Bhatt", "Sahana Chennabasappa"], + "authors": [ + "Manish Bhatt", + "Sahana Chennabasappa" + ], "year": 2024, "arxiv_id": "2404.13161", "relevance": "CyberSecEval2 prompt injection benchmark (55 test cases) used in evaluation." }, { "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection", - "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"], + "authors": [ + "Kai Greshake", + "Sahar Abdelnabi", + "Shailesh Mishra", + "Christoph Endres", + "Thorsten Holz", + "Mario Fritz" + ], "year": 2023, "doi": "10.1145/3605764.3623985", "relevance": "Foundational work on indirect prompt injection attacks in real-world LLM applications." }, { "title": "Lessons from Defending Gemini Against Indirect Prompt Injections", - "authors": ["Chongyang Shi", "Sharon Lin", "Shuang Song"], + "authors": [ + "Chongyang Shi", + "Sharon Lin", + "Shuang Song" + ], "year": 2025, "arxiv_id": "2505.14534", "relevance": "Industry-scale perspective on defending against prompt injection in production (Gemini); instruction hierarchy implementation." 
} ] -} +} +\ No newline at end of file diff --git a/papers/defense-against-indirect-2026/scan.json b/papers/defense-against-indirect-2026/scan.json @@ -1,15 +1,24 @@ { "paper": { "title": "Defense Against Indirect Prompt Injection via Tool Result Parsing", - "authors": ["Qiang Yu", "Xinran Cheng", "Chuanyi Liu"], + "authors": [ + "Qiang Yu", + "Xinran Cheng", + "Chuanyi Liu" + ], "year": 2026, "venue": "arXiv.org", "arxiv_id": "2601.04795", "doi": "10.48550/arXiv.2601.04795" }, "scan_version": 2, - "active_modules": ["experimental_rigor", "data_leakage"], - "methodology_tags": ["benchmark-eval"], + "active_modules": [ + "experimental_rigor", + "data_leakage" + ], + "methodology_tags": [ + "benchmark-eval" + ], "key_findings": "The paper proposes ParseData and CheckTool, two prompt-based defense modules against indirect prompt injection in LLM agents. ParseData extracts only needed data from tool results using format/logic constraints, while CheckTool detects and removes content that triggers unauthorized tool calls. On the AgentDojo benchmark with three LLMs (gpt-oss-120b, llama-3.1-70b, qwen3-32b), the combined methods achieve average ASR of 0–1.33% and average Risk of 0–1.33%, roughly 1/10 that of the next-best defense (Tool Filter), at the cost of 28–45% reduction in benign utility.", "checklist": { "artifacts": { @@ -420,76 +429,146 @@ "cited_papers": [ { "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents", - "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunovic", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"], + "authors": [ + "Edoardo Debenedetti", + "Jie Zhang", + "Mislav Balunovic", + "Luca Beurer-Kellner", + "Marc Fischer", + "Florian Tramèr" + ], "year": 2024, "relevance": "Primary benchmark used for evaluation; provides standardized framework for testing indirect prompt injection attacks and defenses on LLM agents." 
}, { "title": "StruQ: Defending Against Prompt Injection with Structured Queries", - "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"], + "authors": [ + "Sizhe Chen", + "Julien Piet", + "Chawin Sitawarin", + "David Wagner" + ], "year": 2025, "relevance": "Training-based defense approach that fine-tunes LLMs to separate instructions from data using structured queries." }, { "title": "MELON: Provable Defense Against Indirect Prompt Injection Attacks in AI Agents", - "authors": ["Kaijie Zhu", "Xianjun Yang", "Jindong Wang", "Wenbo Guo", "William Yang Wang"], + "authors": [ + "Kaijie Zhu", + "Xianjun Yang", + "Jindong Wang", + "Wenbo Guo", + "William Yang Wang" + ], "year": 2025, "relevance": "Detection-based defense that identifies prompt injections when suspicious tool calls are about to be executed." }, { "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection", - "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"], + "authors": [ + "Kai Greshake", + "Sahar Abdelnabi", + "Shailesh Mishra", + "Christoph Endres", + "Thorsten Holz", + "Mario Fritz" + ], "year": 2023, "relevance": "Foundational work demonstrating indirect prompt injection attacks on LLM-integrated applications including privacy leakage and malware distribution." }, { "title": "Defending Against Indirect Prompt Injection Attacks With Spotlighting", - "authors": ["Keegan Hines", "Gary Lopez", "Matthew Hall", "Federico Zarfati", "Yonatan Zunger", "Emre Kiciman"], + "authors": [ + "Keegan Hines", + "Gary Lopez", + "Matthew Hall", + "Federico Zarfati", + "Yonatan Zunger", + "Emre Kiciman" + ], "year": 2024, "relevance": "Baseline defense method using delimiters to separate data from instructions, evaluated as a comparison in this work." 
}, { "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions", - "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike", "Lilian Weng", "Johannes Heidecke", "Alex Beutel"], + "authors": [ + "Eric Wallace", + "Kai Xiao", + "Reimar Leike", + "Lilian Weng", + "Johannes Heidecke", + "Alex Beutel" + ], "year": 2024, "arxiv_id": "2404.13208", "relevance": "Proposes training LLMs to prioritize user instructions over injected content, relevant to the instruction-data separation problem." }, { "title": "The Task Shield: Enforcing Task Alignment to Defend Against Indirect Prompt Injection in LLM Agents", - "authors": ["Feiran Jia", "Tong Wu", "Xin Qin", "Anna Cinzia Squicciarini"], + "authors": [ + "Feiran Jia", + "Tong Wu", + "Xin Qin", + "Anna Cinzia Squicciarini" + ], "year": 2025, "relevance": "Defense approach that evaluates correlation between user instructions and assistant messages to detect workflow compromises." }, { "title": "Can Indirect Prompt Injection Attacks Be Detected and Removed?", - "authors": ["Yulin Chen", "Haoran Li", "Yuan Sui", "Yufei He", "Yue Liu", "Yangqiu Song", "Bryan Hooi"], + "authors": [ + "Yulin Chen", + "Haoran Li", + "Yuan Sui", + "Yufei He", + "Yue Liu", + "Yangqiu Song", + "Bryan Hooi" + ], "year": 2025, "relevance": "Trains auxiliary lightweight models to detect and remove indirect prompt injections from external prompts." }, { "title": "Adaptive Attacks Break Defenses Against Indirect Prompt Injection Attacks on LLM Agents", - "authors": ["Qiusi Zhan", "Richard Fang", "Henil Shalin Panchal", "Daniel Kang"], + "authors": [ + "Qiusi Zhan", + "Richard Fang", + "Henil Shalin Panchal", + "Daniel Kang" + ], "year": 2025, "relevance": "Demonstrates that adaptive attacks can circumvent existing IPI defenses, relevant to evaluating defense robustness." 
}, { "title": "IsolateGPT: An Execution Isolation Architecture for LLM-Based Agentic Systems", - "authors": ["Yuhao Wu", "Franziska Roesner", "Tadayoshi Kohno", "Ning Zhang", "Umar Iqbal"], + "authors": [ + "Yuhao Wu", + "Franziska Roesner", + "Tadayoshi Kohno", + "Ning Zhang", + "Umar Iqbal" + ], "year": 2025, "relevance": "Proposes execution isolation as a defense mechanism for LLM agents, representing the privilege-control approach to IPI defense." }, { "title": "Ignore Previous Prompt: Attack Techniques For Language Models", - "authors": ["Fábio Perez", "Ian Ribeiro"], + "authors": [ + "Fábio Perez", + "Ian Ribeiro" + ], "year": 2022, "relevance": "Foundational work on prompt injection attack techniques, including the 'ignore previous instructions' attack pattern used in this paper's evaluation." }, { "title": "A Survey on Trustworthy LLM Agents: Threats and Countermeasures", - "authors": ["Miao Yu", "Fanci Meng", "Xinyun Zhou"], + "authors": [ + "Miao Yu", + "Fanci Meng", + "Xinyun Zhou" + ], "year": 2025, "relevance": "Comprehensive survey of threats and countermeasures for LLM agents, providing broader context for the prompt injection defense landscape." } ] -} +} +\ No newline at end of file diff --git a/papers/defense-against-prompt-2024/scan.json b/papers/defense-against-prompt-2024/scan.json @@ -15,8 +15,13 @@ "doi": "10.48550/arXiv.2411.00459" }, "scan_version": 2, - "active_modules": ["experimental_rigor", "data_leakage"], - "methodology_tags": ["benchmark-eval"], + "active_modules": [ + "experimental_rigor", + "data_leakage" + ], + "methodology_tags": [ + "benchmark-eval" + ], "key_findings": "The paper inverts prompt injection attack techniques to create novel training-free defense methods. The Fake Completion with Template (Fakecom-t) defense reduces ASR to near zero in many scenarios across both direct and indirect attacks. 
The approach outperforms existing training-free defenses (Sandwich, Instructional, Reminder, Isolation, Spotlight) and is comparable to fine-tuning-based methods like StruQ. The authors observe that stronger attack techniques tend to produce stronger defense methods when repurposed.", "checklist": { "artifacts": { @@ -415,77 +420,148 @@ "cited_papers": [ { "title": "Struq: Defending against prompt injection with structured queries", - "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"], + "authors": [ + "Sizhe Chen", + "Julien Piet", + "Chawin Sitawarin", + "David Wagner" + ], "year": 2024, "arxiv_id": "2402.06363", "relevance": "Fine-tuning-based defense against prompt injection attacks, used as a key baseline comparison in this paper." }, { "title": "Ignore previous prompt: Attack techniques for language models", - "authors": ["Fábio Perez", "Ian Ribeiro"], + "authors": [ + "Fábio Perez", + "Ian Ribeiro" + ], "year": 2022, "arxiv_id": "2211.09527", "relevance": "Foundational work on prompt injection attack techniques that forms the basis for the defense methods proposed in this paper." }, { "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection", - "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"], + "authors": [ + "Kai Greshake", + "Sahar Abdelnabi", + "Shailesh Mishra", + "Christoph Endres", + "Thorsten Holz", + "Mario Fritz" + ], "year": 2023, "relevance": "Seminal work on indirect prompt injection attacks against LLM-integrated applications." 
}, { "title": "Defending against indirect prompt injection attacks with spotlighting", - "authors": ["Keegan Hines", "Gary Lopez", "Matthew Hall", "Federico Zarfati", "Yonatan Zunger", "Emre Kiciman"], + "authors": [ + "Keegan Hines", + "Gary Lopez", + "Matthew Hall", + "Federico Zarfati", + "Yonatan Zunger", + "Emre Kiciman" + ], "year": 2024, "arxiv_id": "2403.14720", "relevance": "Training-free defense method using special tokens to mark data content, used as a baseline in this paper." }, { "title": "The instruction hierarchy: Training LLMs to prioritize privileged instructions", - "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike", "Lilian Weng", "Johannes Heidecke", "Alex Beutel"], + "authors": [ + "Eric Wallace", + "Kai Xiao", + "Reimar Leike", + "Lilian Weng", + "Johannes Heidecke", + "Alex Beutel" + ], "year": 2024, "arxiv_id": "2404.13208", "relevance": "Fine-tuning approach granting privileged status to authorized instructions for prompt injection defense." }, { "title": "Formalizing and benchmarking prompt injection attacks and defenses", - "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"], + "authors": [ + "Yupei Liu", + "Yuqi Jia", + "Runpeng Geng", + "Jinyuan Jia", + "Neil Zhenqiang Gong" + ], "year": 2024, "relevance": "Formalizes prompt injection attack/defense framework including combined attack methods used in this paper." }, { "title": "Universal and transferable adversarial attacks on aligned language models", - "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J Zico Kolter", "Matt Fredrikson"], + "authors": [ + "Andy Zou", + "Zifan Wang", + "Nicholas Carlini", + "Milad Nasr", + "J Zico Kolter", + "Matt Fredrikson" + ], "year": 2023, "arxiv_id": "2307.15043", "relevance": "GCG attack method used to evaluate defense transferability against gradient-based attacks." 
}, { "title": "Evaluating the instruction-following robustness of large language models to prompt injection", - "authors": ["Zekun Li", "Baolin Peng", "Pengcheng He", "Xifeng Yan"], + "authors": [ + "Zekun Li", + "Baolin Peng", + "Pengcheng He", + "Xifeng Yan" + ], "year": 2023, "relevance": "Provides the filtered QA evaluation dataset and evaluation protocol used in this paper's indirect attack experiments." }, { "title": "Automatic and universal prompt injection attacks against large language models", - "authors": ["Xiaogeng Liu", "Zhiyuan Yu", "Yizhe Zhang", "Ning Zhang", "Chaowei Xiao"], + "authors": [ + "Xiaogeng Liu", + "Zhiyuan Yu", + "Yizhe Zhang", + "Ning Zhang", + "Chaowei Xiao" + ], "year": 2024, "arxiv_id": "2403.04957", "relevance": "Automatic prompt injection attack methods relevant to LLM security evaluation." }, { "title": "Jatmo: Prompt injection defense by task-specific finetuning", - "authors": ["Julien Piet", "Maha Alrashed", "Chawin Sitawarin", "Sizhe Chen", "Zeming Wei", "Elizabeth Sun", "Basel Alomair", "David Wagner"], + "authors": [ + "Julien Piet", + "Maha Alrashed", + "Chawin Sitawarin", + "Sizhe Chen", + "Zeming Wei", + "Elizabeth Sun", + "Basel Alomair", + "David Wagner" + ], "year": 2023, "arxiv_id": "2312.17673", "relevance": "Fine-tuning defense that trains models for specific tasks to prevent malicious instruction following." }, { "title": "Benchmarking and defending against indirect prompt injection attacks on large language models", - "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu", "Keegan Hines", "Emre Kiciman", "Guangzhong Sun", "Xing Xie", "Fangzhao Wu"], + "authors": [ + "Jingwei Yi", + "Yueqi Xie", + "Bin Zhu", + "Keegan Hines", + "Emre Kiciman", + "Guangzhong Sun", + "Xing Xie", + "Fangzhao Wu" + ], "year": 2023, "arxiv_id": "2312.14197", "relevance": "Benchmarking framework for indirect prompt injection with Reminder defense baseline used in this paper." 
} ] -} +} +\ No newline at end of file diff --git a/scripts/catchup-v3.py b/scripts/catchup-v3.py @@ -108,7 +108,13 @@ def classify_one(paper_id): if scan.get("scan_version") == 3: return paper_id, True, "already v3" - # Build prompt from existing scan data + # Read the full paper text + txt_path = PAPERS_DIR / paper_id / "paper.txt" + if not txt_path.exists(): + return paper_id, False, "no paper.txt" + paper_text = txt_path.read_text(encoding="utf-8").replace("\x00", "") + + # Build prompt with full paper text + existing scan context paper = scan.get("paper", {}) claims_text = "\n".join( f"- [{c.get('supported', '?')}] {c.get('claim', '')}" @@ -128,13 +134,13 @@ def classify_one(paper_id): key_findings=scan.get("key_findings", ""), claims=claims_text or "(none)", red_flags=red_flags_text or "(none)", - ) + ) + f"\n\n## Full Paper Text\n{paper_text}" try: result = subprocess.run( ["claude", "-p", "-", "--model", "opus", "--max-turns", "1"], input=prompt, - capture_output=True, text=True, timeout=120, + capture_output=True, text=True, timeout=300, cwd=str(ROOT), )

Impressum · Datenschutz