commit fbc3c552e124c8c6c91d532e531bbc6f81f4d957 parent 95f484d01c4aded0fbdb7faed0aa7f17b69da21b Author: Brian Graham <brian@buildingbetterteams.de> Date: Mon, 30 Mar 2026 16:40:40 +0200 Add Haiku paper type classification script (preliminary) scripts/classify-paper-type.py classifies papers into 5 types: empirical, benchmark-creation, survey, position, theoretical. Uses Haiku (cheap, fast) reading title + key_findings + tags from existing scan.json. Writes papers/{slug}/paper_type.json as a separate non-destructive file. 20/20 correct on manual verification. Running full corpus in background. This is preliminary — classification feeds into v4 instrument redesign where each type gets its own question panel. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Diffstat:
625 files changed, 3303 insertions(+), 0 deletions(-)
diff --git a/papers/2025-ai-agent-2026/paper_type.json b/papers/2025-ai-agent-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Systematic documentation and meta-analysis of 30 existing deployed agentic AI systems across safety and technical features, synthesizing the landscape rather than reporting original experimental results." +} +\ No newline at end of file diff --git a/papers/3dshape2vecset-3d-shape-2023/paper_type.json b/papers/3dshape2vecset-3d-shape-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a 3D shape representation method and validates it through experiments on ShapeNet-v2, reporting quantitative improvements in shape autoencoding (IoU 0.965) and generation quality (FPD 0.76) metrics." +} +\ No newline at end of file diff --git a/papers/a2hcoder-llmdriven-coding-2025/paper_type.json b/papers/a2hcoder-llmdriven-coding-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes A2H-MAS and reports quantitative experimental results (operating frequencies, resource utilization, ablation studies) on real FPGA implementation tasks, with the primary contribution being the empirical validation of the system's performance." +} +\ No newline at end of file diff --git a/papers/aart-aiassisted-redteaming-2023/paper_type.json b/papers/aart-aiassisted-redteaming-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The paper introduces AART, a structured pipeline and resulting dataset for adversarial evaluation of LLMs across policy concepts, task formats, and geographic regions, with empirical validation showing superior coverage metrics compared to existing approaches." 
+} +\ No newline at end of file diff --git a/papers/acar-adaptive-complexity-2026/paper_type.json b/papers/acar-adaptive-complexity-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces ACAR routing method and validates it through experiments on 1,510 tasks with quantitative accuracy metrics, baselines, and ablation studies demonstrating its effectiveness." +} +\ No newline at end of file diff --git a/papers/agentic-bug-reproduction-2025/paper_type.json b/papers/agentic-bug-reproduction-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Reports quantitative experimental results (28% generation rate, 30% improvement, 70% accuracy) from evaluating an agentic bug reproduction system on 80 real production bugs, with comparisons to baselines and manual validation." +} +\ No newline at end of file diff --git a/papers/agentic-refactoring-empirical-2025/paper_type.json b/papers/agentic-refactoring-empirical-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Large-scale mining study reporting quantitative findings (26.1% refactoring prevalence, 15,451 instances analyzed) with statistical analysis of structural metrics and behavioral patterns in AI-generated code." +} +\ No newline at end of file diff --git a/papers/agents-of-chaos-2026/paper_type.json b/papers/agents-of-chaos-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts original red-teaming experiments on 6 LLM agents and reports empirical findings of vulnerability categories and failure modes discovered through the study." 
+} +\ No newline at end of file diff --git a/papers/ai-ides-vs-agents-impact-2026/paper_type.json b/papers/ai-ides-vs-agents-impact-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Reports quantitative findings from observational methodology measuring the impact of coding agents on software development metrics (+36% commits, +77% lines, +18% warnings, +39% complexity), with heterogeneous effects analysis." +} +\ No newline at end of file diff --git a/papers/chain-of-thought-prompting-2022/paper_type.json b/papers/chain-of-thought-prompting-2022/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper conducts experiments on multiple benchmarks (GSM8K, arithmetic, commonsense, symbolic reasoning), reports quantitative results (e.g., 56.9% on GSM8K), and includes ablation studies, with the primary contribution being empirical findings about chain-of-thought prompting's effectiveness." +} +\ No newline at end of file diff --git a/papers/codex-humaneval-2021/paper_type.json b/papers/codex-humaneval-2021/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments on multiple code generation models (Codex, GPT-3, GPT-J) using the HumanEval benchmark, reports quantitative results (pass@k metrics), and provides empirical analysis of scaling laws and sampling effectiveness." +} +\ No newline at end of file diff --git a/papers/coding-agents-generating-2026/paper_type.json b/papers/coding-agents-generating-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs an observational study with quantitative comparisons of coding agent vs non-agent behavior across test file modifications and mock usage patterns." 
+} +\ No newline at end of file diff --git a/papers/copilot-productivity-controlled-2023/paper_type.json b/papers/copilot-productivity-controlled-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Randomized controlled experiment measuring quantitative outcomes (task completion time and success rate) with 95 developers, comparing treatment vs control groups on a real programming task." +} +\ No newline at end of file diff --git a/papers/copilot-zoominfo-productivity-2025/paper_type.json b/papers/copilot-zoominfo-productivity-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "This case study reports quantitative empirical results from a real-world GitHub Copilot deployment, measuring acceptance rates (33%), developer satisfaction (72%), and productivity gains (median 20% time reduction) across 400+ developers." +} +\ No newline at end of file diff --git a/papers/cursor-speed-quality-tradeoff-2025/paper_type.json b/papers/cursor-speed-quality-tradeoff-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs observational analysis on real-world open-source projects using statistical methods (Panel GMM) to measure quantitative effects of Cursor AI adoption on code velocity and quality metrics." +} +\ No newline at end of file diff --git a/papers/data-contamination-benchmarks-2023/paper_type.json b/papers/data-contamination-benchmarks-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes and empirically validates two methods (retrieval-based search and TS-Guessing protocol) for detecting data contamination in existing benchmarks, reporting quantitative findings (52% exact match on MMLU, fine-tuning validation) that constitute the primary contribution." 
+} +\ No newline at end of file diff --git a/papers/data-distributional-properties-2022/paper_type.json b/papers/data-distributional-properties-2022/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs controlled experiments manipulating training data distributional properties and measures their effects on in-context learning emergence across architectures, with quantitative results on how burstiness, class distribution, and dynamics drive emergent capabilities." +} +\ No newline at end of file diff --git a/papers/database-perspective-llm-2025/paper_type.json b/papers/database-perspective-llm-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "A tutorial paper that surveys and organizes existing LLM inference systems and techniques from a database perspective, synthesizing the field rather than running new experiments or creating benchmarks." +} +\ No newline at end of file diff --git a/papers/datadreamer-tool-synthetic-2024/paper_type.json b/papers/datadreamer-tool-synthetic-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "Proposes and positions a tool (DataDreamer) with feature comparison to alternatives, but lacks empirical evaluation on benchmarks or datasets, making it prescriptive rather than experimental." +} +\ No newline at end of file diff --git a/papers/datasentinel-gametheoretic-detection-2025/paper_type.json b/papers/datasentinel-gametheoretic-detection-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces a game-theoretic detection method and reports quantitative experimental results across 9 attacks, 7 NLP tasks, and 6 LLMs with FPR/FNR metrics compared against 6 baselines." 
+} +\ No newline at end of file diff --git a/papers/datasetresearch-benchmarking-agent-2025/paper_type.json b/papers/datasetresearch-benchmarking-agent-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces the DATASETRESEARCH benchmark with 208 real-world dataset demands as the primary contribution; agent evaluation results are baseline findings using the new benchmark." +} +\ No newline at end of file diff --git a/papers/dear-diary-rct-copilot-2024/paper_type.json b/papers/dear-diary-rct-copilot-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs a randomized controlled trial (N=106) with quantitative statistical results and qualitative findings on the impact of GenAI coding tools." +} +\ No newline at end of file diff --git a/papers/dear-novel-deep-2022/paper_type.json b/papers/dear-novel-deep-2022/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces DEAR, a novel deep learning approach for program repair, and validates it through experiments on standard benchmarks (Defects4J, CPatMiner) with quantitative results showing improvements over baselines." +} +\ No newline at end of file diff --git a/papers/declarative-agentic-layer-2026/paper_type.json b/papers/declarative-agentic-layer-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "Proposes a declarative architectural framework (DALIA) and argues a viewpoint about MAS design without empirical validation or mathematical proofs, using only an illustrative scenario." 
+} +\ No newline at end of file diff --git a/papers/decoding-latent-attack-2025/paper_type.json b/papers/decoding-latent-attack-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper conducts controlled experiments testing HTML-based prompt injection attacks on LLMs, reporting quantitative success rates and divergence metrics across 140 test cases and 8 attack techniques." +} +\ No newline at end of file diff --git a/papers/decoding-ml-decision-2026/paper_type.json b/papers/decoding-ml-decision-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Paper presents GEARS framework with quantitative experimental results comparing accuracy against baselines (94% vs 77% vs 68%), includes ablation studies, and reports deployment results across 9 Meta product surfaces, making experimental findings the primary contribution." +} +\ No newline at end of file diff --git a/papers/decomposed-prompting-modular-2022/paper_type.json b/papers/decomposed-prompting-modular-2022/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes Decomposed Prompting and experimentally validates it against baselines (CoT, Least-to-Most) across multiple benchmark datasets, with the primary contribution being the empirical findings that the method outperforms existing approaches." +} +\ No newline at end of file diff --git a/papers/deep-dive-into-2024-2/paper_type.json b/papers/deep-dive-into-2024-2/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes Toggle framework for bug localization/repair and validates it with quantitative experiments on CodeXGLUE and Defects4J benchmarks, reporting SOTA results and empirical findings about prompt design effects." 
+} +\ No newline at end of file diff --git a/papers/deep-dive-into-2024/paper_type.json b/papers/deep-dive-into-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Primary contribution is experimental findings about types and causes of LLM code generation mistakes through systematic analysis on existing benchmarks, with quantitative results (F1 scores, error percentages)." +} +\ No newline at end of file diff --git a/papers/deep-dive-into-2025/paper_type.json b/papers/deep-dive-into-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper conducts systematic experiments evaluating RAG techniques across 26 LLMs on a codebase, reporting quantitative results (CodeBLEU scores, performance metrics) with the primary contribution being empirical findings about retrieval method effectiveness." +} +\ No newline at end of file diff --git a/papers/deepcircuitx-comprehensive-repositorylevel-2025/paper_type.json b/papers/deepcircuitx-comprehensive-repositorylevel-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "DeepCircuitX introduces a comprehensive RTL dataset with 4,000+ projects and multi-level annotations; while it includes fine-tuning experiments, the primary contribution is the dataset itself, not the experimental findings." +} +\ No newline at end of file diff --git a/papers/deepcode-open-agentic-2025/paper_type.json b/papers/deepcode-open-agentic-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces a framework (DeepCode) and validates it with quantitative experiments on PaperBench Code-Dev, including comparative baselines and ablation studies; the primary contribution is the experimental findings of the system's performance." 
+} +\ No newline at end of file diff --git a/papers/deepcrceval-revisiting-evaluation-2024/paper_type.json b/papers/deepcrceval-revisiting-evaluation-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The paper introduces DeepCRCEval, a new evaluation framework with 9 domain-specific criteria for assessing code review comment generation, and validates it empirically against existing approaches." +} +\ No newline at end of file diff --git a/papers/deepreview-improving-llmbased-2025/paper_type.json b/papers/deepreview-improving-llmbased-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a framework and trains a model on a synthetic dataset, then reports experimental results (MSE improvements, win rates vs baselines) demonstrating empirical effectiveness." +} +\ No newline at end of file diff --git a/papers/deepseek-coder-2024/paper_type.json b/papers/deepseek-coder-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs extensive experiments on multiple benchmarks (HumanEval, MBPP, DS-1000) and reports quantitative performance metrics and ablation studies validating the model's capabilities." +} +\ No newline at end of file diff --git a/papers/deepseek-coder-v2-2024/paper_type.json b/papers/deepseek-coder-v2-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces a model and validates it through quantitative experiments on established benchmarks (HumanEval, MBPP+, MATH, LiveCodeBench), with the primary contribution being the reported performance results." 
+} +\ No newline at end of file diff --git a/papers/deepseek-r1-2025/paper_type.json b/papers/deepseek-r1-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper demonstrates through experiments on standard benchmarks (AIME, MATH-500, ChatbotArena) that an RL-based training pipeline produces models with strong reasoning capabilities; the primary contribution is the experimental findings validating this approach." +} +\ No newline at end of file diff --git a/papers/defects4c-benchmarking-large-2025/paper_type.json b/papers/defects4c-benchmarking-large-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The paper's primary contribution is introducing Defects4C, a new C/C++ bug benchmark for evaluating LLM repair; the experiments serve to establish baselines and demonstrate benchmark difficulty rather than propose novel repair methods." +} +\ No newline at end of file diff --git a/papers/defending-against-indirect-2024/paper_type.json b/papers/defending-against-indirect-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes spotlighting techniques for defending against prompt injection attacks and validates them experimentally on GPT models with quantitative results on standard NLP benchmarks (SQuAD, IMDB, SuperGLUE)." +} +\ No newline at end of file diff --git a/papers/defending-against-prompt-2025-2/paper_type.json b/papers/defending-against-prompt-2025-2/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes DataFilter defense method and reports quantitative experimental results across three existing benchmarks (SEP, InjecAgent, AgentDojo) comparing security and utility metrics against baselines." 
+} +\ No newline at end of file diff --git a/papers/defending-against-prompt-2025/paper_type.json b/papers/defending-against-prompt-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces DefensiveToken defense method and validates it through extensive quantitative experiments on TaskTracker benchmark (31K samples) across multiple models, reporting success rates and comparative metrics." +} +\ No newline at end of file diff --git a/papers/defending-aipowered-commerce-2025/paper_type.json b/papers/defending-aipowered-commerce-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "Proposes a conceptual security framework for AI-powered commerce with prescriptive defense mechanisms, but provides no empirical evaluation, implementation, or validation." +} +\ No newline at end of file diff --git a/papers/defense-against-indirect-2026/paper_type.json b/papers/defense-against-indirect-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes defense mechanisms and validates them experimentally on the AgentDojo benchmark with three LLMs, reporting quantitative metrics (ASR and Risk) as primary contributions." +} +\ No newline at end of file diff --git a/papers/defense-against-prompt-2024/paper_type.json b/papers/defense-against-prompt-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes defense methods and validates them through quantitative experiments, measuring attack success rates (ASR) across multiple attack scenarios and comparing against existing defenses." 
+} +\ No newline at end of file diff --git a/papers/defense-against-prompt-2025/paper_type.json b/papers/defense-against-prompt-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes and experimentally validates a defense method (mixture of encodings) against prompt injection attacks, reporting quantitative results (0-1.5% ASR) across multiple attack datasets and models." +} +\ No newline at end of file diff --git a/papers/defense-massive-false-2022/paper_type.json b/papers/defense-massive-false-2022/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a deep learning methodology for detecting false data injection attacks and reports quantitative experimental results (>95% detection accuracy, ~80-86% localization) on IEEE test systems compared against baselines." +} +\ No newline at end of file diff --git a/papers/defensive-prompt-patch-2024/paper_type.json b/papers/defensive-prompt-patch-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces DPP defense method and validates it through experiments on multiple LLM models, reporting quantitative results (ASR, Win-Rate) across 7 jailbreak attack types compared to existing baselines." +} +\ No newline at end of file diff --git a/papers/dehallucinator-mitigating-llm-2024/paper_type.json b/papers/dehallucinator-mitigating-llm-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a method (De-Hallucinator) and validates it through experiments reporting quantitative improvements (23.3–50.6% edit distance, 63.2% test failure fixes) across multiple LLMs and tasks." 
+} +\ No newline at end of file diff --git a/papers/demonstratesearchpredict-composing-retrieval-2022/paper_type.json b/papers/demonstratesearchpredict-composing-retrieval-2022/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes the DSP framework and validates it through experiments on multiple benchmarks (Open-SQuAD, HotPotQA, QReCC), reporting quantitative performance gains against baselines." +} +\ No newline at end of file diff --git a/papers/deployabilitycentric-infrastructureascode-generation-2025/paper_type.json b/papers/deployabilitycentric-infrastructureascode-generation-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs controlled experiments evaluating LLM performance on IaC generation tasks, reports quantitative results on deployment success and compliance rates, and demonstrates the effectiveness of an iterative feedback framework across multiple models." +} +\ No newline at end of file diff --git a/papers/deputydev-ai-powered-2025/paper_type.json b/papers/deputydev-ai-powered-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts a controlled A/B experiment with quantitative metrics measuring DeputyDev's impact on code review time, with primary contribution being experimental findings across different PR sizes." +} +\ No newline at end of file diff --git a/papers/derag-blackbox-adversarial-2025/paper_type.json b/papers/derag-blackbox-adversarial-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes DeRAG method using Differential Evolution to generate adversarial suffixes for RAG systems and reports experimental results with quantitative metrics (success rates, AUROC) on multiple benchmarks." 
+} +\ No newline at end of file diff --git a/papers/design-evaluation-assisted-2026/paper_type.json b/papers/design-evaluation-assisted-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Designs a system (BETR-GUI) and reports quantitative experimental results comparing human performance with vs without the system, including ablation studies measuring component contributions." +} +\ No newline at end of file diff --git a/papers/design-implementation-secure-2025/paper_type.json b/papers/design-implementation-secure-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper implements a specific RAG chatbot system and reports quantitative experimental results from testing against 674 adversarial prompts, measuring defense effectiveness (45% recall, 95% benign accuracy), with the primary contribution being the empirical findings about defense mechanisms rather than a novel benchmark or conceptual framework." +} +\ No newline at end of file diff --git a/papers/designbench-comprehensive-benchmark-2025/paper_type.json b/papers/designbench-comprehensive-benchmark-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The primary contribution is DesignBench, a new benchmark dataset with 900 webpage samples for evaluating MLLM front-end code generation; model evaluations are baseline demonstrations of the benchmark." +} +\ No newline at end of file diff --git a/papers/designing-llmbased-multiagent-2025/paper_type.json b/papers/designing-llmbased-multiagent-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Systematic meta-analysis of 94 papers extracting and synthesizing design patterns, quality attributes, and task categories for LLM-based multi-agent systems—primary contribution is field synthesis and categorization." 
+} +\ No newline at end of file diff --git a/papers/detecting-adversarial-finetuning-2025/paper_type.json b/papers/detecting-adversarial-finetuning-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper introduces auditing agents to detect adversarial fine-tuning and validates them experimentally with quantitative results (1400+ audits, 56.2% TPR across 8 attack types), making the primary contribution empirical findings rather than a benchmark dataset or framework." +} +\ No newline at end of file diff --git a/papers/detecting-benchmark-contamination-2025/paper_type.json b/papers/detecting-benchmark-contamination-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a watermarking-based contamination detection method and validates it through experiments with quantitative results (p-values, accuracy measurements) on benchmark contamination scenarios." +} +\ No newline at end of file diff --git a/papers/detecting-correcting-hallucinations-code-2026/paper_type.json b/papers/detecting-correcting-hallucinations-code-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Primary contribution is experimental validation of an AST-based detection framework with quantitative results (precision, recall, correction rates) on a curated dataset, analyzing performance systematically by error type and library." +} +\ No newline at end of file diff --git a/papers/detecting-proxy-gaming-2025/paper_type.json b/papers/detecting-proxy-gaming-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments to validate a detection method (EST) with quantitative results (precision/recall metrics) on annotated datasets, making experimental findings the primary contribution." 
+} +\ No newline at end of file diff --git a/papers/detecting-silent-failures-2025/paper_type.json b/papers/detecting-silent-failures-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The paper's primary contribution is curating and introducing two new benchmark datasets (4,275 and 894 traces) for detecting silent failures in multi-agentic systems, with experiments validating the benchmark's utility." +} +\ No newline at end of file diff --git a/papers/detecting-sleeper-agents-2025/paper_type.json b/papers/detecting-sleeper-agents-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes a detection method and evaluates it experimentally, reporting quantitative results (accuracy, precision, recall) on a backdoored LLM, making the primary contribution experimental validation of their detection system." +} +\ No newline at end of file diff --git a/papers/detection-method-prompt-2025/paper_type.json b/papers/detection-method-prompt-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a detection method (DMPI-PMHFE) and validates it with quantitative experiments comparing against four baselines on three datasets, with ablation studies and deployment results across five LLMs." +} +\ No newline at end of file diff --git a/papers/detectlocalizerepair-unified-framework-2022/paper_type.json b/papers/detectlocalizerepair-unified-framework-2022/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Primary contribution is the CodeT5-DLR framework and its experimental validation across multiple debugging tasks with quantitative metrics; new datasets serve as evaluation medium, not the main contribution." 
+} +\ No newline at end of file diff --git a/papers/devbench-realistic-developerinformed-2026/paper_type.json b/papers/devbench-realistic-developerinformed-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces DevBench, a novel telemetry-driven benchmark of 1,800 code completion instances designed for ecological validity; model evaluations are secondary to the benchmark contribution." +} +\ No newline at end of file diff --git a/papers/developer-productivity-genai-2025/paper_type.json b/papers/developer-productivity-genai-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts an observational survey of 415 software practitioners using the SPACE framework and reports quantitative findings about GenAI's impact on developer productivity." +} +\ No newline at end of file diff --git a/papers/deveval-manuallyannotated-code-2024/paper_type.json b/papers/deveval-manuallyannotated-code-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces DevEval, a new manually-annotated code generation benchmark from real-world repositories; LLM evaluation is secondary to demonstrating the benchmark's construction and utility." +} +\ No newline at end of file diff --git a/papers/devil-details-emergent-2025/paper_type.json b/papers/devil-details-emergent-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs fine-tuning experiments on nine models, reports quantitative misalignment rates and correlation coefficients, and tests hypotheses about format constraints and coherence coupling—the primary contribution is experimental findings, not a new benchmark." 
+} +\ No newline at end of file diff --git a/papers/diagnostic-codes-ai-2025/paper_type.json b/papers/diagnostic-codes-ai-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Primary contribution is experimental validation on MIMIC-IV showing label leakage through ML models achieving high AUROC (0.97-0.98), with a secondary systematic review documenting prevalence in published work." +} +\ No newline at end of file diff --git a/papers/dialogue-injection-attack-2025/paper_type.json b/papers/dialogue-injection-attack-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces a novel jailbreak attack (DIA) and validates it experimentally with quantitative results on benchmarks (AdvBench) across multiple models and defenses, with the primary contribution being empirical findings about attack success rates." +} +\ No newline at end of file diff --git a/papers/disaggregation-reveals-hidden-2025/paper_type.json b/papers/disaggregation-reveals-hidden-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts experiments on subject-verb agreement benchmarks, disaggregates performance by condition, and reports quantitative findings about hidden training phases in language models." +} +\ No newline at end of file diff --git a/papers/disagreements-reasoning-how-2025/paper_type.json b/papers/disagreements-reasoning-how-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments measuring how reasoning processes affect persuasion in multi-agent LLM systems, reporting quantitative results (21% average increase) on objective tasks; the primary contribution is the experimental findings about Persuasion Duality, not a benchmark itself." 
+} +\ No newline at end of file diff --git a/papers/disentangling-causal-importance-2026/paper_type.json b/papers/disentangling-causal-importance-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Empirically investigates multi-expert LLM orchestration through analysis of routing patterns and gradient-based causal attribution, with primary contribution being quantitative findings about expert dynamics rather than a new benchmark or framework." +} +\ No newline at end of file diff --git a/papers/dissecting-swe-bench-leaderboard-2025/paper_type.json b/papers/dissecting-swe-bench-leaderboard-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "This is a meta-analysis of existing SWE-Bench submissions, profiling submitters and architectures rather than running new experiments or introducing a new benchmark." +} +\ No newline at end of file diff --git a/papers/dive-into-agent-2025/paper_type.json b/papers/dive-into-agent-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Evaluates 21 LLM agents under realistic conditions, measuring self-replication behavior with quantitative risk scores and success rates as the primary contribution." +} +\ No newline at end of file diff --git a/papers/dlap-deep-learning-2024/paper_type.json b/papers/dlap-deep-learning-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes DLAP framework and validates it through experiments on four C/C++ projects, reporting quantitative results (F1, MCC) across multiple baselines." +} +\ No newline at end of file diff --git a/papers/do-as-i-2025/paper_type.json b/papers/do-as-i-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper develops a jailbreak attack method and validates it through systematic experiments with quantitative results (ASR scores) and ablation studies against Gemini 1.5 Flash." 
+} +\ No newline at end of file diff --git a/papers/do-prompts-reshape-2025/paper_type.json b/papers/do-prompts-reshape-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Empirical study running experiments across multiple models (BERT, RoBERTa, GPT-2) and tasks to measure how prompts affect sentence-level representations, reporting quantitative findings." +} +\ No newline at end of file diff --git a/papers/do-we-truly-2025/paper_type.json b/papers/do-we-truly-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes ModelSwitch method and validates it through experiments on seven benchmarks with quantitative results (63.2% MMLU-Pro accuracy, 10.2pp improvement, 34% cost reduction); primary contribution is experimental findings, not the theoretical analysis provided as support." +} +\ No newline at end of file diff --git a/papers/does-ai-code-2025/paper_type.json b/papers/does-ai-code-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Observational study measuring effectiveness of AI code review tools across 178 GitHub repositories with quantitative metrics and SHAP analysis of predictive factors." +} +\ No newline at end of file diff --git a/papers/does-it-tie-2025/paper_type.json b/papers/does-it-tie-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes the Equall system for capitalization table verification and evaluates it experimentally on 4 datarooms, reporting quantitative results (F1 scores, speed metrics) against baselines." 
+} +\ No newline at end of file diff --git a/papers/does-prompt-formatting-2024/paper_type.json b/papers/does-prompt-formatting-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts controlled experiments testing prompt format effects across multiple models (GPT-3.5, GPT-4) on existing benchmarks (code translation, MMLU), with quantitative performance metrics as the primary contribution." +} +\ No newline at end of file diff --git a/papers/does-reasoning-introduce-2025/paper_type.json b/papers/does-reasoning-introduce-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experiments on the BBQ benchmark comparing reasoning-based LLMs, reports quantitative findings about bias and accuracy, and evaluates proposed mitigation methods (SfRP, ADBP)." +} +\ No newline at end of file diff --git a/papers/domaineval-autoconstructed-benchmark-2024/paper_type.json b/papers/domaineval-autoconstructed-benchmark-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Primary contribution is DOMAINEVAL, a new multi-domain code generation benchmark with 2454 subjects and automated construction pipeline; LLM evaluations validate the benchmark rather than being the primary contribution." +} +\ No newline at end of file diff --git a/papers/domainspecific-constitutional-ai-2025/paper_type.json b/papers/domainspecific-constitutional-ai-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs comparative experiments with quantitative results (31.7% improvement metrics, ablation studies, model-scale comparisons) to demonstrate the effectiveness of domain-specific constitutional principles for LLM safety." 
+} +\ No newline at end of file diff --git a/papers/dont-always-pick-2026/paper_type.json b/papers/dont-always-pick-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "theoretical", + "reason": "Primary contribution is the information-theoretic framework with formal theorems proving properties of ensemble selection; experiments validate the theory but are secondary to the mathematical analysis." +} +\ No newline at end of file diff --git a/papers/dover-interventiondriven-auto-2025/paper_type.json b/papers/dover-interventiondriven-auto-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes DoVer, a debugging framework, and validates it through experiments on existing benchmarks (GAIA, AssistantBench, GSMPlus), reporting quantitative recovery rates and validation metrics across multiple LLM systems." +} +\ No newline at end of file diff --git a/papers/dpo-superior-ppo-2024/paper_type.json b/papers/dpo-superior-ppo-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper provides theoretical analysis of DPO's distribution shift problem and validates conclusions through comprehensive empirical comparison of PPO and DPO across multiple benchmarks (HH-RLHF, SafeRLHF, APPS, CodeContest), making the primary contribution the experimental findings." +} +\ No newline at end of file diff --git a/papers/drawing-pandas-benchmark-2024/paper_type.json b/papers/drawing-pandas-benchmark-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The primary contribution is PandasPlotBench, a new 175-task human-curated benchmark for evaluating LLMs on plotting code generation; baseline model evaluations are secondary." 
+} +\ No newline at end of file diff --git a/papers/drccoder-automated-drc-2024/paper_type.json b/papers/drccoder-automated-drc-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper presents a multi-agent system (DRC-Coder) and reports quantitative experimental results (F1 scores, timing metrics) with baseline comparisons and ablation studies to validate its effectiveness." +} +\ No newline at end of file diff --git a/papers/drex-benchmark-detecting-2025/paper_type.json b/papers/drex-benchmark-detecting-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces D-REX, a new benchmark with 8,162 adversarial samples for detecting deceptive reasoning in LLMs; primary contribution is the benchmark itself, with experimental validation on frontier models." +} +\ No newline at end of file diff --git a/papers/drift-dynamic-rulebased-2025/paper_type.json b/papers/drift-dynamic-rulebased-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes DRIFT and validates it through systematic experiments on AgentDojo, reporting quantitative results (ASR reduction from 30.7% to 1.3%), utility metrics (58.5% vs 38.4%), and generalization across 5 LLMs—primary contribution is experimental evidence of effectiveness." +} +\ No newline at end of file diff --git a/papers/drip-defending-prompt-2025/paper_type.json b/papers/drip-defending-prompt-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a novel defense method (DRIP) and validates its effectiveness through experiments on established benchmarks, reporting quantitative attack success rates." 
+} +\ No newline at end of file diff --git a/papers/driving-style-alignment-2024/paper_type.json b/papers/driving-style-alignment-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Develops and experimentally validates a multi-alignment framework in CARLA simulation with quantitative metrics (collision rates, speeds, behavior) and human evaluation (n=259), with the primary contribution being the experimental findings about alignment effectiveness." +} +\ No newline at end of file diff --git a/papers/dscodebench-realistic-benchmark-2025/paper_type.json b/papers/dscodebench-realistic-benchmark-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The primary contribution is DSCodeBench itself—a new 1,000-problem benchmark for data science code generation with extensive test suites—while LLM evaluation serves to validate and characterize the benchmark's difficulty." +} +\ No newline at end of file diff --git a/papers/dspy-compiling-declarative-2023/paper_type.json b/papers/dspy-compiling-declarative-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces DSPy framework and validates it through extensive experiments on GSM8K and HotPotQA, reporting quantitative improvements across multiple models (GPT-3.5, llama2-13b-chat, T5-Large)." +} +\ No newline at end of file diff --git a/papers/dual-latent-memory-2026/paper_type.json b/papers/dual-latent-memory-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a method (L2-VMAS) and validates it through extensive experiments across multiple VLM backbones and topologies, with quantitative results on accuracy and token usage improvements as the primary contribution." 
+} +\ No newline at end of file diff --git a/papers/dynacode-dynamic-complexityaware-2025/paper_type.json b/papers/dynacode-dynamic-complexityaware-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces DynaCode, a dynamic benchmark generating ~189M Python problems with varying complexity levels, with experiments demonstrating the benchmark's effectiveness at resisting memorization rather than reporting primary experimental findings." +} +\ No newline at end of file diff --git a/papers/dynafix-iterative-automated-2025/paper_type.json b/papers/dynafix-iterative-automated-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes DynaFix method, runs experiments on Defects4J benchmark, and reports quantitative results (186 bugs fixed, 17 more than next-best baseline) with ablation study validating the approach." +} +\ No newline at end of file diff --git a/papers/dynamic-benchmarking-reasoning-2025/paper_type.json b/papers/dynamic-benchmarking-reasoning-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces DyCodeEval, a new dynamic benchmarking framework with a four-agent pipeline for generating contamination-resistant test variants, plus a novel DyPass@K metric; experiments validate the benchmark's robustness rather than constituting the primary contribution." +} +\ No newline at end of file diff --git a/papers/dynamic-memory-management-2025/paper_type.json b/papers/dynamic-memory-management-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper implements a SYCL port of a GPU memory allocator and reports quantitative performance benchmarks comparing it against the original CUDA version." 
+} +\ No newline at end of file diff --git a/papers/dynamic-mix-precision-2026/paper_type.json b/papers/dynamic-mix-precision-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a dynamic precision routing method and validates it with quantitative experiments on ALFWorld benchmark, reporting performance comparisons and efficiency gains." +} +\ No newline at end of file diff --git a/papers/early-approaches-adversarial-2025/paper_type.json b/papers/early-approaches-adversarial-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs controlled experiments testing adversarial fine-tuning as a prompt injection defense across multiple GPT-3 model sizes, reporting quantitative attack success rates and vulnerability correlations." +} +\ No newline at end of file diff --git a/papers/early-categorization-prompt-2024/paper_type.json b/papers/early-categorization-prompt-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Uses qualitative meta-analysis methodology to synthesize and categorize 17 existing prompt injection attack variations into a taxonomy, with no new experiments or benchmarks." +} +\ No newline at end of file diff --git a/papers/ecogym-evaluating-llms-2026/paper_type.json b/papers/ecogym-evaluating-llms-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "EcoGym introduces three new economic simulation environments as a long-horizon planning benchmark, with LLM evaluations serving to demonstrate the benchmark's utility rather than being the primary contribution." 
+} +\ No newline at end of file diff --git a/papers/ecomstage-stagewise-orientationspecific-2026/paper_type.json b/papers/ecomstage-stagewise-orientationspecific-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces EComStage, a novel benchmarking framework that decomposes e-commerce tasks into stage-wise (Perception, Planning, Action) and orientation-specific (customer/merchant) dimensions, evaluated on 33 LLMs." +} +\ No newline at end of file diff --git a/papers/economics-ai-inference-2025/paper_type.json b/papers/economics-ai-inference-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes a framework but validates it through experiments on 9 LLMs, with primary contributions being empirical findings about cost-performance tradeoffs, concurrency effects, and cost-quality Pareto frontiers." +} +\ No newline at end of file diff --git a/papers/edge-memorization-diffusion-2025/paper_type.json b/papers/edge-memorization-diffusion-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "theoretical", + "reason": "Primary contribution is deriving tight mathematical approximations for memorization vs. generalization crossover points in diffusion models, with experiments serving to validate the theoretical predictions." +} +\ No newline at end of file diff --git a/papers/editflow-benchmarking-optimizing-2026/paper_type.json b/papers/editflow-benchmarking-optimizing-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes and experimentally validates an optimization approach (prompt auto-tuning, post-processing wrapper) with quantitative results from an RCT (n=32 participants) showing 87.26% accuracy, 66.99% precision improvement, and statistically significant speedup; benchmarking is the methodology, not the primary contribution." 
+} +\ No newline at end of file diff --git a/papers/effective-lora-adapter-2026/paper_type.json b/papers/effective-lora-adapter-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes LORAUTER framework and validates it through quantitative experiments on benchmarks (LLaMA2-7B), reporting performance metrics, comparisons, and ablations." +} +\ No newline at end of file diff --git a/papers/effectively-leveraging-execution-2025/paper_type.json b/papers/effectively-leveraging-execution-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Reports quantitative experimental results across multiple model-dataset configurations measuring the effectiveness of execution traces for program repair, with no new benchmark introduced." +} +\ No newline at end of file diff --git a/papers/effectiveness-llmasajudge-code-2025/paper_type.json b/papers/effectiveness-llmasajudge-code-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper experimentally evaluates 8 LLMs as judges for code generation and summarization tasks, reporting quantitative metrics on judgment accuracy, agreement rates with human evaluations, and systematic biases." +} +\ No newline at end of file diff --git a/papers/efficient-guided-generation-2023/paper_type.json b/papers/efficient-guided-generation-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "theoretical", + "reason": "The primary contribution is the formal algorithmic reformulation of guided generation as FSM transitions with O(N) to O(1) indexing, with empirical validation being secondary and limited to single-run comparisons." 
+} +\ No newline at end of file diff --git a/papers/efficient-jailbreak-mitigation-2025/paper_type.json b/papers/efficient-jailbreak-mitigation-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a jailbreak defense method and validates it through experiments, reporting quantitative accuracy metrics and comparing against baselines on held-out test sets." +} +\ No newline at end of file diff --git a/papers/efficient-knowledge-infusion-2024/paper_type.json b/papers/efficient-knowledge-infusion-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes the ELPF framework and validates it through experiments on existing benchmarks (CMedQA, BioASQ) with quantitative results (ROUGE-L improvements), ablation studies, and human evaluation of the approach's impact." +} +\ No newline at end of file diff --git a/papers/efficient-strategy-finetuning-2026/paper_type.json b/papers/efficient-strategy-finetuning-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs controlled experiments comparing multiple fine-tuning strategies (DSS, LoRA, QLoRA, full-precision) with quantitative measurements of performance and GPU memory costs, making experimental findings the primary contribution." +} +\ No newline at end of file diff --git a/papers/efficient-switchable-safety-2025/paper_type.json b/papers/efficient-switchable-safety-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a co-training method for LLM safety control and validates it through quantitative experiments on benchmarks, reporting concrete performance metrics (97.55% safety score) and comparing against baselines." 
+} +\ No newline at end of file diff --git a/papers/effilearner-enhancing-efficiency-2024/paper_type.json b/papers/effilearner-enhancing-efficiency-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes EFFI-LEARNER method and validates it through experiments on 22 LLMs across multiple benchmarks, reporting quantitative efficiency gains and pass@1 metrics." +} +\ No newline at end of file diff --git a/papers/ella-equip-diffusion-2024/paper_type.json b/papers/ella-equip-diffusion-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "ELLA is a new method (TSC connector) with primary contribution being experimental validation showing state-of-the-art quantitative results on semantic alignment benchmarks; DPG-Bench is a secondary contribution used to evaluate the method." +} +\ No newline at end of file diff --git a/papers/embedguard-crosslayer-detection-2026/paper_type.json b/papers/embedguard-crosslayer-detection-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes and experimentally validates a detection framework (EmbedGuard) for adversarial attacks, reporting quantitative results (94.7%/89.3% detection rates, 51ms latency) with ablation studies on a large corpus; the primary contribution is the experimental findings, not the benchmark itself." +} +\ No newline at end of file diff --git a/papers/emergent-abilities-large-2022/paper_type.json b/papers/emergent-abilities-large-2022/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Catalogs and meta-analyzes emergent abilities across existing language model families and benchmarks, synthesizing patterns of performance jumps at scale rather than running novel experiments." 
+} +\ No newline at end of file diff --git a/papers/emergent-abilities-mirage-2023/paper_type.json b/papers/emergent-abilities-mirage-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper conducts a quantitative analysis of benchmark data from BIG-Bench to demonstrate that claimed emergent abilities are artifacts of metric choice, with the primary contribution being empirical findings about how performance metrics scale differently for LLMs." +} +\ No newline at end of file diff --git a/papers/emergent-abilities-survey-2025/paper_type.json b/papers/emergent-abilities-survey-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "The title explicitly identifies it as a survey, uses meta-analysis methodology, and synthesizes existing research on emergent abilities in LLMs rather than introducing new benchmarks or conducting primary experiments." +} +\ No newline at end of file diff --git a/papers/emergent-misalignment-easy-2026/paper_type.json b/papers/emergent-misalignment-easy-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts finetuning experiments on LLMs with narrowly harmful datasets and reports quantitative results on misalignment rates and coherency, with primary contribution being experimental findings about how misalignment emerges." +} +\ No newline at end of file diff --git a/papers/emperors-new-clothes-2025/paper_type.json b/papers/emperors-new-clothes-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper systematically evaluates 20 mitigation strategies across 10 LLMs and 5 benchmarks under contamination conditions, with the primary contribution being experimental findings that no strategy significantly outperforms the baseline." 
+} +\ No newline at end of file diff --git a/papers/empirical-analysis-large-2024/paper_type.json b/papers/empirical-analysis-large-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts experiments testing goal hijacking attacks on multiple vision-language models and reports quantitative findings (attack success rates, correlation coefficients, and defense effectiveness measurements)." +} +\ No newline at end of file diff --git a/papers/empirical-evaluation-large-2025/paper_type.json b/papers/empirical-evaluation-large-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs comparative experiments on LLMs using existing APR benchmarks, reporting quantitative findings about model performance and prompt design effects." +} +\ No newline at end of file diff --git a/papers/empirical-study-bugs-2026/paper_type.json b/papers/empirical-study-bugs-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Analyzes 998 real bug reports from CrewAI and LangChain frameworks, identifying quantitative patterns across root cause and symptom categories with specific prevalence percentages—the primary contribution is empirical findings, not a new benchmark or literature synthesis." +} +\ No newline at end of file diff --git a/papers/empirical-study-design-llm-code-2025/paper_type.json b/papers/empirical-study-design-llm-code-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "The paper conducts meta-analysis of existing LLM code generation studies (13 papers) to extract and synthesize design patterns into a reference framework, which is primarily a systematic review contribution." 
+} +\ No newline at end of file diff --git a/papers/empirical-study-generative-2025/paper_type.json b/papers/empirical-study-generative-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts a survey of 204 practitioners across 37 countries and reports quantitative empirical findings on GenAI adoption rates, use cases, perceived benefits, challenges, and workforce expectations." +} +\ No newline at end of file diff --git a/papers/empirical-study-retrievalaugmented-2025/paper_type.json b/papers/empirical-study-retrievalaugmented-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Empirical study that runs experiments comparing retrieval-augmented frameworks and retrieval techniques across multiple code generation models and existing datasets, reporting quantitative performance results." +} +\ No newline at end of file diff --git a/papers/empirical-study-unit-2024/paper_type.json b/papers/empirical-study-unit-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs comparative experiments on LLMs vs traditional tools for unit test generation, reporting quantitative results on test coverage, syntactic validity rates, and prompt design effects." +} +\ No newline at end of file diff --git a/papers/empowering-business-transformation-2023/paper_type.json b/papers/empowering-business-transformation-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "This is explicitly a systematic literature review that synthesizes and maps existing generative AI research and applications to a software product management framework, making the primary contribution a synthesis of existing work rather than new experiments, benchmarks, or formal analysis." 
+} +\ No newline at end of file diff --git a/papers/empowering-lowresource-languages-2025/paper_type.json b/papers/empowering-lowresource-languages-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes the TraSe architecture and validates it experimentally with quantitative results (34-63% accuracy) against multiple baselines on a Bangla QA benchmark; primary contribution is the empirical findings." +} +\ No newline at end of file diff --git a/papers/energyaware-routing-large-2025/paper_type.json b/papers/energyaware-routing-large-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "theoretical", + "reason": "Introduces mathematical formulations and proofs about operating regimes for energy-aware routing using Brownian motion diffusion approximation, with primary contribution being formal analysis rather than experiments." +} +\ No newline at end of file diff --git a/papers/engineering-multiagent-llms-2025/paper_type.json b/papers/engineering-multiagent-llms-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes and experimentally validates SEMAP methodology through quantitative benchmarking across multiple domains, with concrete failure reduction metrics compared against MetaGPT baseline." +} +\ No newline at end of file diff --git a/papers/enhancing-android-malware-2025/paper_type.json b/papers/enhancing-android-malware-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes AgenticRAG and validates it through experiments on an 18,000-sample dataset, reporting quantitative accuracy results as the primary contribution." 
+} +\ No newline at end of file diff --git a/papers/enhancing-automated-program-2023/paper_type.json b/papers/enhancing-automated-program-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments fine-tuning neural models (PLBART, CodeT5) and evaluating them with zero-shot/few-shot prompting on existing code repair datasets, reporting quantitative improvements in accuracy metrics." +} +\ No newline at end of file diff --git a/papers/enhancing-automated-program-2025/paper_type.json b/papers/enhancing-automated-program-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes TokenRepair method and reports quantitative experimental results on Defects4J and HumanEval-Java benchmarks with ablation studies, demonstrating performance improvements over baselines." +} +\ No newline at end of file diff --git a/papers/enhancing-code-generation-2025/paper_type.json b/papers/enhancing-code-generation-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Reports quantitative performance metrics (pass@1 percentages) for multiple LLM sizes and techniques (fine-tuning, in-context learning) across low-resource programming languages, with conclusions grounded in experimental evidence." +} +\ No newline at end of file diff --git a/papers/enhancing-code-translation-2024/paper_type.json b/papers/enhancing-code-translation-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs controlled experiments comparing RAG-based few-shot learning versus zero-shot code translation across multiple LLMs, reporting quantitative CodeBLEU results as its primary contribution." 
+} +\ No newline at end of file diff --git a/papers/enhancing-crosslanguage-code-2025/paper_type.json b/papers/enhancing-crosslanguage-code-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes a method (S-InfoNCE embedding alignment) and validates it experimentally across four models on code translation benchmarks, reporting quantitative improvements of 14-15% without fine-tuning." +} +\ No newline at end of file diff --git a/papers/enhancing-llm-code-2025/paper_type.json b/papers/enhancing-llm-code-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Systematically evaluates multi-agent and debugging approaches on HumanEval/HumanEval+ across 19 models, reporting quantitative accuracy improvements and comparative performance findings as the primary contribution." +} +\ No newline at end of file diff --git a/papers/enhancing-llm-factual-2024/paper_type.json b/papers/enhancing-llm-factual-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments on a RAG pipeline with LLaMA-2-7B, reports quantitative results (F1, Cosine similarity metrics), and analyzes how fine-tuning affects factual accuracy—primary contribution is experimental findings, not benchmark creation." +} +\ No newline at end of file diff --git a/papers/enhancing-llmbased-quantum-2025/paper_type.json b/papers/enhancing-llmbased-quantum-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs controlled experiments comparing multi-agent techniques (CoT, SCoT, RAG, QEC) on quantum code generation and reports quantitative results (~40pp improvement for structured CoT, ~4% for RAG) on a custom test suite." 
+} +\ No newline at end of file diff --git a/papers/enhancing-security-large-2025/paper_type.json b/papers/enhancing-security-large-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "This narrative review synthesizes existing prompt injection attacks, defenses, and benchmarks from the literature rather than contributing new empirical results, benchmarks, or theoretical analysis." +} +\ No newline at end of file diff --git a/papers/enhancing-software-quality-2023/paper_type.json b/papers/enhancing-software-quality-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "Proposes a high-level architecture for AI-assisted code review with prescriptive claims, but provides no empirical evaluation—only literature review and illustrative examples." +} +\ No newline at end of file diff --git a/papers/enterprise-ai-coding-requirements-2026/paper_type.json b/papers/enterprise-ai-coding-requirements-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper's primary contribution is original observational/qualitative research with 57 enterprise developers, with a secondary companion meta-analysis of prior work, making the original empirical study the dominant contribution." +} +\ No newline at end of file diff --git a/papers/episodic-memories-generation-2025/paper_type.json b/papers/episodic-memories-generation-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces a new episodic memory evaluation benchmark for LLMs with baseline experiments on state-of-the-art models; the primary contribution is the benchmark framework itself." 
+} +\ No newline at end of file diff --git a/papers/epistemic-alignment-mediating-2025/paper_type.json b/papers/epistemic-alignment-mediating-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "Proposes a conceptual framework (Epistemic Alignment with ten challenges) derived from philosophical epistemology to address user-LLM knowledge delivery, with empirical validation as supporting evidence rather than primary contribution." +} +\ No newline at end of file diff --git a/papers/equinox-holistic-fair-2025/paper_type.json b/papers/equinox-holistic-fair-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a fairness scheduling framework and reports quantitative experimental results (throughput, latency, fairness improvements) across multiple LLM serving systems using existing benchmarks." +} +\ No newline at end of file diff --git a/papers/esapiens-platform-secure-2025/paper_type.json b/papers/esapiens-platform-secure-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs controlled experiments evaluating a RAG platform's retrieval and generation performance across multiple benchmarks and LLMs, with quantitative metrics as the primary contribution." +} +\ No newline at end of file diff --git a/papers/eva-redteaming-gui-2025/paper_type.json b/papers/eva-redteaming-gui-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Primary contribution is quantitative experimental findings (ASR metrics across six agents) demonstrating the effectiveness of the EVA red-teaming framework, not the creation of a benchmark for community use." 
+} +\ No newline at end of file diff --git a/papers/eval-benchmarking-llm-agents-survey-2025/paper_type.json b/papers/eval-benchmarking-llm-agents-survey-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "This is explicitly a survey paper that proposes a taxonomy for LLM agent evaluation and catalogues existing metrics, benchmarks, and frameworks—the primary contribution is synthesis and organization of the field." +} +\ No newline at end of file diff --git a/papers/evaluating-diverse-large-2023/paper_type.json b/papers/evaluating-diverse-large-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper introduces LIBRO and runs experiments evaluating 15 LLMs on their ability to reproduce bugs, reporting quantitative results (success rates, ROC-AUC) on the Defects4J benchmark." +} +\ No newline at end of file diff --git a/papers/evaluating-efficiency-source-2024/paper_type.json b/papers/evaluating-efficiency-source-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experiments comparing LLMs (GPT-4, GPT-3.5, Code Llama, WizardCoder) on existing benchmarks (HumanEval, MBPP, LeetCode) and reports quantitative findings about code efficiency, correctness-efficiency correlation, and prompting strategies." +} +\ No newline at end of file diff --git a/papers/evaluating-embeddable-language-2025/paper_type.json b/papers/evaluating-embeddable-language-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments evaluating 6 models on a verbalization task and reports quantitative findings about factors affecting performance (justification order, context, model size), with the primary contribution being these experimental results rather than the benchmark itself." 
+} +\ No newline at end of file diff --git a/papers/evaluating-judges-as-2025/paper_type.json b/papers/evaluating-judges-as-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The paper introduces JETTS, a new benchmark for evaluating LLM-judges as test-time scaling evaluators, and reports comprehensive baseline results comparing different judge variants against reward models." +} +\ No newline at end of file diff --git a/papers/evaluating-language-models-2024/paper_type.json b/papers/evaluating-language-models-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The paper's primary contribution is introducing EVALPERF, a new benchmark with 121 performance-exercising coding tasks, along with DPE (Differential Performance Evaluation), an evaluation method for code efficiency—though it includes experimental baseline results on these resources." +} +\ No newline at end of file diff --git a/papers/evaluating-large-language-2024-2/paper_type.json b/papers/evaluating-large-language-2024-2/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Primary contribution is experimental findings on model generalization and robustness across 14 models and 6 datasets; the compression-based evaluation is the methodology, not the primary artifact." +} +\ No newline at end of file diff --git a/papers/evaluating-large-language-2025/paper_type.json b/papers/evaluating-large-language-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Primary contribution is experimental evaluation of LLM performance on code review tasks with quantitative results (accuracy percentages, correction rates) across multiple models and datasets." 
+} +\ No newline at end of file diff --git a/papers/evaluating-llm-alignment-2025/paper_type.json b/papers/evaluating-llm-alignment-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The paper introduces ALIGNEVAL, a new benchmark framework for assessing LLM alignment through evaluation capabilities, building on the empirical finding of generation-evaluation consistency." +} +\ No newline at end of file diff --git a/papers/evaluating-llm-reasoning-2025/paper_type.json b/papers/evaluating-llm-reasoning-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces SIEV, a novel dialectical evaluation framework for assessing LLM reasoning on existing benchmarks beyond correctness metrics." +} +\ No newline at end of file diff --git a/papers/evaluating-mitigating-errors-2025/paper_type.json b/papers/evaluating-mitigating-errors-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experiments evaluating LLM performance on API integration tasks and reports quantitative results showing constrained decoding improves correctness by 90-135%." +} +\ No newline at end of file diff --git a/papers/evaluating-reducing-deceptive-2025/paper_type.json b/papers/evaluating-reducing-deceptive-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The primary contribution is experimental findings on deceptive behavior in LLMs and quantitative results (r=0.788, 77.6% reduction) from running multi-turn RL experiments; the belief misalignment metric is a tool supporting the evaluation, not the primary contribution." 
+} +\ No newline at end of file diff --git a/papers/evaluating-robustness-chinchilla-2025/paper_type.json b/papers/evaluating-robustness-chinchilla-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "theoretical", + "reason": "The paper formally analyzes the mathematical robustness of Chinchilla's scaling law through structured perturbation analyses, examining how parameter uncertainties and systematic errors affect the scaling law estimates." +} +\ No newline at end of file diff --git a/papers/evaluation-code-llms-2024/paper_type.json b/papers/evaluation-code-llms-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Evaluates multiple code LLMs with quantitative metrics (pass@1 percentages) on geospatial code generation tasks, with the primary contribution being experimental findings about model performance and domain-specific gaps." +} +\ No newline at end of file diff --git a/papers/evaluation-cultural-value-2025/paper_type.json b/papers/evaluation-cultural-value-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experiments evaluating 10 LLMs across 20 countries with quantitative metrics (deviation ratios), reporting empirical findings about cultural value alignment rather than introducing a new benchmark as the primary contribution." +} +\ No newline at end of file diff --git a/papers/evaluation-impact-code-2025/paper_type.json b/papers/evaluation-impact-code-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Randomized controlled trial measuring the quantitative impact of Copilot on task completion time and code correctness in student developers; primary contribution is experimental findings." 
+} +\ No newline at end of file diff --git a/papers/evaluation-llm-code-2024/paper_type.json b/papers/evaluation-llm-code-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts experiments evaluating GPT-4o-mini on 14,346 existing Codewars challenges, reporting quantitative performance findings across languages and difficulty levels with SHAP feature importance analysis." +} +\ No newline at end of file diff --git a/papers/evaluation-llms-syntaxaware-2024/paper_type.json b/papers/evaluation-llms-syntaxaware-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces SAFIM, a new 17,720-example syntax-aware fill-in-the-middle benchmark across four languages; empirical results on model evaluations serve to validate and demonstrate the benchmark's utility." +} +\ No newline at end of file diff --git a/papers/everything-you-wanted-2025/paper_type.json b/papers/everything-you-wanted-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments evaluating LLM vulnerability detection, reports quantitative results (67% accuracy, precision ~0.8), analyzes false positive patterns, and tests scaling effects—primary contribution is experimental findings, not a new benchmark or survey." +} +\ No newline at end of file diff --git a/papers/evidence-phase-transitions-2025/paper_type.json b/papers/evidence-phase-transitions-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper trains a small transformer model and runs controlled experiments with statistical diagnostics to detect and quantify phase transitions during training, reporting quantitative empirical findings." 
+} +\ No newline at end of file diff --git a/papers/evocodebench-evolving-code-2024-2/paper_type.json b/papers/evocodebench-evolving-code-2024-2/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces EvoCodeBench, a new code generation benchmark designed to reduce data leakage in evaluation; empirical results on LLMs serve to validate the benchmark rather than being the primary contribution." +} +\ No newline at end of file diff --git a/papers/evocodebench-evolving-code-2024/paper_type.json b/papers/evocodebench-evolving-code-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The paper introduces EvoCodeBench, a new code generation benchmark aligned with real-world repositories; experimental results (performance drops, context effects) serve to validate and characterize the benchmark rather than being the primary contribution." +} +\ No newline at end of file diff --git a/papers/evogpt-leveraging-llmdriven-2025/paper_type.json b/papers/evogpt-leveraging-llmdriven-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Presents a new hybrid LLM-SBST system and validates it experimentally with quantitative results (9-11% improvements) on existing benchmarks (Defects4J), including ablation studies to isolate component contributions." +} +\ No newline at end of file diff --git a/papers/evolving-ai-longitudinal-2026/paper_type.json b/papers/evolving-ai-longitudinal-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Longitudinal observational study reporting quantitative findings from 800 developers' IDE telemetry and a 62-person survey on AI tool adoption effects." 
+} +\ No newline at end of file diff --git a/papers/evolving-excellence-automated-2025/paper_type.json b/papers/evolving-excellence-automated-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Presents Artemis, an evolutionary optimization platform, and reports quantitative experimental results (13.6-36.9% improvements) across multiple benchmarks, with primary contribution being the empirical findings about automated agent configuration tuning effectiveness." +} +\ No newline at end of file diff --git a/papers/ewallet-delivery-technology-2025/paper_type.json b/papers/ewallet-delivery-technology-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Systematic review of 12 existing articles synthesizing findings on e-wallet delivery technology options; primary contribution is comparative analysis of NFC, QR code, and SMS across prior work." +} +\ No newline at end of file diff --git a/papers/experepair-dualmemory-enhanced-2025/paper_type.json b/papers/experepair-dualmemory-enhanced-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces a dual-memory LLM-based repair method and validates it with quantitative results on SWE-Bench Lite, including ablation studies and baseline comparisons." +} +\ No newline at end of file diff --git a/papers/experimental-evidence-productivity-2023/paper_type.json b/papers/experimental-evidence-productivity-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Reports quantitative experimental findings from a preregistered RCT measuring productivity effects of ChatGPT on 444 professionals, with specific effect sizes and behavioral outcomes." 
+} +\ No newline at end of file diff --git a/papers/explainable-ai-software-2024/paper_type.json b/papers/explainable-ai-software-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "Proposes a conceptual three-layer XAI architecture and argues its benefits for developer-AI collaboration without empirical validation or formal theoretical analysis." +} +\ No newline at end of file diff --git a/papers/explainable-automated-debugging-2023/paper_type.json b/papers/explainable-automated-debugging-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes AutoSD method and validates it through quantitative experiments on established benchmarks (ARHE, Defects4J v2.0) plus a human study evaluating explanation quality." +} +\ No newline at end of file diff --git a/papers/explainable-finegrained-safeguarding-2025/paper_type.json b/papers/explainable-finegrained-safeguarding-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes XG-Guard method and validates it through extensive experiments across 6 datasets, 4 topologies, and 3 LLM backbones, reporting quantitative performance metrics (>90% AUC) and ablation studies." +} +\ No newline at end of file diff --git a/papers/exploring-adversarial-robustness-2024/paper_type.json b/papers/exploring-adversarial-robustness-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper conducts experimental comparisons of adversarial robustness across multiple neural image compression methods (JPEG AI, diffusion-based CDC) and evaluates defense mechanisms, with primary contributions being quantitative findings about robustness degradation and defense effectiveness." 
+} +\ No newline at end of file diff --git a/papers/exploring-aiaugmented-sensemaking-2026/paper_type.json b/papers/exploring-aiaugmented-sensemaking-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts a mixed-methods study with 16 healthcare professionals, measures quantitative outcomes (NASA-TLX workload) and qualitative perceptions, and reports experimental findings about LLM-augmented sensemaking." +} +\ No newline at end of file diff --git a/papers/exploring-code-language-2025/paper_type.json b/papers/exploring-code-language-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments fine-tuning a code model on an HLS dataset and reports quantitative results on syntax/functionality performance, with analysis of overfitting behavior—the primary contribution is experimental findings, not the dataset itself." +} +\ No newline at end of file diff --git a/papers/exploring-dataefficient-adaptation-2024/paper_type.json b/papers/exploring-dataefficient-adaptation-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes DEED method and validates it experimentally across five benchmarks and four LLMs, reporting quantitative improvements (46.2% relative gain) over baselines—the primary contribution is experimental findings demonstrating the method's effectiveness." +} +\ No newline at end of file diff --git a/papers/exploring-generalizable-automated-2025/paper_type.json b/papers/exploring-generalizable-automated-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts systematic experiments evaluating different LLMs for automated program repair across multiple programming languages, reporting quantitative pass@1/pass@5 metrics and analyzing the impact of different prompting strategies and techniques." 
+} +\ No newline at end of file diff --git a/papers/exploring-large-language-2024/paper_type.json b/papers/exploring-large-language-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Provides comprehensive taxonomy and catalog of 80+ existing LLM-based agent systems with classifications and definitions, synthesizing the field rather than introducing novel experiments, benchmarks, or theoretical contributions." +} +\ No newline at end of file diff --git a/papers/exploring-lifting-robustness-2024/paper_type.json b/papers/exploring-lifting-robustness-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experiments with metamorphic testing on four LLMs, reports quantitative results on robustness metrics (34.4%-48.5% instability), discovers correlations between perturbation and performance, and validates a readability improvement method with measurable enhancement (49.32%)." +} +\ No newline at end of file diff --git a/papers/exploring-parameterefficient-finetuning-2023/paper_type.json b/papers/exploring-parameterefficient-finetuning-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Primary contribution is experimental comparison of PEFT techniques (LoRA, QLoRA) on code generation across multiple benchmarks and LLM families, reporting quantitative results." +} +\ No newline at end of file diff --git a/papers/exploring-personadependent-llm-2025/paper_type.json b/papers/exploring-personadependent-llm-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs controlled experiments evaluating LLM moral decision-making across personas against the Moral Machine benchmark, reporting quantitative alignment metrics between models and human responses." 
+} +\ No newline at end of file diff --git a/papers/exploring-security-threats-2025/paper_type.json b/papers/exploring-security-threats-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments testing knowledge base poisoning attacks on RACG systems and reports quantitative findings (e.g., 48% code compromise, 60% similarity threshold) comparing different retrievers and LLMs." +} +\ No newline at end of file diff --git a/papers/exploring-synergy-between-2024/paper_type.json b/papers/exploring-synergy-between-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Described as a narrative review synthesizing existing literature on generative AI applications in software engineering, with no original empirical evidence presented." +} +\ No newline at end of file diff --git a/papers/explosive-growth-from-2023/paper_type.json b/papers/explosive-growth-from-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Explicitly a review of existing arguments and economic models on AI automation, synthesizing and evaluating counterarguments through meta-analysis rather than running new experiments or introducing novel theoretical contributions." +} +\ No newline at end of file diff --git a/papers/exposing-privacy-gaps-2024/paper_type.json b/papers/exposing-privacy-gaps-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Primary contribution is quantitative experimental findings (AUROC metrics) comparing vulnerability of DPO vs PPO-aligned models to membership inference attacks, with a novel attack framework (PREMIA) as the methodology." 
+} +\ No newline at end of file diff --git a/papers/extending-range-bugs-2022/paper_type.json b/papers/extending-range-bugs-2022/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "Proposes a bug classification framework and sketches hybrid repair algorithms conceptually, without experimental validation or formal proofs." +} +\ No newline at end of file diff --git a/papers/extensive-study-model-2023/paper_type.json b/papers/extensive-study-model-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs systematic experiments comparing code representations across APR datasets and reports quantitative results (exact match accuracy metrics), with the primary contribution being empirical findings about representation effectiveness." +} +\ No newline at end of file diff --git a/papers/extracting-fix-ingredients-2025/paper_type.json b/papers/extracting-fix-ingredients-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experiments on existing benchmarks (TSSB-3M, Defects4J), proposes and evaluates a method (ScanFix), and reports quantitative performance improvements (7-31% relative gain)." +} +\ No newline at end of file diff --git a/papers/f2a-innovative-approach-2024/paper_type.json b/papers/f2a-innovative-approach-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces a novel attack method (F2A) and reports quantitative experimental results testing it across 9 LLMs with concrete success rates, plus validation of a proposed defense mechanism." 
+} +\ No newline at end of file diff --git a/papers/factool-factuality-detection-2023/paper_type.json b/papers/factool-factuality-detection-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces FacTool framework and reports quantitative experimental results (F1 scores, performance comparisons) across multiple domains, with the primary contribution being empirical findings about factuality detection effectiveness." +} +\ No newline at end of file diff --git a/papers/failure-modes-llm-2025/paper_type.json b/papers/failure-modes-llm-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "The paper proposes a conceptual taxonomy and argues that LLM reliability should be framed as a systems-engineering problem, making prescriptive design claims without experimental validation." +} +\ No newline at end of file diff --git a/papers/fair-comprehensive-evaluation-2026/paper_type.json b/papers/fair-comprehensive-evaluation-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The paper introduces RouterXBench, a three-dimensional evaluation framework as its primary contribution, and uses baseline methods and experiments to validate the framework's design." +} +\ No newline at end of file diff --git a/papers/fairmindsim-alignment-behavior-2024/paper_type.json b/papers/fairmindsim-alignment-behavior-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs controlled experiments with 100 human participants and multiple LLM models in an economic game, reporting quantitative results on behavior, rejection rates, and emotional responses." 
+} +\ No newline at end of file diff --git a/papers/fara7b-efficient-agentic-2025/paper_type.json b/papers/fara7b-efficient-agentic-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Fara-7B's performance results and data scaling trends are the primary contributions; WebTailBench is a supporting evaluation tool." +} +\ No newline at end of file diff --git a/papers/fast-controlled-generation-2025/paper_type.json b/papers/fast-controlled-generation-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes AWRS and validates it through extensive experiments across 5 domains, reporting quantitative speedup and accuracy results compared to baselines; the theoretical analysis of KL divergence scaling is secondary to the experimental findings." +} +\ No newline at end of file diff --git a/papers/fast-inference-from-2022/paper_type.json b/papers/fast-inference-from-2022/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The primary contribution is demonstrating empirical speedup results (2X-3X) on real models through experiments; the mathematical proof of output equivalence is supporting evidence rather than the main contribution." +} +\ No newline at end of file diff --git a/papers/faster-wind-accelerating-2024/paper_type.json b/papers/faster-wind-accelerating-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "theoretical", + "reason": "Primary contribution is establishing game-theoretic connections and proving convergence guarantees with sample complexity bounds; the WIND algorithm and empirical results are applications of these theoretical insights." 
+} +\ No newline at end of file diff --git a/papers/fath-authenticationbased-testtime-2024/paper_type.json b/papers/fath-authenticationbased-testtime-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a defense method (FATH) and evaluates it experimentally across models and attack types, reporting quantitative attack success rates and comparisons with existing defenses." +} +\ No newline at end of file diff --git a/papers/feabench-benchmark-evaluating-2025/paper_type.json b/papers/feabench-benchmark-evaluating-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces FEA-Bench, a new benchmark dataset with 1,401 repository-level code generation tasks; while baseline experiments are run, the primary contribution is the benchmark itself rather than experimental findings." +} +\ No newline at end of file diff --git a/papers/featbench-more-realistic-2025/paper_type.json b/papers/featbench-more-realistic-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "FeatBench is a new evolving benchmark with 157 tasks for feature-level code generation; experiments serve to establish baselines and demonstrate the benchmark's utility, but the benchmark itself is the primary contribution." +} +\ No newline at end of file diff --git a/papers/featurizeddecomposition-join-lowcost-2025/paper_type.json b/papers/featurizeddecomposition-join-lowcost-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "theoretical", + "reason": "The paper's primary contribution is a formal method (FDJ) with statistical precision and recall guarantees backed by logical decompositions (CNF), with experimental validation serving as supporting evidence rather than the main contribution." 
+} +\ No newline at end of file diff --git a/papers/federate-router-learning-2026/paper_type.json b/papers/federate-router-learning-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper's primary contribution is the empirical validation of a federated routing framework across multiple benchmarks (RouterBench-Data and ProxRouter-Data), demonstrating quantitative improvements in the accuracy-cost frontier; theoretical guarantees are supporting analysis rather than the main contribution." +} +\ No newline at end of file diff --git a/papers/finegrained-analysis-brainllm-2025/paper_type.json b/papers/finegrained-analysis-brainllm-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts fine-grained experiments measuring input attribution patterns in brain-LLM alignment, reporting quantitative results (IoU values, bias patterns, layer-wise comparisons) as primary contributions." +} +\ No newline at end of file diff --git a/papers/finetuned-large-language-2025/paper_type.json b/papers/finetuned-large-language-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experiments comparing MDAgent to manual methods, reports quantitative performance metrics (42.22% time reduction, MAE/MSE improvements), and evaluates a fine-tuned model on expert-scored code generation tasks." +} +\ No newline at end of file diff --git a/papers/first-look-at-2024/paper_type.json b/papers/first-look-at-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces LiCoEval, the first benchmark for evaluating LLM license compliance; empirical results on 14 models are baseline validations of the benchmark itself, not the primary contribution." 
+} +\ No newline at end of file diff --git a/papers/five-fatal-assumptions-2026/paper_type.json b/papers/five-fatal-assumptions-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "The paper critiques T-shirt sizing methodology for AI projects and proposes an alternative framework (Checkpoint Sizing) without reporting original experimental validation." +} +\ No newline at end of file diff --git a/papers/fixing-7400-bugs-2025/paper_type.json b/papers/fixing-7400-bugs-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces WILLIAMT tool and reports quantitative experimental results (46.1% fix rate, cost comparisons, model performance metrics) on an existing bug dataset, with the primary contribution being empirical findings about cost-effective crash-site repair." +} +\ No newline at end of file diff --git a/papers/flockvote-llmempowered-agentbased-2025/paper_type.json b/papers/flockvote-llmempowered-agentbased-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs agent-based simulations of the 2024 election with quantitative results (prediction accuracy, sensitivity measurements) and the primary contribution is empirical findings about LLM agent behavior (bias, prompt sensitivity, positional instability)." +} +\ No newline at end of file diff --git a/papers/floodbrain-flood-disaster-2023/paper_type.json b/papers/floodbrain-flood-disaster-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Presents a RAG system (FloodBrain) and reports quantitative experimental results comparing GPT-4-generated reports against human baselines using G-EVAL and ROUGE metrics, with ablation studies on pipeline components." 
+} +\ No newline at end of file diff --git a/papers/flowsteer-interactive-agentic-2026/paper_type.json b/papers/flowsteer-interactive-agentic-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a reinforcement learning framework (CWRPO) and validates it experimentally across 12 benchmarks, reporting quantitative improvements over multiple baselines—the primary contribution is experimental findings, not the benchmark itself." +} +\ No newline at end of file diff --git a/papers/following-autoregressive-nature-2025/paper_type.json b/papers/following-autoregressive-nature-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes AutoRegEmbed method and validates it experimentally on 10 STS benchmarks with quantitative results and ablation studies showing performance improvements." +} +\ No newline at end of file diff --git a/papers/forgetful-but-faithful-2025/paper_type.json b/papers/forgetful-but-faithful-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The paper's primary contributions are MaRS (a typed memory architecture) and FiFA (a benchmark for measuring privacy-aware agent behavior), with experiments serving to validate and demonstrate the benchmark rather than being the main findings." +} +\ No newline at end of file diff --git a/papers/forgetting-forget-attention-2025/paper_type.json b/papers/forgetting-forget-attention-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments on established benchmarks (MUSE-Books, MUSE-News, WMDP) to demonstrate a backdoor vulnerability in LLM unlearning and validate the attack mechanism across multiple methods and settings." 
+} +\ No newline at end of file diff --git a/papers/formal-verification-llmgenerated-2025/paper_type.json b/papers/formal-verification-llmgenerated-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper experimentally evaluates Astrogator across 6 LLMs on 1260 generated programs, reporting quantitative metrics for verification accuracy and LLM code correctness as its primary contribution." +} +\ No newline at end of file diff --git a/papers/formalizing-benchmarking-prompt-2023/paper_type.json b/papers/formalizing-benchmarking-prompt-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The primary contribution is systematic experimental evaluation showing quantitative results (ASV scores) of attacks and defenses across multiple models and tasks, with empirical findings about effectiveness." +} +\ No newline at end of file diff --git a/papers/formulaone-prompting-adaptive-2026/paper_type.json b/papers/formulaone-prompting-adaptive-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a prompting method and validates it through experiments on 4 math benchmarks across 5 models, with quantitative comparisons to baselines and ablation studies; primary contribution is experimental findings, not the method alone." +} +\ No newline at end of file diff --git a/papers/foundational-automatic-evaluators-2025/paper_type.json b/papers/foundational-automatic-evaluators-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces FARE evaluators and reports quantitative experimental results across 7 benchmarks, with performance comparisons and downstream application metrics as the primary contribution." 
+} +\ No newline at end of file diff --git a/papers/fragility-benchmark-contamination-2025/paper_type.json b/papers/fragility-benchmark-contamination-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The primary contribution is experimental demonstration of detection method fragility through quantitative measurements (AUROC drops, performance inflation) and testing across 10 detection methods, with theoretical analysis as supporting investigation of the mechanism." +} +\ No newline at end of file diff --git a/papers/from-benchmarks-business-2025/paper_type.json b/papers/from-benchmarks-business-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper evaluates an agent system (CUGA) on existing benchmarks (WebArena, AppWorld) and a real-world production pilot, reporting quantitative performance metrics as its primary contribution." +} +\ No newline at end of file diff --git a/papers/from-code-courtroom-2025/paper_type.json b/papers/from-code-courtroom-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "This is a meta-analysis reviewing 16 existing studies on LLM-as-a-Judge in software engineering, synthesizing limitations and proposing a research roadmap rather than conducting original experiments." +} +\ No newline at end of file diff --git a/papers/from-code-generation-2025/paper_type.json b/papers/from-code-generation-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes an AI testing system and validates it through quantitative experiments on benchmark datasets (SIR) and a user study with measurable outcomes (31.2% accuracy improvement, 12.6% test coverage increase, 10.5% acceptance rate), with experimental findings as the primary contribution." 
+} +\ No newline at end of file diff --git a/papers/from-evaluation-enhancement-2025/paper_type.json b/papers/from-evaluation-enhancement-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces ZK-Eval benchmark to reveal performance gaps, then proposes and extensively validates ZK-Coder method with quantitative improvements (20%→88%, 28%→98%) and ablation studies—the primary contribution is the experimental findings on method effectiveness." +} +\ No newline at end of file diff --git a/papers/from-firewalls-frontiers-2025/paper_type.json b/papers/from-firewalls-frontiers-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "The paper argues that AI red-teaming is a domain-specific evolution of cyber red-teaming and proposes a prescriptive framework for how both disciplines should evolve, without conducting experiments or introducing new benchmarks." +} +\ No newline at end of file diff --git a/papers/from-fluent-verifiable-2026/paper_type.json b/papers/from-fluent-verifiable-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "The paper identifies failure modes in deep research agents and proposes a conceptual framework (AAR standard with metrics and semantic provenance architecture) with prescriptive claims, but lacks experimental validation or benchmark runs." +} +\ No newline at end of file diff --git a/papers/from-gains-strains-2025/paper_type.json b/papers/from-gains-strains-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts an observational study (N=442) with quantitative statistical modeling (R²=0.398) to empirically test relationships between GenAI adoption, job demands/resources, and developer burnout outcomes." 
+} +\ No newline at end of file diff --git a/papers/from-helpfulness-toxic-2026/paper_type.json b/papers/from-helpfulness-toxic-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Tests 10 SOTA models to measure and analyze misalignment rates across different conditions, reporting quantitative findings about toxic proactivity rather than proposing a new benchmark as the primary contribution." +} +\ No newline at end of file diff --git a/papers/from-horizontal-layering-2026/paper_type.json b/papers/from-horizontal-layering-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "Proposes a conceptual framework (Human-AI Collaboration Efficacy) and argues for vertical integration over horizontal layering using qualitative case studies with counterfactual efficiency estimates, not experimental validation." +} +\ No newline at end of file diff --git a/papers/from-llm-reasoning-2025/paper_type.json b/papers/from-llm-reasoning-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Comprehensive review that taxonomizes 60+ existing benchmarks, reviews agent frameworks, catalogs applications, and surveys communication protocols across the field." +} +\ No newline at end of file diff --git a/papers/from-poisoned-aware-2025/paper_type.json b/papers/from-poisoned-aware-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes an RL-based training framework and validates it through experiments, reporting quantitative results (AWARENESS@5, attack reduction, detection accuracy) across five backdoor types and six baselines." 
+} +\ No newline at end of file diff --git a/papers/from-single-multiagent-2026/paper_type.json b/papers/from-single-multiagent-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments comparing a new multi-agent system (GenomAgent) to an existing baseline (GeneGPT), reporting quantitative performance improvements and a reproducibility analysis on an existing benchmark—the primary contribution is experimental findings, not the benchmark itself." +} +\ No newline at end of file diff --git a/papers/from-task-solving-2026/paper_type.json b/papers/from-task-solving-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experiments on LLM agents under various robustness conditions (partial observability, noise, non-stationarity) and reports quantitative findings about performance gaps and model behavior, with the benchmark as the evaluation tool rather than the primary contribution." +} +\ No newline at end of file diff --git a/papers/frontier-models-in-context-scheming-2024/paper_type.json b/papers/frontier-models-in-context-scheming-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper conducts experiments on frontier LLMs under controlled conditions to demonstrate their capability for in-context scheming, reporting quantitative findings about model behaviors (covert subversion, deception, alignment faking)." +} +\ No newline at end of file diff --git a/papers/frontiermath-benchmark-evaluating-2024/paper_type.json b/papers/frontiermath-benchmark-evaluating-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "FrontierMath is a new benchmark dataset of expert-crafted mathematics problems designed to evaluate AI reasoning, with evaluation results presented to demonstrate the benchmark's value rather than as the primary contribution." 
+} +\ No newline at end of file diff --git a/papers/fundamental-language-models-2025/paper_type.json b/papers/fundamental-language-models-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs controlled experiments across model sizes with quantitative measurements (linear regression, Mann-Whitney U tests) to empirically demonstrate scaling relationships between model size and linguistic competence." +} +\ No newline at end of file diff --git a/papers/fundamental-limits-gametheoretic-2025/paper_type.json b/papers/fundamental-limits-gametheoretic-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "theoretical", + "reason": "Establishes necessary and sufficient conditions and proves an impossibility theorem (Theorem 5.1) about game-theoretic LLM alignment through formal mathematical analysis." +} +\ No newline at end of file diff --git a/papers/funtuning-characterizing-vulnerability-2025/paper_type.json b/papers/funtuning-characterizing-vulnerability-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Empirically demonstrates vulnerability in proprietary LLM fine-tuning interfaces through optimization attacks with quantitative success rates (65-82%) on Gemini models, primary contribution is experimental findings." +} +\ No newline at end of file diff --git a/papers/future-software-reuse-2025/paper_type.json b/papers/future-software-reuse-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "Proposes a conceptual framework (generative reuse) and 16-question research agenda for AI-native software engineering, reviewing existing productivity studies but conducting no original experiments or formal analysis." 
+} +\ No newline at end of file diff --git a/papers/fuzz4all-universal-fuzzing-2023/paper_type.json b/papers/fuzz4all-universal-fuzzing-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments across six languages and nine systems, reporting quantitative results (36.8% higher coverage, 98 bugs found) as the primary contribution." +} +\ No newline at end of file diff --git a/papers/fveval-understanding-language-2024/paper_type.json b/papers/fveval-understanding-language-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The paper introduces FVEval, a first-of-its-kind comprehensive benchmark for evaluating LLMs on hardware formal verification with three sub-tasks and 571 test instances; while it includes baseline experiments, the primary contribution is the benchmark itself." +} +\ No newline at end of file diff --git a/papers/gaia-benchmark-general-2023/paper_type.json b/papers/gaia-benchmark-general-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "GAIA introduces a new 466-question benchmark for evaluating general AI assistants; while the paper includes baseline experiments, the primary contribution is the benchmark itself and its difficulty level framework." +} +\ No newline at end of file diff --git a/papers/gamma-revisiting-templatebased-2023/paper_type.json b/papers/gamma-revisiting-templatebased-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments on existing benchmarks (Defects4J, QuixBugs) and reports quantitative results showing GAMMA fixes more bugs with higher precision than compared APR techniques, with generalization tests across models." 
+} +\ No newline at end of file diff --git a/papers/gate-integrated-assessment-2025/paper_type.json b/papers/gate-integrated-assessment-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "theoretical", + "reason": "Primary contribution is GATE, a formal integrated assessment model with mathematical modules for compute, automation, and macroeconomic dynamics, not experimental validation or empirical benchmarking." +} +\ No newline at end of file diff --git a/papers/gdpval-evaluating-ai-2025/paper_type.json b/papers/gdpval-evaluating-ai-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Primary contribution is GDPval, a new evaluation framework with 1,320 real-world economically valuable tasks; model performance results are baseline demonstrations of the benchmark." +} +\ No newline at end of file diff --git a/papers/gemma-2-improving-2024/paper_type.json b/papers/gemma-2-improving-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces new language models and reports experimental results on benchmark performance and safety evaluations, with the primary contribution being quantitative findings from model comparisons." +} +\ No newline at end of file diff --git a/papers/generative-agents-interactive-2023/paper_type.json b/papers/generative-agents-interactive-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a generative agent architecture and validates it through controlled experiments with quantitative comparisons against ablated versions and human baselines (TrueSkill scores, network metrics, behavioral emergence)." +} +\ No newline at end of file diff --git a/papers/generative-ai-at-2023/paper_type.json b/papers/generative-ai-at-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs RCT and observational experiments measuring quantitative productivity and sentiment outcomes with generative AI assistance." 
+} +\ No newline at end of file diff --git a/papers/generative-ai-computational-2025/paper_type.json b/papers/generative-ai-computational-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "Perspective paper that surveys existing generative AI methods and proposes a prescriptive roadmap for computational chemistry without presenting new experimental results or validation." +} +\ No newline at end of file diff --git a/papers/generative-ai-construction-2024/paper_type.json b/papers/generative-ai-construction-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "The paper is primarily a systematic review and meta-analysis of generative AI in construction, with a Delphi panel synthesizing expert views and a supporting case study." +} +\ No newline at end of file diff --git a/papers/generative-ai-paradox-2026/paper_type.json b/papers/generative-ai-paradox-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "Proposes a conceptual framework (synthetic reality stack) and argues a prescriptive viewpoint about GenAI risks without experimental validation or mathematical proofs." +} +\ No newline at end of file diff --git a/papers/generative-ai-pull-2024/paper_type.json b/papers/generative-ai-pull-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Observational study with propensity score weighting on 18,256 GitHub PRs reporting quantitative impacts (19.3-hour review time reduction, 1.57× merge likelihood increase) and qualitative analysis of developer interventions." +} +\ No newline at end of file diff --git a/papers/generative-ai-software-2024/paper_type.json b/papers/generative-ai-software-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "This narrative review surveys existing AI coding tools and synthesizes findings from secondary sources without conducting original empirical evaluation." 
+} +\ No newline at end of file diff --git a/papers/generative-ai-software-2025/paper_type.json b/papers/generative-ai-software-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "This is a mapping literature review synthesizing findings from 46 studies on GenAI in software architecture; the primary contribution is a structured review and synthesis of existing work, not new experiments or benchmarks." +} +\ No newline at end of file diff --git a/papers/genetic-instruct-scaling-2024/paper_type.json b/papers/genetic-instruct-scaling-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper describes a method for synthetic instruction generation and reports quantitative experimental results (69.7% accuracy on HumanEval/MBPP) comparing the proposed approach against baselines, with the primary contribution being empirical findings rather than the dataset or benchmark itself." +} +\ No newline at end of file diff --git a/papers/geoanalystbench-geoai-benchmark-2025/paper_type.json b/papers/geoanalystbench-geoai-benchmark-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces GeoAnalystBench, a new 50-task Python-based GIS benchmark for evaluating LLMs on spatial analysis; while baselines are evaluated, the primary contribution is the benchmark framework itself." +} +\ No newline at end of file diff --git a/papers/geocodegpt-large-language-2024/paper_type.json b/papers/geocodegpt-large-language-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper fine-tunes an existing model and reports quantitative performance improvements across multiple metrics on geospatial code tasks; the primary contribution is the experimental findings demonstrating the effectiveness of the fine-tuned GeoCode-GPT model." 
+} +\ No newline at end of file diff --git a/papers/geometry-thought-how-2026/paper_type.json b/papers/geometry-thought-how-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Reports quantitative experimental findings measuring how LLM reasoning representations reorganize across scales (dimensionality collapse %, alignment increases, coherence values) and domains, with primary contribution being empirical discovery of geometric patterns rather than a new benchmark or theoretical analysis." +} +\ No newline at end of file diff --git a/papers/getting-more-juice-2024/paper_type.json b/papers/getting-more-juice-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes algorithms (RFT and IRFT) and demonstrates their effectiveness through experiments (59.48% → 61.03% improvement on HuggingFace Open LLM Leaderboard), with theoretical convergence analysis as supporting evidence rather than the primary contribution." +} +\ No newline at end of file diff --git a/papers/gitbugs-bug-reports-2025/paper_type.json b/papers/gitbugs-bug-reports-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "GitBugs is a new dataset of 150,000+ bug reports; the experiments (ARIMA forecasting, classification, duplicate detection) are case studies demonstrating the dataset's utility for various tasks, making the dataset itself the primary contribution." +} +\ No newline at end of file diff --git a/papers/gittaskbench-2025/paper_type.json b/papers/gittaskbench-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces GitTaskBench, a new 54-task benchmark for evaluating code agents across 7 domains; the benchmark itself is the primary contribution, with experimental evaluation of baselines serving to demonstrate its utility." 
+} +\ No newline at end of file diff --git a/papers/give-llms-security-2025/paper_type.json b/papers/give-llms-security-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes CodeGuarder framework and validates it through quantitative experiments on CyberSecEval benchmark across multiple LLMs and languages, reporting security improvements and functional correctness metrics." +} +\ No newline at end of file diff --git a/papers/give-positive-review-2025/paper_type.json b/papers/give-positive-review-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experiments testing prompt injection attacks and defenses on multiple AI models, reporting quantitative results on attack effectiveness and defense detection rates." +} +\ No newline at end of file diff --git a/papers/glad-neural-predicate-2022/paper_type.json b/papers/glad-neural-predicate-2022/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Reports experimental results and quantitative findings from running GLAD on Defects4J benchmarks, measuring fault fixes and analyzing performance through ablation studies; the primary contribution is the experimental validation of the approach, not a new benchmark or dataset." +} +\ No newline at end of file diff --git a/papers/glimprouter-efficient-collaborative-2026/paper_type.json b/papers/glimprouter-efficient-collaborative-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes GlimpRouter, a routing method for collaborative inference, and validates it experimentally with quantitative results (51.67% accuracy, 25.9% latency reduction) on AIME25." 
+} +\ No newline at end of file diff --git a/papers/glmdialog-noisetolerant-pretraining-2023/paper_type.json b/papers/glmdialog-noisetolerant-pretraining-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes GLM-Dialog model and reports experimental results (quantitative metrics and human evaluations) comparing it against baselines, with ablation studies validating design choices." +} +\ No newline at end of file diff --git a/papers/goal-alignment-llmbased-2025/paper_type.json b/papers/goal-alignment-llmbased-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a User Goal State Tracking framework and three-stage training methodology, then validates it through experiments showing 14.1% absolute improvement in goal alignment with human evaluation confirmation—the primary contribution is the experimental findings." +} +\ No newline at end of file diff --git a/papers/goalguided-generative-prompt-2024/paper_type.json b/papers/goalguided-generative-prompt-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes an attack method (G2PIA) and primarily contributes experimental findings demonstrating its effectiveness across 7 LLMs and 4 datasets, with theoretical analysis serving as supporting justification for the approach." +} +\ No newline at end of file diff --git a/papers/good-bad-exploring-2024/paper_type.json b/papers/good-bad-exploring-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experimental attacks on RAG systems and reports quantitative vulnerability rates (~50% extraction) and comparative effectiveness of mitigation strategies, with primary contribution being empirical findings rather than a new benchmark, survey, or theoretical analysis." 
+} +\ No newline at end of file diff --git a/papers/gorilla-large-language-2023/paper_type.json b/papers/gorilla-large-language-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The primary contribution is experimental: demonstrating that a finetuned model (Gorilla) outperforms GPT-4 on API call generation with quantitative results across multiple benchmarks, though it also introduces APIBench as a supporting resource." +} +\ No newline at end of file diff --git a/papers/gpqa-graduatelevel-googleproof-2023/paper_type.json b/papers/gpqa-graduatelevel-googleproof-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces GPQA, a new graduate-level multiple-choice benchmark dataset; experiments with experts and models validate the benchmark's difficulty and utility rather than constituting the primary contribution." +} +\ No newline at end of file diff --git a/papers/gpt4-technical-report-2023/paper_type.json b/papers/gpt4-technical-report-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper reports quantitative experimental results evaluating GPT-4's performance across professional and academic benchmarks, analyzes scaling laws empirically, and measures post-training effects with concrete metrics (exam scores, safety improvements, calibration changes)." +} +\ No newline at end of file diff --git a/papers/gptbased-code-review-2024/paper_type.json b/papers/gptbased-code-review-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Presents experimental evaluation of a GPT-4 code review system with quantitative results (error detection rates, performance metrics, API cost reductions) and expert user evaluation via survey." 
+} +\ No newline at end of file diff --git a/papers/gptoss-good-comprehensive-2025/paper_type.json b/papers/gptoss-good-comprehensive-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments evaluating existing GPT-OSS models on established benchmarks (MMLU, SciQ, C-Eval) and reports quantitative comparative results with statistical significance testing; the primary contribution is the experimental findings, not the benchmarks themselves." +} +\ No newline at end of file diff --git a/papers/gpts-are-gpts-labor-market-2023/paper_type.json b/papers/gpts-are-gpts-labor-market-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper conducts original empirical analysis by annotating O*NET occupation data with human and GPT-4 labels to estimate task exposure and reports quantitative findings on labor market impact, with supporting correlation analyses." +} +\ No newline at end of file diff --git a/papers/grading-scale-impact-2026/paper_type.json b/papers/grading-scale-impact-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs controlled experiments across six benchmarks and six LLM judges, measuring human-LLM alignment (ICC = 0.853) across different grading scales with quantitative results as the primary contribution." +} +\ No newline at end of file diff --git a/papers/granite-code-models-2024/paper_type.json b/papers/granite-code-models-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper's primary contribution is experimental evaluation of the Granite Code models, reporting quantitative performance results across multiple benchmarks and tasks rather than creating new benchmarks or introducing novel methodology." 
+} +\ No newline at end of file diff --git a/papers/graphbased-agent-memory-2026/paper_type.json b/papers/graphbased-agent-memory-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Provides a comprehensive taxonomy and synthesis of existing graph-based agent memory techniques, with categorization of storage types, retrieval paradigms, and benchmark mapping across the literature." +} +\ No newline at end of file diff --git a/papers/graphcodeagent-dual-graphguided-2025/paper_type.json b/papers/graphcodeagent-dual-graphguided-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a new method and validates it experimentally on the DevEval benchmark, reporting quantitative results (58.14% Pass@1, 43.81% relative improvement) and ablation studies." +} +\ No newline at end of file diff --git a/papers/greencode-learning-optimize-2025/paper_type.json b/papers/greencode-learning-optimize-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes an RL-based method for energy-efficient code generation and validates it through experiments on existing benchmarks (JavaCorpus, PY150), reporting quantitative energy consumption improvements (23-50%)." +} +\ No newline at end of file diff --git a/papers/greenserv-energyefficient-contextaware-2026/paper_type.json b/papers/greenserv-energyefficient-contextaware-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes GreenServ routing framework and validates it experimentally across 5 benchmarks and 16 LLMs, reporting quantitative results (22% accuracy improvement, 31% energy reduction)." 
+} +\ No newline at end of file diff --git a/papers/grok-4-model-2025/paper_type.json b/papers/grok-4-model-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Reports quantitative evaluation results of Grok 4 on multiple safety and capability benchmarks (MASK, AgentHarm, BioLP-Bench, VCT) with specific performance metrics; primary contribution is experimental findings." +} +\ No newline at end of file diff --git a/papers/grokking-generalization-beyond-2022/paper_type.json b/papers/grokking-generalization-beyond-2022/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs controlled experiments on neural networks with quantitative measurements of the grokking phenomenon (generalization timing, training steps, weight decay effects), making experimental findings the primary contribution." +} +\ No newline at end of file diff --git a/papers/grokking-modular-arithmetic-2023/paper_type.json b/papers/grokking-modular-arithmetic-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "theoretical", + "reason": "Provides analytic weight solutions and formal mathematical characterization of grokking through Fourier feature analysis rather than introducing experiments or benchmarks." +} +\ No newline at end of file diff --git a/papers/gsmplus-comprehensive-benchmark-2024/paper_type.json b/papers/gsmplus-comprehensive-benchmark-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "GSM-Plus is a new benchmark introducing perturbation-based evaluation of mathematical reasoning; the primary contribution is the benchmark framework itself, with experimental results demonstrating its utility." 
+} +\ No newline at end of file diff --git a/papers/gspr-aligning-llm-2025/paper_type.json b/papers/gspr-aligning-llm-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes a training method (GSPR) and validates it with quantitative experiments across 8 safety benchmarks, reporting specific accuracy metrics and comparative improvements as primary contributions." +} +\ No newline at end of file diff --git a/papers/guardian-multitiered-defense-2024/paper_type.json b/papers/guardian-multitiered-defense-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes and experimentally validates a defense system (GUARDIAN) against prompt injection attacks, reporting quantitative blocking rates across defense layers and model comparisons." +} +\ No newline at end of file diff --git a/papers/guiding-llms-right-2024/paper_type.json b/papers/guiding-llms-right-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper introduces DOMINO, a constrained decoding algorithm, and validates it through experiments on benchmarks (GSM8K), reporting quantitative results on speedup (up to 2.71x) and accuracy impacts compared to existing methods." +} +\ No newline at end of file diff --git a/papers/hacking-back-aihacker-2024/paper_type.json b/papers/hacking-back-aihacker-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Tests a defensive framework (Mantis) experimentally against LLM cyberattack agents, reporting quantitative success metrics (95.4%) across multiple challenges and agents." 
+} +\ No newline at end of file diff --git a/papers/hafixagent-historyaware-automated-2025/paper_type.json b/papers/hafixagent-historyaware-automated-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces HAFixAgent with history-aware APR strategies and reports quantitative experimental results (71.1%, 212.3% improvement) on the existing Defects4J benchmark, with the primary contribution being the experimental validation of the approach." +} +\ No newline at end of file diff --git a/papers/haieval-measuring-humanai-2025/paper_type.json b/papers/haieval-measuring-humanai-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts a rigorous within-subject RCT with 45 participants measuring quantitative outcomes of human-AI collaboration, with primary contributions being experimental findings about synergy effects and developer behavior rather than the benchmark methodology itself." +} +\ No newline at end of file diff --git a/papers/hairtrigger-alignment-blackbox-2026/paper_type.json b/papers/hairtrigger-alignment-blackbox-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "theoretical", + "reason": "The primary contribution is a formal proof that static black-box evaluation fundamentally cannot distinguish post-update-robust from post-update-fragile models, with empirical validation as supporting evidence." +} +\ No newline at end of file diff --git a/papers/hallucination-consensus-multiagent-2025/paper_type.json b/papers/hallucination-consensus-multiagent-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Presents a multi-agent LLM framework (CANDOR) with quantitative experimental validation on existing benchmarks (HumanEvalJava, LeetCodeJava), demonstrating improvements over baselines in coverage, mutation score, and oracle correctness." 
+} +\ No newline at end of file diff --git a/papers/hallulens-llm-hallucination-2025/paper_type.json b/papers/hallulens-llm-hallucination-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The primary contribution is HalluLens, a new hallucination benchmark with a taxonomy and three dynamically-generated evaluation tasks; the LLM experiments are baselines demonstrating the benchmark's utility." +} +\ No newline at end of file diff --git a/papers/haps-hierarchical-llm-2026/paper_type.json b/papers/haps-hierarchical-llm-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes HAPS method and validates it through experiments on HotpotQA and MMLU benchmarks, reporting quantitative F1 improvements (1.6-3.6%) and ablation studies on parameter sharing." +} +\ No newline at end of file diff --git a/papers/hardtests-synthesizing-highquality-2025/paper_type.json b/papers/hardtests-synthesizing-highquality-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces HardTestGen, a pipeline for synthesizing high-quality test cases for evaluating LLM code generation, with validation experiments showing superior precision and recall compared to existing datasets." +} +\ No newline at end of file diff --git a/papers/hardware-security-benchmarking-2024/paper_type.json b/papers/hardware-security-benchmarking-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Proposes a new security-focused evaluation suite for LLM-generated HDL code with preliminary empirical validation across models and scenarios." 
+} +\ No newline at end of file diff --git a/papers/harmtransform-transforming-explicit-2025/paper_type.json b/papers/harmtransform-transforming-explicit-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper introduces HarmTransform framework and validates it through experiments with quantitative results (attack effectiveness, intent preservation metrics), ablation studies, and comparative baselines." +} +\ No newline at end of file diff --git a/papers/harnessing-language-coordination-2024/paper_type.json b/papers/harnessing-language-coordination-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Primary contribution is quantitative experimental findings from testing LLM coordination capabilities (model performance comparisons, human-machine collaboration analysis, capability limitations), not the benchmark design itself." +} +\ No newline at end of file diff --git a/papers/harnessing-large-language-2025/paper_type.json b/papers/harnessing-large-language-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes CuRev curation pipeline and reports quantitative experimental results (clarity, civility, BLEU/CodeBLEU scores) demonstrating improved downstream model performance on curated data, with primary contribution being the empirical validation of the approach." +} +\ No newline at end of file diff --git a/papers/haven-hallucinationmitigated-llm-2025/paper_type.json b/papers/haven-hallucinationmitigated-llm-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes a methodology to mitigate hallucinations in Verilog code generation and validates it through experiments on VerilogEval-Human, reporting quantitative improvements (61.1% pass@1, 6.7pp gain over baseline), making the experimental findings the primary contribution." 
+} +\ No newline at end of file diff --git a/papers/hazard-analysis-framework-2022/paper_type.json b/papers/hazard-analysis-framework-2022/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "Proposes a hazard analysis framework adapted from safety engineering for code synthesis LLMs and uses qualitative analysis to argue about risks and hazards, without experimental validation or quantitative benchmarking." +} +\ No newline at end of file diff --git a/papers/hcast-humancalibrated-autonomy-2025/paper_type.json b/papers/hcast-humancalibrated-autonomy-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The primary contribution is HCAST, a new benchmark dataset of 189 autonomy tasks calibrated against human performance baselines; agent evaluations are provided as validation of the benchmark's difficulty calibration." +} +\ No newline at end of file diff --git a/papers/hearsay-benchmark-do-2026/paper_type.json b/papers/hearsay-benchmark-do-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces HearSay Benchmark, a new evaluation framework for assessing privacy leakage in audio LLMs, with baseline results across 13 models." +} +\ No newline at end of file diff --git a/papers/heterogeneous-multiagent-reinforcement-2024/paper_type.json b/papers/heterogeneous-multiagent-reinforcement-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a new method (SHPPO) and validates it through experiments on established benchmarks (SMAC, GRF) with quantitative comparisons to baselines and ablation studies." 
+} +\ No newline at end of file diff --git a/papers/hidden-dangers-browsing-2025/paper_type.json b/papers/hidden-dangers-browsing-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "The paper proposes a prescriptive defense-in-depth framework and argues security risks in Browser Use based on qualitative case-study analysis, rather than quantitative benchmarking or broad empirical validation." +} +\ No newline at end of file diff --git a/papers/hidden-dimensions-llm-2025/paper_type.json b/papers/hidden-dimensions-llm-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "theoretical", + "reason": "Primary contribution is formal mathematical analysis of LLM alignment via SVD-based orthogonal direction decomposition and formal interpretation methods (Layer-wise Relevance Propagation), with empirical validation of these theoretical properties." +} +\ No newline at end of file diff --git a/papers/hidden-progress-deep-2022/paper_type.json b/papers/hidden-progress-deep-2022/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "theoretical", + "reason": "Provides formal mathematical proofs of SGD convergence on parity learning and analyzes the mechanism (Fourier gap amplification) governing learning dynamics." +} +\ No newline at end of file diff --git a/papers/hidden-risks-llm-web-code-2025/paper_type.json b/papers/hidden-risks-llm-web-code-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Evaluates security of code generated by multiple LLMs through systematic testing across 48 security parameters, reporting quantitative experimental findings about failure rates and vulnerabilities across models." 
+} +\ No newline at end of file diff --git a/papers/hiding-ai-traffic-2025/paper_type.json b/papers/hiding-ai-traffic-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Demonstrates a novel MCP-based C2 attack via case study with quantitative results on time-to-compromise and detection evasion metrics." +} +\ No newline at end of file diff --git a/papers/hierarchical-document-refinement-2025/paper_type.json b/papers/hierarchical-document-refinement-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes LongRefiner system and validates it through experiments across seven QA datasets with quantitative results (performance, token efficiency, latency), ablation studies, and scaling analysis." +} +\ No newline at end of file diff --git a/papers/hijacking-large-language-2023/paper_type.json b/papers/hijacking-large-language-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a GGI attack method and reports quantitative experimental results (success rates, ASR percentages) across multiple models and datasets to validate the approach." +} +\ No newline at end of file diff --git a/papers/hintaugmented-reranking-efficient-2025/paper_type.json b/papers/hintaugmented-reranking-efficient-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Paper runs experiments with quantitative metrics (MAP, MRR) and human evaluation to validate the hint-augmented re-ranking approach for product search, making experimental findings the primary contribution." +} +\ No newline at end of file diff --git a/papers/hits-highcoverage-llmbased-2024/paper_type.json b/papers/hits-highcoverage-llmbased-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes HITS method and validates it through experiments, reporting quantitative coverage metrics and comparisons against multiple baselines on code generation tasks." 
+} +\ No newline at end of file diff --git a/papers/hogyan-igazodjunk-el-2026/paper_type.json b/papers/hogyan-igazodjunk-el-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "Reviews existing expert assessments to argue a viewpoint about AI's labor market effects using the SBTC framework, without experimental validation of its conclusions." +} +\ No newline at end of file diff --git a/papers/holistic-eval-llms-code-2025/paper_type.json b/papers/holistic-eval-llms-code-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs controlled experiments comparing multiple LLM models on 944 LeetCode problems, reporting quantitative metrics (Pass@1, error types, performance across languages) with systematic analysis of results." +} +\ No newline at end of file diff --git a/papers/holistic-framework-multimodal-2024/paper_type.json b/papers/holistic-framework-multimodal-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Paper runs experiments on 18,885 brain CT scans and reports quantitative results (FORTE F1=0.71, 74% Turing test indistinguishability) from BrainGPT; FORTE metric is a methodological contribution enabling better evaluation, but the primary contribution is the experimental findings about the model's performance." +} +\ No newline at end of file diff --git a/papers/how-ai-impacts-2026/paper_type.json b/papers/how-ai-impacts-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Reports quantitative results from a randomized controlled trial (RCT) with 52 developers measuring AI's impact on quiz scores and task completion, supplemented by qualitative analysis of interaction patterns." 
+} +\ No newline at end of file diff --git a/papers/how-alignment-jailbreak-2024/paper_type.json b/papers/how-alignment-jailbreak-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts experiments analyzing hidden states of LLMs, reporting quantitative findings (>95% classifier accuracy) about how alignment and jailbreak mechanisms work mechanistically." +} +\ No newline at end of file diff --git a/papers/how-beginning-programmers-2024/paper_type.json b/papers/how-beginning-programmers-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Controlled study of 120 students attempting 48 CS1 problems, reporting quantitative success rates and statistical tests alongside qualitative findings about mental models of Code LLMs." +} +\ No newline at end of file diff --git a/papers/how-data-mixing-2025/paper_type.json b/papers/how-data-mixing-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "theoretical", + "reason": "The paper's primary contribution is a formal asymptotic equivalence result proving that Transformers with MLPs behave like finite-degree polynomial predictors; experiments on synthetic and real data serve as validation of this theoretical result." +} +\ No newline at end of file diff --git a/papers/how-do-ai-2025/paper_type.json b/papers/how-do-ai-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts observational comparison of AI agents versus human workflows across occupations, reporting quantitative findings on performance metrics (speed, cost, quality, workflow alignment)." +} +\ No newline at end of file diff --git a/papers/how-do-data-2023/paper_type.json b/papers/how-do-data-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs a Wizard-of-Oz experimental study with 13 human participants and reports quantitative findings about how data analysts respond to planning and execution assistance." 
+} +\ No newline at end of file diff --git a/papers/how-does-controllability-2025/paper_type.json b/papers/how-does-controllability-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Investigates when and how linear steerability emerges during pretraining through experiments tracking concept representations across training checkpoints, measuring separability metrics, and generalizing findings across models." +} +\ No newline at end of file diff --git a/papers/how-efficient-llmgenerated-2024/paper_type.json b/papers/how-efficient-llmgenerated-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The primary contribution is experimental findings quantifying LLM code efficiency gaps (eff@1=0.454 for GPT-4, etc.) and analyzing algorithmic limitations, with the benchmark serving as the evaluation methodology rather than the primary contribution." +} +\ No newline at end of file diff --git a/papers/how-far-can-2024/paper_type.json b/papers/how-far-can-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs controlled experiments comparing zero-shot vs few-shot LLM repair, evaluates auxiliary information impact, and reports quantitative results (repair rates, percentage improvements/degradations) across multiple models and the proposed SRepair framework." +} +\ No newline at end of file diff --git a/papers/how-far-we-2025/paper_type.json b/papers/how-far-we-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces FINDER benchmark (100 tasks, 419 checklists) and DEFT failure taxonomy as primary contributions; evaluation of 13 systems serves to validate the framework rather than being the primary contribution." 
+} +\ No newline at end of file diff --git a/papers/how-much-does-2024/paper_type.json b/papers/how-much-does-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Randomized controlled trial with 96 engineers measuring quantitative impact of AI coding features on task completion time, primary contribution is experimental findings." +} +\ No newline at end of file diff --git a/papers/how-personnel-security-2025/paper_type.json b/papers/how-personnel-security-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "The paper argues a viewpoint about applying personnel security concepts to AI insider risk and proposes a unified taxonomy as a conceptual framework, without experimental validation." +} +\ No newline at end of file diff --git a/papers/how-safe-aigenerated-2025/paper_type.json b/papers/how-safe-aigenerated-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Large-scale experimental study quantifying security vulnerabilities in AI-generated patches across 20,000+ SWE-bench issues, with specific numerical findings comparing LLM and agentic frameworks." +} +\ No newline at end of file diff --git a/papers/how-should-i-2025/paper_type.json b/papers/how-should-i-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Decade-scale systematic review of 572 existing code benchmarks with meta-analysis and human study of researcher awareness, synthesizing findings about benchmark quality deficiencies across the field." +} +\ No newline at end of file diff --git a/papers/human-interaction-evals-llm-2024/paper_type.json b/papers/human-interaction-evals-llm-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "The paper argues that current evaluations are inadequate and proposes three prescriptive organizing principles for interactive evaluation design without reporting experimental results or formal proofs." 
+} +\ No newline at end of file diff --git a/papers/human-machine-how-2025/paper_type.json b/papers/human-machine-how-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Interview study with 20 software engineers conducting original research to understand perceptions and engagement with AI-assisted code reviews, reporting qualitative findings about cognitive, emotional, and behavioral dimensions." +} +\ No newline at end of file diff --git a/papers/humancentered-ai-product-2024/paper_type.json b/papers/humancentered-ai-product-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper validates a conceptual framework through a single case study with real data (3,440 examples), reporting qualitative findings about NC AutoML's capabilities across prototyping stages, making the empirical case study the primary evidence base rather than the framework alone." +} +\ No newline at end of file diff --git a/papers/humanevalcomm-benchmarking-communication-2024/paper_type.json b/papers/humanevalcomm-benchmarking-communication-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The paper introduces HumanEvalComm, a new benchmark for evaluating communication competence in code generation LLMs, with baseline experiments demonstrating its utility." +} +\ No newline at end of file diff --git a/papers/humanevalxl-multilingual-code-2024/paper_type.json b/papers/humanevalxl-multilingual-code-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The primary contribution is introducing HumanEval-XL, a new multilingual code generation benchmark spanning 23 natural languages and 12 programming languages; while the paper includes experimental results from running models on the benchmark, the benchmark itself is the main contribution." 
+} +\ No newline at end of file diff --git a/papers/humaninstructionfree-llm-selfalignment-2024/paper_type.json b/papers/humaninstructionfree-llm-selfalignment-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes ISARA and validates it through experiments on multiple benchmarks (BeaverTails, TruthfulQA, AlpacaEval), reporting quantitative performance improvements and data scaling ratios compared to baselines." +} +\ No newline at end of file diff --git a/papers/hybrid-automated-program-2024/paper_type.json b/papers/hybrid-automated-program-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes GIANTREPAIR method and validates it through quantitative experiments on Defects4J benchmark, comparing performance against baselines and prior APR tools." +} +\ No newline at end of file diff --git a/papers/hybridflow-resourceadaptive-subtask-2025/paper_type.json b/papers/hybridflow-resourceadaptive-subtask-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces HybridFlow system and reports quantitative results on multiple benchmarks (GPQA, MMLU-Pro, AIME24, LiveBench-Reasoning) showing accuracy, latency, and cost improvements compared to baselines." +} +\ No newline at end of file diff --git a/papers/hyperagent-generalist-software-2024/paper_type.json b/papers/hyperagent-generalist-software-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper presents a new multi-agent system and evaluates it with quantitative experiments across multiple existing benchmarks (SWE-Bench, RepoExec, Defects4J) with ablation studies, making the primary contribution experimental findings rather than a benchmark, survey, position, or theory." 
+} +\ No newline at end of file diff --git a/papers/hypergraphrag-retrievalaugmented-generation-2025-2/paper_type.json b/papers/hypergraphrag-retrievalaugmented-generation-2025-2/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The primary contribution is experimental validation showing HyperGraphRAG outperforms six baselines across five domains on multiple metrics (F1, retrieval similarity, generation evaluation) with supporting ablation studies; the theoretical proof appears to be supplementary analysis." +} +\ No newline at end of file diff --git a/papers/hypergraphrag-retrievalaugmented-generation-2025/paper_type.json b/papers/hypergraphrag-retrievalaugmented-generation-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes HyperGraphRAG and validates it through experiments across five domains with quantitative metrics (F1, retrieval similarity, generation evaluation) and ablation studies, making the primary contribution the experimental findings rather than theory or benchmarks." +} +\ No newline at end of file diff --git a/papers/hypothesis-generation-materials-2025/paper_type.json b/papers/hypothesis-generation-materials-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper develops an LLM-based hypothesis generation framework (ACCELMAT) and validates it with quantitative experimental results on the MATDESIGN benchmark, showing improvements over baselines (80% vs 70%, 89% vs 79.67%), with the primary contribution being the experimental findings rather than the benchmark itself." 
+} +\ No newline at end of file diff --git a/papers/icon-indirect-prompt-2026/paper_type.json b/papers/icon-indirect-prompt-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes ICON defense mechanism and reports quantitative experimental results (0.4% ASR, 10% UA improvement, generalization rates) across multiple benchmarks and LLM backbones, with primary contribution being empirical validation of the method." +} +\ No newline at end of file diff --git a/papers/identifying-inaccurate-descriptions-2024/paper_type.json b/papers/identifying-inaccurate-descriptions-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experiments testing comment accuracy detection methods on LLM-generated code, evaluates nine existing techniques, proposes and validates a novel 'document testing' approach with statistical results (p < 10⁻⁹, ROC-AUC 0.67)." +} +\ No newline at end of file diff --git a/papers/idgenrec-llmrecsys-alignment-2024/paper_type.json b/papers/idgenrec-llmrecsys-alignment-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes IDGenRec method and validates it through extensive experiments on 4 sequential recommendation benchmarks with 23-42% improvements over 10 baselines plus zero-shot evaluation on 6 unseen datasets." +} +\ No newline at end of file diff --git a/papers/illusion-insight-reasoning-2026/paper_type.json b/papers/illusion-insight-reasoning-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper conducts large-scale experiments analyzing 1M+ model traces and reports quantitative findings (6.31%, 1.79%, +8.41pp improvements) about reasoning shifts in language models, with primary contribution being experimental results." 
+} +\ No newline at end of file diff --git a/papers/imagebased-prompt-injection-2025/paper_type.json b/papers/imagebased-prompt-injection-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Systematically experiments with image-based prompt injection attacks on GPT-4-turbo, reporting quantitative success rates (100% visible, 64% stealth) across different parameters like font scale and prefixing strategies." +} +\ No newline at end of file diff --git a/papers/immaculate-practical-llm-2026/paper_type.json b/papers/immaculate-practical-llm-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces IMMACULATE, a novel auditing framework and methodology for detecting deviations in black-box LLM APIs; the primary contribution is the framework itself, with experiments validating its effectiveness." +} +\ No newline at end of file diff --git a/papers/impact-artificial-intelligence-2025/paper_type.json b/papers/impact-artificial-intelligence-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Narrative literature review surveying and comparing existing AI-enhanced no-code/low-code platforms from the literature without conducting original experiments or creating new benchmarks." +} +\ No newline at end of file diff --git a/papers/impact-code-language-2023/paper_type.json b/papers/impact-code-language-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments comparing code language models on bug-fixing tasks across four benchmarks, reporting quantitative results (bug counts, percentages) as its primary contribution." 
+} +\ No newline at end of file diff --git a/papers/impact-finetuning-large-2025/paper_type.json b/papers/impact-finetuning-large-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs controlled experiments comparing full fine-tuning, LoRA, and IA3 on multiple LLMs for program repair, reporting quantitative performance results and efficiency metrics." +} +\ No newline at end of file diff --git a/papers/impact-large-language-2024/paper_type.json b/papers/impact-large-language-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Uses a natural experiment (GitHub Copilot's selective language support) to measure quantitative causal effects on open-source contributions, with primary contribution being empirical findings about LLM impact." +} +\ No newline at end of file diff --git a/papers/implementing-grassroots-logic-2026/paper_type.json b/papers/implementing-grassroots-logic-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "theoretical", + "reason": "The paper proves correctness of formal operational semantics (dGLP and madGLP) with respect to nondeterministic specifications, constituting a formal analysis contribution rather than experimental validation." +} +\ No newline at end of file diff --git a/papers/importance-sampling-all-2025/paper_type.json b/papers/importance-sampling-all-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes BIS method using importance sampling/IWAE, runs experiments on CodeLlama models (7B-70B), and reports quantitative results (1.1% error, baseline comparisons) on cross-benchmark prediction tasks." 
+} +\ No newline at end of file diff --git a/papers/importanceaware-data-selection-2025/paper_type.json b/papers/importanceaware-data-selection-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes MIWV data selection method and validates it through controlled experiments on multiple models (LLaMA/LLaMA2/Qwen2.5) and datasets (Alpaca/WizardLM), demonstrating quantitative improvements over baselines and full-dataset training." +} +\ No newline at end of file diff --git a/papers/importsnare-directed-code-2025/paper_type.json b/papers/importsnare-directed-code-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Demonstrates a poison attack on RAG systems with quantitative experimental results (>50% success rate, cross-platform validation), where the primary contribution is empirical findings about attack effectiveness." +} +\ No newline at end of file diff --git a/papers/improving-automated-program-2022/paper_type.json b/papers/improving-automated-program-2022/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments quantifying how domain shift degrades APR model accuracy and validates a proposed domain adaptation framework with empirical results (13.05% improvement, up to 39.6% on target projects)." +} +\ No newline at end of file diff --git a/papers/improving-automated-secure-2025/paper_type.json b/papers/improving-automated-secure-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "Proposes a methodology for generating synthetic vulnerability datasets using LLMs with preliminary data collection, but presents no completed experiments or validated results." 
+} +\ No newline at end of file diff --git a/papers/improving-automatically-generated-2022/paper_type.json b/papers/improving-automatically-generated-2022/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experiments comparing APR tools and LLM approaches on code repair tasks, reporting quantitative results on fix rates and bug pattern analysis." +} +\ No newline at end of file diff --git a/papers/improving-factuality-reasoning-2023/paper_type.json b/papers/improving-factuality-reasoning-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experiments on existing benchmarks (GSM8K, MMLU, arithmetic, biographies) and reports quantitative improvements from multiagent debate, with primary contribution being the experimental findings on performance gains and scaling behavior." +} +\ No newline at end of file diff --git a/papers/improving-llm-general-2025/paper_type.json b/papers/improving-llm-general-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "theoretical", + "reason": "The primary contribution is a theoretical algorithm (ONPO) with a formal convergence proof achieving O(T⁻¹) duality gap bounds; empirical validation on benchmarks is secondary." +} +\ No newline at end of file diff --git a/papers/improving-llm-reasoning-2024/paper_type.json b/papers/improving-llm-reasoning-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes a multi-agent Tree-of-Thought method and reports quantitative experimental results (0.6-8.8pp improvements) on the GSM8K benchmark, making the primary contribution empirical findings rather than a new benchmark or theoretical analysis." 
+} +\ No newline at end of file diff --git a/papers/improving-llm-safety-2025/paper_type.json b/papers/improving-llm-safety-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes DOOR/W-DOOR methods and validates them experimentally, reporting quantitative results on attack success rates and utility benchmarks (Llama-3-8B, HellaSwag)." +} +\ No newline at end of file diff --git a/papers/improving-robustness-llmbased-2024/paper_type.json b/papers/improving-robustness-llmbased-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes a method for improving LLM-based TTS robustness and reports quantitative experimental results (character error rate improvement from 9.03% to 3.92%), making the primary contribution experimental findings rather than a new benchmark, survey, or theoretical analysis." +} +\ No newline at end of file diff --git a/papers/incontext-distillation-selfconsistency-2025/paper_type.json b/papers/incontext-distillation-selfconsistency-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces an in-context distillation method and validates it through quantitative experiments on ALFWorld and AppWorld, reporting specific cost reduction and accuracy metrics." +} +\ No newline at end of file diff --git a/papers/incontext-learning-learning-2025/paper_type.json b/papers/incontext-learning-learning-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Large-scale empirical study (1.89M predictions) evaluating in-context learning performance across LLMs and prompting strategies, with quantitative findings about ICL brittleness and task inconsistency." 
+} +\ No newline at end of file diff --git a/papers/indirect-prompt-injections-2025/paper_type.json b/papers/indirect-prompt-injections-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Paper evaluates a firewall defense strategy across existing IPI benchmarks with quantitative results (attack success rates, utility metrics) comparing against other defenses; benchmark critique is a secondary finding, not the primary contribution." +} +\ No newline at end of file diff --git a/papers/inducing-vulnerable-code-2025/paper_type.json b/papers/inducing-vulnerable-code-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper conducts experimental attacks on LLM coding assistants and reports quantitative results (84.29% average attack success rate, transfer rates, real-world ASR), with the primary contribution being the experimental findings about adversarial code induction." +} +\ No newline at end of file diff --git a/papers/inference-scaling-flaws-2024/paper_type.json b/papers/inference-scaling-flaws-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "theoretical", + "reason": "Proves fundamental limits of resampling-based inference scaling through mathematical analysis, deriving accuracy ceilings and optimal sample counts under cost-benefit assumptions." +} +\ No newline at end of file diff --git a/papers/inference-scaling-laws-2024/paper_type.json b/papers/inference-scaling-laws-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Primary contribution is empirical analysis of inference scaling laws with experimental comparisons (Llemma models, REBASE algorithm, FLOP measurements), supported by theoretical convergence analysis." 
+} +\ No newline at end of file diff --git a/papers/inference-time-llm-2024/paper_type.json b/papers/inference-time-llm-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts experiments on Mistral-7B across multiple domains with quantitative results (22/27 domain-behavior combinations, 12x speedup), demonstrating that Alignment Vectors enable inference-time preference tunability." +} +\ No newline at end of file diff --git a/papers/inferenceonly-prompt-projection-2026/paper_type.json b/papers/inferenceonly-prompt-projection-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "theoretical", + "reason": "The paper's primary contribution is formalizing the Safety-Prompt Alignment Trade-off (SPAT) in total variation and proving that nontrivial safety gains require distributional deviation; the proposed method is a practical application built on this theoretical foundation." +} +\ No newline at end of file diff --git a/papers/information-capacity-evaluating-2025/paper_type.json b/papers/information-capacity-evaluating-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The paper introduces 'information capacity' as a novel evaluation metric/framework for measuring LLM efficiency via text compression; experiments on 52 models serve to validate and demonstrate the metric's utility across model scales." +} +\ No newline at end of file diff --git a/papers/inide-humanai-experience-2024/paper_type.json b/papers/inide-humanai-experience-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "It is explicitly a literature review synthesizing 36 papers across three research branches of in-IDE human-AI experience, with the primary contribution being the organization and synthesis of existing work." 
+} +\ No newline at end of file diff --git a/papers/injecguard-benchmarking-mitigating-2024/paper_type.json b/papers/injecguard-benchmarking-mitigating-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The paper's primary contribution is the InjecGuard benchmark for evaluating prompt injection guards across multiple dimensions (benign, malicious, over-defense), which reveals and characterizes the over-defense problem in existing models; the proposed model is a secondary contribution validating the benchmark's utility." +} +\ No newline at end of file diff --git a/papers/injecting-falsehoods-adversarial-2025/paper_type.json b/papers/injecting-falsehoods-adversarial-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs controlled experiments with the χmera framework to quantify MitM attack success rates and detection performance, with primary contribution being empirical findings on factual recall vulnerabilities in LLMs." +} +\ No newline at end of file diff --git a/papers/input-reduction-enhanced-2025/paper_type.json b/papers/input-reduction-enhanced-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces ReduceFix technique with quantitative experimental results (89.1% input reduction, 53.8% improvement) validated on LFTBench and OSS-Fuzz benchmarks; primary contribution is empirical findings about input reduction's impact on LLM repair performance." +} +\ No newline at end of file diff --git a/papers/institutional-ai-governance-2026/paper_type.json b/papers/institutional-ai-governance-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "Proposes a governance framework (Institutional AI) and argues why existing approaches fail, without experimental validation; the mechanism design formalism is a tool within the framework rather than the primary contribution." 
+} +\ No newline at end of file diff --git a/papers/institutional-ai-governing-2026/paper_type.json b/papers/institutional-ai-governing-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs controlled experiments across 6 model configurations with N=90 runs per condition, reporting quantitative metrics (Cohen's d effect sizes, collusion tier means, incidence rates) comparing institutional governance vs baselines." +} +\ No newline at end of file diff --git a/papers/instruction-tuning-large-2025/paper_type.json b/papers/instruction-tuning-large-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper's primary contribution is experimental validation that instruction tuning a smaller LLM on a curated 7K dataset achieves competitive performance with GPT-4o on tabular data generation, demonstrated through quantitative metrics across 20 existing datasets." +} +\ No newline at end of file diff --git a/papers/insured-agents-decentralized-2025/paper_type.json b/papers/insured-agents-decentralized-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "theoretical", + "reason": "Provides formal game-theoretic analysis proving subgame-perfect equilibrium conditions for the proposed mechanism rather than empirical validation." +} +\ No newline at end of file diff --git a/papers/integrated-alignment-2025/paper_type.json b/papers/integrated-alignment-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "The paper argues a viewpoint about field fragmentation, proposes a conceptual framework (Integrated Alignment with 12 design principles), and makes prescriptive claims about how the field should unify—without experimental validation or formal proofs." 
+} +\ No newline at end of file diff --git a/papers/integrating-aidriven-automated-2025/paper_type.json b/papers/integrating-aidriven-automated-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "Makes prescriptive claims about benefits, challenges, and best practices for AI-driven code review without experimental validation or actual quantitative data despite methodology labels." +} +\ No newline at end of file diff --git a/papers/integrating-generative-ai-2024/paper_type.json b/papers/integrating-generative-ai-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Explicitly described as a narrative review that synthesizes existing knowledge about generative AI in software development across multiple dimensions without presenting original empirical evidence or experiments." +} +\ No newline at end of file diff --git a/papers/intelligence-per-watt-2025/paper_type.json b/papers/intelligence-per-watt-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Reports quantitative experimental results measuring intelligence-per-watt across local and cloud models, with decomposed efficiency improvements and hybrid routing performance." +} +\ No newline at end of file diff --git a/papers/intelligent-devops-leveraging-2024/paper_type.json b/papers/intelligent-devops-leveraging-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Narrative review and meta-analysis drawing metrics from secondary citations rather than original experiments." +} +\ No newline at end of file diff --git a/papers/interactive-code-generation-2022/paper_type.json b/papers/interactive-code-generation-2022/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes TiCoder method and validates it with quantitative experiments on MBPP and HumanEval benchmarks, including ablation studies and performance metrics as primary contributions." 
+} +\ No newline at end of file diff --git a/papers/interactive-debugging-steering-2025/paper_type.json b/papers/interactive-debugging-steering-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts formative interviews and a user study (n=14) reporting quantitative results on debugging strategies and tool effectiveness metrics." +} +\ No newline at end of file diff --git a/papers/interfaze-future-ai-2026/paper_type.json b/papers/interfaze-future-ai-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes Interfaze, a tool-augmented system combining small models, and validates the approach experimentally by reporting quantitative results on existing benchmarks (AIME, MMLU, and others)." +} +\ No newline at end of file diff --git a/papers/internbootcamp-technical-report-2025/paper_type.json b/papers/internbootcamp-technical-report-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Primary contribution is experimental findings on task scaling: showing that increasing task count from 8 to 512 consistently improves reasoning performance and training efficiency, with SOTA results on benchmarks, while InternBootcamp framework and BOOTCAMP-EVAL are the experimental infrastructure rather than the main focus." +} +\ No newline at end of file diff --git a/papers/interpreting-emergent-extreme-2026/paper_type.json b/papers/interpreting-emergent-extreme-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a Shapley value-based attribution framework and validates it experimentally across multiple simulation scenarios, comparing performance against baselines." 
+} +\ No newline at end of file diff --git a/papers/interpretive-cultures-resonance-2026/paper_type.json b/papers/interpretive-cultures-resonance-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts original qualitative research (12 semi-structured interviews) and reports empirical findings about AI use patterns and meaning-making in tarot divination; primary contribution is understanding grounded in interview data." +} +\ No newline at end of file diff --git a/papers/inthewild-model-organisms-2026/paper_type.json b/papers/inthewild-model-organisms-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a data attribution method and empirically validates it on OLMo 2, reporting quantitative results (63-85% reduction in harmful behaviors) demonstrating effectiveness." +} +\ No newline at end of file diff --git a/papers/introduction-generative-ai-2025/paper_type.json b/papers/introduction-generative-ai-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Primarily reviews and synthesizes existing applications of Generative AI in DevOps (meta-analysis), with quantitative claims sourced from secondary sources rather than original experiments or benchmarks." +} +\ No newline at end of file diff --git a/papers/introlm-introspective-language-2026/paper_type.json b/papers/introlm-introspective-language-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces a method (IntroLM) and validates it through experiments with quantitative metrics (ROC-AUC scores), baseline comparisons, ablation studies, and practical application results (latency/usage reduction)." 
+} +\ No newline at end of file diff --git a/papers/intuition-to-evidence-productivity-2025/paper_type.json b/papers/intuition-to-evidence-productivity-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts a year-long real-world deployment study with 300 engineers reporting quantitative productivity metrics, making the primary contribution an experimental finding rather than a benchmark, survey, position, or formal theory." +} +\ No newline at end of file diff --git a/papers/invalidator-automated-patch-2023/paper_type.json b/papers/invalidator-automated-patch-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes INVALIDATOR method, implements it, and reports quantitative experimental results (81% accuracy, 0.87 F1-score) on Defects4J benchmarks with baseline comparisons and ablation study." +} +\ No newline at end of file diff --git a/papers/inverse-reinforcement-learning-2025/paper_type.json b/papers/inverse-reinforcement-learning-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes DR-IRL method and validates it through experiments on multiple safety benchmarks (StrongReject, XsTest, WildChat, stereotypes) across different models, with ablation studies demonstrating experimental contributions." +} +\ No newline at end of file diff --git a/papers/inverserlignment-inverse-reinforcement-2024/paper_type.json b/papers/inverserlignment-inverse-reinforcement-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a method for LLM alignment grounded in inverse RL theory, then validates it through experiments on benchmark tasks with quantitative comparisons, making the primary contribution the empirical findings rather than the theoretical framework alone." 
+} +\ No newline at end of file diff --git a/papers/investigating-intersectional-bias-2025/paper_type.json b/papers/investigating-intersectional-bias-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The primary contribution is experimental findings about confidence disparities across intersectional demographic attributes, with WinoIdentity as the enabling benchmark rather than the main deliverable." +} +\ No newline at end of file diff --git a/papers/investigating-vulnerability-llmasajudge-2025/paper_type.json b/papers/investigating-vulnerability-llmasajudge-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs controlled experiments testing LLM-as-a-Judge vulnerabilities to adversarial attacks, reporting quantitative success rates and comparing multiple attack methods with control conditions." +} +\ No newline at end of file diff --git a/papers/investigation-group-query-2025/paper_type.json b/papers/investigation-group-query-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments investigating how Group Query Attacks degrade LLM performance across different model types and tasks, reporting quantitative empirical findings." +} +\ No newline at end of file diff --git a/papers/ipiguard-novel-tool-2025/paper_type.json b/papers/ipiguard-novel-tool-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes IPIGuard defense mechanism and validates it through extensive experiments on AgentDojo benchmark with quantitative results across six LLMs, four attack types, and ablation studies." 
+} +\ No newline at end of file diff --git a/papers/isheep-selfalignment-llm-2024/paper_type.json b/papers/isheep-selfalignment-llm-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes I-SHEEP method and validates it through experiments on AlpacaEval and IFEval benchmarks, reporting quantitative improvements across model sizes and iterations." +} +\ No newline at end of file diff --git a/papers/jailbreaking-mitigation-vulnerabilities-2024/paper_type.json b/papers/jailbreaking-mitigation-vulnerabilities-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "The paper explicitly surveys and categorizes existing jailbreak attacks and defenses across multiple modalities, synthesizing the field rather than primarily contributing new experiments or benchmarks." +} +\ No newline at end of file diff --git a/papers/jailbreaking-safety-aligned-llms-2024/paper_type.json b/papers/jailbreaking-safety-aligned-llms-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments testing adaptive jailbreaking attacks on multiple leading LLMs and reports quantitative results (100% success rates), with the primary contribution being experimental findings rather than a new benchmark, survey, or theoretical analysis." +} +\ No newline at end of file diff --git a/papers/jatmo-prompt-injection-2023/paper_type.json b/papers/jatmo-prompt-injection-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes Jatmo and demonstrates its effectiveness through experiments across 7 tasks with quantitative metrics (attack success rates, quality comparisons), making experimental findings the primary contribution." 
+} +\ No newline at end of file diff --git a/papers/javabench-benchmark-objectoriented-2024/paper_type.json b/papers/javabench-benchmark-objectoriented-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces JavaBench, a new project-level Java code generation benchmark with specific design (4 OOP projects, 389 methods, 106 classes), validated by student performance; LLM evaluation serves to establish benchmark utility rather than being the primary contribution." +} +\ No newline at end of file diff --git a/papers/jenius-agent-experiencedriven-2026/paper_type.json b/papers/jenius-agent-experiencedriven-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Primary contribution is the Jenius-Agent system and its experimental results (76.5% task completion vs 56.6% baseline), with the custom benchmark used as an evaluation instrument rather than the main contribution." +} +\ No newline at end of file diff --git a/papers/joint-continual-learning-2026/paper_type.json b/papers/joint-continual-learning-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes DA-GRPO method and validates it through experiments on math reasoning and code generation benchmarks, reporting quantitative improvements over baselines." +} +\ No newline at end of file diff --git a/papers/judging-llmasajudge-mtbench-2023/paper_type.json b/papers/judging-llmasajudge-mtbench-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Primary contribution is quantitative experimental findings on LLM judge performance (80%+ agreement rates, bias identification, and mitigation strategies) across controlled and crowdsourced settings, with MT-Bench serving as an evaluation vehicle rather than the main contribution." 
+} +\ No newline at end of file diff --git a/papers/kernelband-steering-llmbased-2025/paper_type.json b/papers/kernelband-steering-llmbased-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes KernelBand method and reports quantitative experimental results (speedups, success rates, ablations) on TritonBench-G across multiple GPU architectures and LLMs." +} +\ No newline at end of file diff --git a/papers/knapspec-selfspeculative-decoding-2026/paper_type.json b/papers/knapspec-selfspeculative-decoding-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes an optimization method (knapsack-based layer selection) and validates it with quantitative experiments across multiple models, reporting wall-clock speedups and throughput improvements as primary contributions." +} +\ No newline at end of file diff --git a/papers/knod-domain-knowledge-2023/paper_type.json b/papers/knod-domain-knowledge-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes KNOD method for automated program repair and validates it through quantitative experiments on three established benchmarks (Defects4J v1.2, v2.0, QuixBugs) with ablation studies measuring component contributions." +} +\ No newline at end of file diff --git a/papers/kormo-korean-open-2025/paper_type.json b/papers/kormo-korean-open-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper trains a new model (KORMo) and reports quantitative benchmark results (64.2 English avg, 58.2 Korean avg) along with experimental findings about synthetic data diversity's effect on training." 
+} +\ No newline at end of file diff --git a/papers/kornat-llm-alignment-2024/paper_type.json b/papers/kornat-llm-alignment-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces KorNAT, a novel benchmark with 4K social value questions and 6K knowledge questions, with baseline LLM evaluations as secondary contribution." +} +\ No newline at end of file diff --git a/papers/ktester-leveraging-domain-2025/paper_type.json b/papers/ktester-leveraging-domain-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes KTester framework and validates it through experiments comparing against baselines with quantitative metrics (5.03% pass rate improvement, 11.67% coverage gain) and ablation studies on the HITS dataset." +} +\ No newline at end of file diff --git a/papers/laafd-llmbased-agents-2026/paper_type.json b/papers/laafd-llmbased-agents-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes an LLM agent workflow for FPGA design and reports quantitative experimental results on 15 HPC kernels with performance comparisons against hand-tuned and domain-specific baselines." +} +\ No newline at end of file diff --git a/papers/laagrv-llm-assisted-2024/paper_type.json b/papers/laagrv-llm-assisted-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes LAAG-RV method and evaluates it experimentally on 6 OpenTitan designs with quantitative results (assertion counts, comparison with ChIRAAG baseline)." +} +\ No newline at end of file diff --git a/papers/lacy-what-small-2026/paper_type.json b/papers/lacy-what-small-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes and experimentally validates LaCy method on a cascading LM setup, reporting quantitative results (FactScore improvements, fact leakage metrics) on benchmark evaluation." 
+} +\ No newline at end of file diff --git a/papers/lamda-language-models-2022/paper_type.json b/papers/lamda-language-models-2022/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experiments fine-tuning language models on dialog tasks and reports quantitative results (73.2% groundedness, 65% citation accuracy, safety improvements), with primary contribution being experimental findings on how supervised fine-tuning improves dialog quality." +} +\ No newline at end of file diff --git a/papers/language-model-behavioral-2025/paper_type.json b/papers/language-model-behavioral-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experiments on 1,418 model checkpoints across three architectures and scales, reporting quantitative findings about behavioral phases and variance explained by heuristics." +} +\ No newline at end of file diff --git a/papers/language-models-code-2025/paper_type.json b/papers/language-models-code-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Systematic literature review synthesizing 53 primary studies on LM approaches to code optimization, analyzing adoption patterns and identifying research gaps rather than conducting novel experiments." +} +\ No newline at end of file diff --git a/papers/large-language-model-2024-2/paper_type.json b/papers/large-language-model-2024-2/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "The paper explicitly states it is a comprehensive survey that categorizes and synthesizes existing LLM applications in telecommunications across four domains, reviews key techniques, and identifies challenges—a meta-analysis contribution focused on field synthesis rather than original experiments." 
+} +\ No newline at end of file diff --git a/papers/large-language-model-2024/paper_type.json b/papers/large-language-model-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Explicitly a survey (meta-analysis) that categorizes and synthesizes existing LLM-based multi-agent systems research along multiple axes rather than introducing new experimental findings or benchmarks." +} +\ No newline at end of file diff --git a/papers/large-language-model-2025-2/paper_type.json b/papers/large-language-model-2025-2/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes and experimentally validates a multi-LLM coordination architecture for dispatch optimization, reporting quantitative results (98% pass rates) and ablation studies demonstrating which components contribute to performance." +} +\ No newline at end of file diff --git a/papers/large-language-model-2025-3/paper_type.json b/papers/large-language-model-2025-3/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Systematic literature review analyzing 102 papers on LLM-based Verilog code generation, synthesizing trends and patterns across the field." +} +\ No newline at end of file diff --git a/papers/large-language-model-2025/paper_type.json b/papers/large-language-model-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes PROD method for code unlearning and validates it experimentally across three tasks, comparing quantitative trade-offs against multiple baselines." +} +\ No newline at end of file diff --git a/papers/large-language-model-2026/paper_type.json b/papers/large-language-model-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Demonstrates an LLM agent framework through two case studies on chemical process simulations, reporting quantitative results (values extracted, suggestion ratings) and qualitative findings from experimental evaluation." 
+} +\ No newline at end of file diff --git a/papers/large-language-models-2022/paper_type.json b/papers/large-language-models-2022/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes a self-verification method and validates it through quantitative experiments on 8 reasoning benchmarks (e.g., +4.33% on GSM8K), analyzing performance improvements and emergent abilities across model sizes—the primary contribution is experimental findings." +} +\ No newline at end of file diff --git a/papers/large-language-models-2024-2/paper_type.json b/papers/large-language-models-2024-2/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Systematic review and meta-analysis of 8 existing LLM-based automated program repair systems, comparing their architectures and performance rather than presenting original experiments." +} +\ No newline at end of file diff --git a/papers/large-language-models-2024/paper_type.json b/papers/large-language-models-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Paper explicitly titled as a survey that provides a taxonomy of LLM applications across ML workflow stages and synthesizes existing approaches into categories." +} +\ No newline at end of file diff --git a/papers/large-language-models-2025-2/paper_type.json b/papers/large-language-models-2025-2/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes Trailblazer framework and validates it through quantitative experiments on benchmarks (ABR, CJS) plus a real-world A/B test with 150K+ users, making the primary contribution the experimental findings demonstrating LLMs' effectiveness as network policies." 
+} +\ No newline at end of file diff --git a/papers/large-language-models-2025-4/paper_type.json b/papers/large-language-models-2025-4/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs controlled experiments comparing LLM models on fault localization tasks, reports quantitative results across multiple datasets and ablations, with primary contribution being the experimental findings rather than a new benchmark or theoretical analysis." +} +\ No newline at end of file diff --git a/papers/large-language-models-2026/paper_type.json b/papers/large-language-models-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "This is an explicit literature review of 57 papers synthesizing findings across LLM applications in software documentation and modeling, with meta-analysis of patterns across task categories and techniques." +} +\ No newline at end of file diff --git a/papers/largescale-independent-comprehensive-2024/paper_type.json b/papers/largescale-independent-comprehensive-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs large-scale experiments comparing LLM test generation against baselines (EvoSuite) and reports quantitative results (readability improvements, compilation rates, mutation scores, prompting strategy impacts), with primary contribution being experimental findings." +} +\ No newline at end of file diff --git a/papers/latent-collaboration-multiagent-2025/paper_type.json b/papers/latent-collaboration-multiagent-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Reports quantitative experimental results across 9 benchmarks (accuracy improvement, token reduction, inference speed) as primary contributions, with theoretical analysis of efficiency as a supporting component." 
+} +\ No newline at end of file diff --git a/papers/latentmem-customizing-latent-2026/paper_type.json b/papers/latentmem-customizing-latent-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes a learnable latent memory framework and validates it through extensive experiments across 6 benchmarks and 4 frameworks with quantitative performance comparisons, making the primary contribution experimental findings rather than a new benchmark, survey, or theoretical analysis." +} +\ No newline at end of file diff --git a/papers/layer-truth-probing-2025/paper_type.json b/papers/layer-truth-probing-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Systematically experiments on LLMs (0.5B-7B) to measure quantitative effects of poisoning on belief shifts, layer-specific effects, and benchmark performance degradation, with primary contribution being empirical findings." +} +\ No newline at end of file diff --git a/papers/layeraware-representation-filtering-2025/paper_type.json b/papers/layeraware-representation-filtering-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces LARF method and validates it empirically on Llama3.1 with quantitative Attack Success Rate metrics, demonstrating its effectiveness at filtering safety-degrading fine-tuning data." +} +\ No newline at end of file diff --git a/papers/ldscene-llmguided-diffusion-2025/paper_type.json b/papers/ldscene-llmguided-diffusion-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes LD-Scene method, runs experiments on nuScenes dataset, reports quantitative results (collision rates, success rates) compared to baselines, and validates the LLM-guided approach—primary contribution is experimental findings, not benchmark creation." 
+} +\ No newline at end of file diff --git a/papers/leakage-code-generation-2024/paper_type.json b/papers/leakage-code-generation-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper empirically investigates contamination in existing benchmarks through observational analysis and validates findings with quantitative comparisons across models; while LBPP is released, the primary contribution is the empirical discovery of contamination mechanisms." +} +\ No newline at end of file diff --git a/papers/leaksealer-semisupervised-defense-2025/paper_type.json b/papers/leaksealer-semisupervised-defense-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes LeakSealer, runs experiments on multiple datasets (PII leakage, ToxicChat, OpenAI), and reports quantitative metrics (AUPRC, F1, recall) compared to baselines, making the primary contribution experimental validation of the method." +} +\ No newline at end of file diff --git a/papers/learn-code-sustainably-2024/paper_type.json b/papers/learn-code-sustainably-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs controlled experiments evaluating three LLM systems on coding tasks with quantitative measurements (runtime, memory, energy consumption) and reports empirical findings about their green code generation capabilities." +} +\ No newline at end of file diff --git a/papers/learning-code-preference-2024/paper_type.json b/papers/learning-code-preference-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces CODEFAVOR framework and validates it experimentally with quantitative results (28.8% improvement, cost-efficiency gains, human-model comparisons), where the primary contribution is the empirical findings." 
+} +\ No newline at end of file diff --git a/papers/learning-configure-agentic-2026/paper_type.json b/papers/learning-configure-agentic-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces an RL framework (ARC) and reports quantitative experimental results comparing it to baselines across multiple benchmarks, with key findings about accuracy-cost tradeoffs and model transfer capabilities." +} +\ No newline at end of file diff --git a/papers/learning-decentralized-llm-2026/paper_type.json b/papers/learning-decentralized-llm-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes two multi-agent actor-critic methods (CoLLM-CC and CoLLM-DC) and validates them experimentally across benchmarks (Minecraft, coding, writing) with quantitative performance comparisons against baselines." +} +\ No newline at end of file diff --git a/papers/learning-from-negative-2025/paper_type.json b/papers/learning-from-negative-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs controlled experiments measuring how models respond to warning-framed training data, reports quantitative vulnerability rates, and tests mechanistic explanations via SAE analysis and interventions." +} +\ No newline at end of file diff --git a/papers/learning-guarantee-type-2025/paper_type.json b/papers/learning-guarantee-type-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces TyFlow, a novel neural code synthesis system, and validates it through quantitative experiments on SuFu and Java benchmarks, reporting error rates and pass@10 improvements." 
+} +\ No newline at end of file diff --git a/papers/learning-partneraware-collaborators-2025/paper_type.json b/papers/learning-partneraware-collaborators-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes ICR method with counterfactual KL-divergence regularization and validates it experimentally against baselines, reporting quantitative results on task accuracy and common ground metrics." +} +\ No newline at end of file diff --git a/papers/leasttomost-prompting-enables-2022/paper_type.json b/papers/leasttomost-prompting-enables-2022/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes least-to-most prompting and validates it experimentally with quantitative results across multiple benchmarks (SCAN, last-letter-concatenation, GSM8K), demonstrating the primary contribution is the empirical findings showing performance improvements." +} +\ No newline at end of file diff --git a/papers/leetcodedataset-temporal-dataset-2025/paper_type.json b/papers/leetcodedataset-temporal-dataset-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces LeetCodeDataset with 2,869 curated Python problems and temporal contamination-aware splits as the primary contribution, using empirical results to validate the dataset's utility." +} +\ No newline at end of file diff --git a/papers/less-training-more-2022/paper_type.json b/papers/less-training-more-2022/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes AlphaRepair method and reports quantitative experimental results on Defects4J and QuixBugs benchmarks, with ablation study showing contributions; primary contribution is empirical findings on effectiveness of zero-shot APR." 
+} +\ No newline at end of file diff --git a/papers/lessleakbench-first-investigation-2025/paper_type.json b/papers/lessleakbench-first-investigation-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper systematically investigates data leakage across 83 existing benchmarks, reports quantitative leakage percentages and performance inflation metrics, and identifies empirical patterns—the primary contribution is experimental findings, not the benchmark itself." +} +\ No newline at end of file diff --git a/papers/lesson-multilabel-adversarial-2024/paper_type.json b/papers/lesson-multilabel-adversarial-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Presents the LESSON adversarial attack framework and reports quantitative experimental results (100% attack success on 118-bus system, performance variation with scale) on deep learning-based locational detectors." +} +\ No newline at end of file diff --git a/papers/lessons-from-trenches-2024/paper_type.json b/papers/lessons-from-trenches-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The primary contribution is the lm-eval library—a standardized evaluation framework with version tracking and configurable task definitions—supported by empirical case studies demonstrating reproducibility challenges." +} +\ No newline at end of file diff --git a/papers/let-barbarians-how-2025/paper_type.json b/papers/let-barbarians-how-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Paper runs experiments with three ADRS frameworks across ten case studies and reports quantitative performance improvements (13x faster, 35% savings), with primary contribution being experimental findings rather than benchmark design, survey, theory, or position." 
+} +\ No newline at end of file diff --git a/papers/lethe-purifying-backdoored-2025/paper_type.json b/papers/lethe-purifying-backdoored-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes a backdoor defense method (LETHE) and reports quantitative experimental results (98% attack reduction, <3% accuracy drop) across 5 LLMs compared to 8 baselines, with robustness testing." +} +\ No newline at end of file diff --git a/papers/leveraging-large-language-2023/paper_type.json b/papers/leveraging-large-language-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper conducts systematic experiments evaluating prompt engineering strategies and model variants (GPT-3.5 vs GPT-4), reporting quantitative results on code executability—the primary contribution is experimental findings about effectiveness." +} +\ No newline at end of file diff --git a/papers/leveraging-large-language-2024/paper_type.json b/papers/leveraging-large-language-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes LLM4PatchCorrect method for patch assessment and validates it through experiments across 22 APR tools, reporting quantitative metrics (84.4% accuracy, 86.5% F1) and comparisons with baseline methods." +} +\ No newline at end of file diff --git a/papers/leveraging-mutation-analysis-2026/paper_type.json b/papers/leveraging-mutation-analysis-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experiments comparing different LLM repair configurations on Bugs4Q benchmark, reporting quantitative success rates and evaluation metrics to validate that mutation analysis improves quantum program repair." 
+} +\ No newline at end of file diff --git a/papers/leveraging-rust-types-2023/paper_type.json b/papers/leveraging-rust-types-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper implements RusSOL and validates it through experiments on synthesis tasks, reporting quantitative results (115/117 tasks solved, performance metrics, real-world crate evaluation)." +} +\ No newline at end of file diff --git a/papers/lhdeception-simulating-understanding-2025/paper_type.json b/papers/lhdeception-simulating-understanding-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Evaluates 11 frontier LLMs with quantitative deception rates and correlations across long-horizon interactions, with primary contributions being experimental findings about behavioral patterns rather than the benchmark framework itself." +} +\ No newline at end of file diff --git a/papers/library-hallucinations-llm-2025/paper_type.json b/papers/library-hallucinations-llm-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Reports quantitative experiments measuring LLM susceptibility to library hallucinations (84%, 26%, 99% rates) and tests prompt engineering mitigations with measurable outcomes across models." +} +\ No newline at end of file diff --git a/papers/library-llm-intrinsics-2025/paper_type.json b/papers/library-llm-intrinsics-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Implements a library of 8 RAG intrinsics as LoRA adapters and reports quantitative performance results including 22pp retrieval recall improvements and comparative benchmarks." 
+} +\ No newline at end of file diff --git a/papers/lidl-llm-integration-2026/paper_type.json b/papers/lidl-llm-integration-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces LIDL system and reports quantitative results on real-world LLM integration defect localization, including accuracy/MAP metrics, baseline comparisons, and ablation studies." +} +\ No newline at end of file diff --git a/papers/limagents-multiagent-llms-2025/paper_type.json b/papers/limagents-multiagent-llms-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experiments comparing multi-agent LLM configurations, reports quantitative improvements on ground truth coverage metrics, and evaluates performance across different models and agent setups." +} +\ No newline at end of file diff --git a/papers/limits-layer-pruning-2026/paper_type.json b/papers/limits-layer-pruning-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts experiments testing layer pruning at various ratios, reports quantitative performance metrics across classification and reasoning tasks, and provides empirical findings about degradation mechanisms and recovery methods." +} +\ No newline at end of file diff --git a/papers/linguistics-theory-meets-2024/paper_type.json b/papers/linguistics-theory-meets-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes ECT-constrained LLM approach for code-switching and validates it through experiments with statistically significant quantitative results across multiple language pairs and a correlation study of automatic metrics." 
+} +\ No newline at end of file diff --git a/papers/literature-review-aipowered-2025/paper_type.json b/papers/literature-review-aipowered-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Paper explicitly conducts a literature review of 20 existing papers on code navigation tools, summarizing their methodologies and findings in tabular format without developing or evaluating an original system." +} +\ No newline at end of file diff --git a/papers/live-swe-agent-self-evolve-2025/paper_type.json b/papers/live-swe-agent-self-evolve-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes and experimentally validates LIVE-SWE-AGENT on existing benchmarks (SWE-bench Verified/Pro), reporting SOTA quantitative results and ablation studies; primary contribution is experimental findings." +} +\ No newline at end of file diff --git a/papers/livebench-challenging-contaminationfree-2024/paper_type.json b/papers/livebench-challenging-contaminationfree-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The paper's primary contribution is introducing LiveBench, a new contamination-resistant LLM benchmark with 18 tasks and objective ground-truth scoring; the model evaluation results serve to validate and demonstrate the benchmark rather than being the main finding." +} +\ No newline at end of file diff --git a/papers/livebench-challenging-contaminationlimited-2024/paper_type.json b/papers/livebench-challenging-contaminationlimited-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The primary contribution is introducing LiveBench, a new LLM benchmark with frequent updates and objective scoring; experimental results validate the benchmark's quality rather than being the main research finding." 
+} +\ No newline at end of file diff --git a/papers/livecodebench-2024/paper_type.json b/papers/livecodebench-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces LiveCodeBench, a new continuously-updated contamination-free evaluation benchmark for LLMs on code with multiple scenarios, where experimental results on models are secondary to the benchmark contribution itself." +} +\ No newline at end of file diff --git a/papers/llama-3-herd-2024/paper_type.json b/papers/llama-3-herd-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs extensive experiments evaluating Llama 3 across multiple benchmarks (knowledge, coding, math, multilingual), reports quantitative results, validates scaling laws empirically, and includes human evaluations—the primary contribution is the experimental findings on model performance." +} +\ No newline at end of file diff --git a/papers/llama-open-efficient-2023/paper_type.json b/papers/llama-open-efficient-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper trains foundation language models and reports quantitative benchmark results demonstrating that LLaMA models achieve state-of-the-art performance, with the primary contribution being the experimental findings on model efficiency and scaling." +} +\ No newline at end of file diff --git a/papers/llm-agent-fire-2024/paper_type.json b/papers/llm-agent-fire-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper reports quantitative experimental results testing FoamPilot (an LLM agent) on FireFOAM tasks with measured success rates across complexity levels, where the primary contribution is understanding the agent's performance and identifying bottlenecks." 
+} +\ No newline at end of file diff --git a/papers/llm-agentic-failures-qualitative-2025/paper_type.json b/papers/llm-agentic-failures-qualitative-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experiments on 900 agentic execution traces across three LLMs, reports quantitative performance metrics (e.g., 2/30 accuracy), and contributes experimental findings about failure archetypes rather than introducing a novel benchmark or dataset as the primary contribution." +} +\ No newline at end of file diff --git a/papers/llm-agents-generating-2025/paper_type.json b/papers/llm-agents-generating-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs systematic experiments with LLM agents on microservice specifications, reports quantitative performance metrics (83-94% unit test pass rates), and evaluates different approaches (fine-grained vs service-level code generation, reflection strategies) across models, making experimental findings the primary contribution." +} +\ No newline at end of file diff --git a/papers/llm-agents-interaction-2024/paper_type.json b/papers/llm-agents-interaction-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs controlled experiments with personality-conditioned LLM agents and reports quantitative results (98.5% classification accuracy, consistency variations, linguistic alignment metrics) using established measurement tools." +} +\ No newline at end of file diff --git a/papers/llm-agents-se-survey-2024/paper_type.json b/papers/llm-agents-se-survey-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Explicitly a survey that collects and meta-analyzes 124 papers on LLM-based agents for software engineering, categorizing them from multiple perspectives and synthesizing field patterns." 
+} +\ No newline at end of file diff --git a/papers/llm-alignment-as-2025/paper_type.json b/papers/llm-alignment-as-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes LARPO, an IR-based alignment method, and validates it with quantitative experiments on AlpacaEval2 and MixEval-Hard, with primary contributions being empirical findings about relative performance and design choices." +} +\ No newline at end of file diff --git a/papers/llm-assistants-productivity-slr-2025/paper_type.json b/papers/llm-assistants-productivity-slr-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Explicitly a systematic literature review synthesizing findings from 37 peer-reviewed studies with meta-analysis of research patterns and landscape." +} +\ No newline at end of file diff --git a/papers/llm-code-review-benchmarking-2025/paper_type.json b/papers/llm-code-review-benchmarking-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces SWR-Bench, a 1000-PR code review benchmark with full project context, while evaluating existing ACR tools on this benchmark." +} +\ No newline at end of file diff --git a/papers/llm-code-security-slr-2024/paper_type.json b/papers/llm-code-security-slr-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Explicitly a systematic literature review (SLR) that synthesizes findings from 20 studies, identifying vulnerability categories and patterns across existing LLM code security research." 
+} +\ No newline at end of file diff --git a/papers/llm-deception-self-preservation-2025/paper_type.json b/papers/llm-deception-self-preservation-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts experiments (case-study prompts) with DeepSeek R1 and reports qualitative empirical findings about observed deceptive behaviors, making the primary contribution the experimental observations rather than a novel benchmark, survey, theoretical proof, or position argument." +} +\ No newline at end of file diff --git a/papers/llm-fuzzing-challenges-2024/paper_type.json b/papers/llm-fuzzing-challenges-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "The paper explicitly reviews the intersection of LLMs and fuzzing, categorizes existing approaches, and synthesizes findings from prior work rather than conducting original experiments." +} +\ No newline at end of file diff --git a/papers/llm-hallucinations-code-practical-2024/paper_type.json b/papers/llm-hallucinations-code-practical-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments on six models, quantitatively categorizes hallucination phenomena, identifies root causes, and empirically validates a RAG-based mitigation approach." +} +\ No newline at end of file diff --git a/papers/llm-harms-taxonomy-2025/paper_type.json b/papers/llm-harms-taxonomy-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "This is a systematic review of 200 papers with meta-analysis methodology that synthesizes existing literature on LLM harms and proposes an organizing taxonomy derived from that synthesis." 
+} +\ No newline at end of file diff --git a/papers/llm-impact-code-review-2025/paper_type.json b/papers/llm-impact-code-review-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Observational study of 25,473 GitHub PRs reporting quantitative metrics on LLM impact (merge times, review times) with manual analysis of 310 cases; primary contribution is experimental findings." +} +\ No newline at end of file diff --git a/papers/llm-long-term-memory-eval-2024/paper_type.json b/papers/llm-long-term-memory-eval-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments comparing multiple LLM approaches (base models, long-context LLMs, RAG, summarization) on long conversational memory tasks and reports quantitative performance results; the primary contribution is the empirical findings about which approaches work best, not the benchmark itself." +} +\ No newline at end of file diff --git a/papers/llm-pros-icpc-competitive-2025/paper_type.json b/papers/llm-pros-icpc-competitive-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Evaluates multiple LLMs on existing ICPC competitive programming problems with quantitative performance metrics, contributing experimental findings rather than a new benchmark or dataset." +} +\ No newline at end of file diff --git a/papers/llm-requirements-engineering-slr-2025/paper_type.json b/papers/llm-requirements-engineering-slr-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Explicitly a systematic literature review (SLR) that synthesizes findings from 74 papers on LLMs for Requirements Engineering, analyzing trends and patterns across the field." 
+} +\ No newline at end of file diff --git a/papers/llm-secure-code-gen-empirical-2025/paper_type.json b/papers/llm-secure-code-gen-empirical-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts experiments measuring vulnerability rates in LLM-generated code (9.8%-42.1%) and quantifies the effectiveness of different guidance strategies, with the primary contribution being empirical findings on code generation flaws and mitigation techniques." +} +\ No newline at end of file diff --git a/papers/llm-strategic-deception-under-pressure-2023/paper_type.json b/papers/llm-strategic-deception-under-pressure-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs controlled experiments with LLMs in a simulated trading scenario, reports quantitative findings (75%+ insider trading, ~90% deception rates), and measures behavioral variation across conditions and models." +} +\ No newline at end of file diff --git a/papers/llm-test-generation-2025/paper_type.json b/papers/llm-test-generation-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes Panta method, runs experiments on Defects4J benchmark, reports quantitative coverage metrics, and includes ablation study comparing against baselines." +} +\ No newline at end of file diff --git a/papers/llm-test-script-2023/paper_type.json b/papers/llm-test-script-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs case studies with ChatGPT to evaluate capabilities and limitations for test script generation, reporting qualitative findings about what the model can and cannot do." 
+} +\ No newline at end of file diff --git a/papers/llm-theory-mind-2024/paper_type.json b/papers/llm-theory-mind-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "Maps opportunities and risks of LLM theory of mind for alignment, proposing conceptual frameworks without experimental validation or formal mathematical analysis." +} +\ No newline at end of file diff --git a/papers/llm-unit-test-generation-empirical-2024/paper_type.json b/papers/llm-unit-test-generation-empirical-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Reports quantitative experimental findings comparing LLM unit test generation across multiple models with measured metrics (syntactic validity, coverage, defect detection), testing how design choices affect performance." +} +\ No newline at end of file diff --git a/papers/llm4cve-enabling-iterative-2025/paper_type.json b/papers/llm4cve-enabling-iterative-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes an iterative vulnerability repair pipeline and reports quantitative experimental results across multiple LLMs with metrics (correctness scores, CodeBLEU improvements), making empirical findings the primary contribution." +} +\ No newline at end of file diff --git a/papers/llmaided-testbench-generation-2024/paper_type.json b/papers/llmaided-testbench-generation-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Reports quantitative experimental results (90–100% coverage, iteration counts) comparing GPT-3.5 and GPT-4 performance on testbench generation and bug detection tasks across FSM cases, with the primary contribution being the empirical findings about iterative feedback effectiveness." 
+} +\ No newline at end of file diff --git a/papers/llmailinject-dataset-from-2025/paper_type.json b/papers/llmailinject-dataset-from-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The paper introduces LLMail-Inject, a new dataset of 208,095 prompt injection attacks from a challenge, with evaluation baselines; the primary contribution is the benchmark itself, not novel experimental findings on existing datasets." +} +\ No newline at end of file diff --git a/papers/llmalign-utilizing-large-2024/paper_type.json b/papers/llmalign-utilizing-large-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes LLM-Align method, runs experiments on DBP15K benchmark with quantitative metrics (Hits@1 scores), and includes ablation studies showing component contributions—primary contribution is experimental findings of SOTA performance." +} +\ No newline at end of file diff --git a/papers/llmassisted-static-analysis-2024/paper_type.json b/papers/llmassisted-static-analysis-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Presents IRIS system with quantitative experimental results on CWE-Bench-Java, comparing vulnerability detection rates across different models and baseline approaches." +} +\ No newline at end of file diff --git a/papers/llmbased-framework-support-2025/paper_type.json b/papers/llmbased-framework-support-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes and evaluates a tool through case-study methodology on a real specification, reporting findings about the approach's limitations despite limited scale." 
+} +\ No newline at end of file diff --git a/papers/llmbased-multiagent-systems-2024/paper_type.json b/papers/llmbased-multiagent-systems-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "This is a systematic literature review synthesizing 71 primary studies on LLM-based multi-agent systems in software engineering; the case studies with ChatDev are illustrative examples of findings from the review, not the primary contribution." +} +\ No newline at end of file diff --git a/papers/llmbased-retrievalaugmented-control-2024/paper_type.json b/papers/llmbased-retrievalaugmented-control-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Demonstrates feasibility through 3 spot-sample experiments with GPT-4, reporting quantitative results (2 of 3 required manual corrections) from retrieval-augmented control code generation." +} +\ No newline at end of file diff --git a/papers/llmbased-unit-test-2024/paper_type.json b/papers/llmbased-unit-test-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces APT tool and runs experiments on 1515 Java methods, reporting quantitative results (60.2% test execution, 54.2% coverage) with comparative analysis against baselines; primary contribution is empirical performance findings." +} +\ No newline at end of file diff --git a/papers/llmbased-vulnerability-detection-2026/paper_type.json b/papers/llmbased-vulnerability-detection-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experiments across 222 known vulnerabilities and 24 open-source projects, reports quantitative recall/SFDR metrics, and analyzes error patterns through manual sampling—primary contribution is experimental findings about LLM-based vulnerability detection effectiveness." 
+} +\ No newline at end of file diff --git a/papers/llmbscvm-llmbased-blockchain-2025/paper_type.json b/papers/llmbscvm-llmbased-blockchain-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a six-agent LLM-based framework and validates it through experiments on a benchmark dataset, reporting quantitative results (91.1% accuracy, 91.0% F1) with comparative baselines." +} +\ No newline at end of file diff --git a/papers/llmcoordination-evaluating-analyzing-2023/paper_type.json b/papers/llmcoordination-evaluating-analyzing-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments evaluating LLMs on coordination games (Overcooked-AI, Hanabi), reports quantitative results comparing against RL baselines, and provides fine-grained performance analysis—primary contribution is experimental findings rather than a new benchmark or framework." +} +\ No newline at end of file diff --git a/papers/llmpowered-test-case-2024-2/paper_type.json b/papers/llmpowered-test-case-2024-2/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Reports quantitative experimental results (F1 scores) comparing the TrickCatcher approach against baselines on established benchmarks (TrickyBugs, EvalPlus), with the primary contribution being the empirical findings of the method's performance." +} +\ No newline at end of file diff --git a/papers/llmpowered-test-case-2024/paper_type.json b/papers/llmpowered-test-case-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes TrickCatcher, implements the LLM-based approach, and evaluates it with quantitative F1 scores on existing benchmarks (TrickyBugs, EvalPlus) with ablation studies, making experimental findings the primary contribution." 
+} +\ No newline at end of file diff --git a/papers/llmrouterbench-massive-benchmark-2026/paper_type.json b/papers/llmrouterbench-massive-benchmark-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces LLMRouterBench as the primary contribution—a new benchmark and unified evaluation framework for LLM routing methods—evaluated across 21 datasets and 33 models, with method performance comparison as secondary analysis." +} +\ No newline at end of file diff --git a/papers/llms-as-zeroshot-2024/paper_type.json b/papers/llms-as-zeroshot-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes TEA-GLM and experimentally validates it against baselines (GraphGPT, LLaGA) across multiple datasets and tasks, reporting quantitative results (accuracy, AUC) as the primary contribution." +} +\ No newline at end of file diff --git a/papers/llms-encode-their-2026/paper_type.json b/papers/llms-encode-their-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Trains linear probes on model activations and reports quantitative experimental results (AUROC > 0.7) evaluating success prediction on math and coding benchmarks; primary contribution is the experimental finding that pre-generation activations encode failure information." +} +\ No newline at end of file diff --git a/papers/llms-prescient-continuous-2024/paper_type.json b/papers/llms-prescient-continuous-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts continuous evaluation experiments measuring LLM forecasting accuracy over time, reporting quantitative performance degradation results (21.55%, 11.33%) and testing interventions like RAG and gold article access." 
+} +\ No newline at end of file diff --git a/papers/llms-se-systematic-review-2023/paper_type.json b/papers/llms-se-systematic-review-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Systematic literature review of 395 papers that maps the LLM4SE landscape, synthesizes findings across dimensions, and identifies research gaps—the primary contribution is synthesis of existing work, not new experiments or benchmarks." +} +\ No newline at end of file diff --git a/papers/llms-software-security-2025/paper_type.json b/papers/llms-software-security-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "The paper explicitly surveys 58 existing papers on LLM-based vulnerability detection, synthesizing patterns in model usage, target languages, and techniques with a meta-analysis methodology." +} +\ No newline at end of file diff --git a/papers/llmsecconfig-llmbased-approach-2025/paper_type.json b/papers/llmsecconfig-llmbased-approach-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The primary contribution is experimental validation of LLMSecConfig, with quantitative results (94.3% pass rate) and ablation studies on real-world configurations demonstrating the system's effectiveness." +} +\ No newline at end of file diff --git a/papers/local-llm-ensembles-2025/paper_type.json b/papers/local-llm-ensembles-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experiments testing an ensemble pipeline on existing Portuguese NER datasets and reports quantitative performance comparisons, making experimental findings the primary contribution." 
+} +\ No newline at end of file diff --git a/papers/lockin-phase-hypothesis-2025/paper_type.json b/papers/lockin-phase-hypothesis-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes the Lock-In Phase Hypothesis and validates it through experiments on 4 models, reporting quantitative measurements of behavioral consolidation (Refusal Elasticity) and capability side-effects across model scales—making the primary contribution experimental findings, not just the conceptual framework." +} +\ No newline at end of file diff --git a/papers/logically-constrained-decoding-2025/paper_type.json b/papers/logically-constrained-decoding-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Presents a logically constrained decoding technique and demonstrates its effectiveness through quantitative experiments on chess and proof generation tasks, with specific numerical results showing dramatic improvements over unconstrained baselines." +} +\ No newline at end of file diff --git a/papers/lost-translation-study-2024/paper_type.json b/papers/lost-translation-study-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs controlled experiments testing multiple LLMs on code translation, reports quantitative success rates and bug statistics across 1,748 translations, and analyzes bug patterns empirically—the primary contribution is experimental findings about LLM code translation reliability and failure modes." +} +\ No newline at end of file diff --git a/papers/lumen-developer-agency-2025/paper_type.json b/papers/lumen-developer-agency-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "Proposes a tool and interaction paradigm supported by cognitive walkthrough analysis rather than empirical user study, making prescriptive claims about context assembly without experimental validation." 
+} +\ No newline at end of file diff --git a/papers/magicagent-generalized-agent-2026/paper_type.json b/papers/magicagent-generalized-agent-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a training method and algorithm, validates through experiments on five existing benchmarks with quantitative performance comparisons." +} +\ No newline at end of file diff --git a/papers/make-every-move-2024/paper_type.json b/papers/make-every-move-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper presents a method (MCTS-guided LLM-based RTL generation) and validates it experimentally on 15 circuit modules, reporting quantitative improvements over baselines—a primary contribution of experimental findings rather than a new benchmark or theoretical result." +} +\ No newline at end of file diff --git a/papers/making-llms-reliable-2025/paper_type.json b/papers/making-llms-reliable-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs qualitative experiments with 7 frontier LLMs across 3 simulated scenarios and reports specific empirical findings about the architecture's effectiveness (e.g., one-shot prompting failed, 5 of 7 models achieved partnership after calibration)." +} +\ No newline at end of file diff --git a/papers/malice-agentland-down-2025/paper_type.json b/papers/malice-agentland-down-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs controlled experiments testing backdoor attacks on AI agents across multiple threat models, reports quantitative attack success rates and poisoning thresholds, and evaluates existing defense mechanisms—primary contribution is experimental findings." 
+} +\ No newline at end of file diff --git a/papers/mama-gametheoretic-approach-2026/paper_type.json b/papers/mama-gametheoretic-approach-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces MaMa framework and validates through experiments across four environments, reporting quantitative safety improvements against baselines with validated metrics." +} +\ No newline at end of file diff --git a/papers/manatee-inferencetime-lightweight-2026/paper_type.json b/papers/manatee-inferencetime-lightweight-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes MANATEE defense mechanism and validates it experimentally across three attack datasets and three LLM models, reporting quantitative ASR reduction results." +} +\ No newline at end of file diff --git a/papers/manipulating-llm-web-2025/paper_type.json b/papers/manipulating-llm-web-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Paper runs experiments demonstrating prompt injection attacks on real websites, reports quantitative attack success rates (ASR 0.83–1.0), and tests transferability across models with multiple attack scenarios." +} +\ No newline at end of file diff --git a/papers/manipulating-multimodal-agents-2025/paper_type.json b/papers/manipulating-multimodal-agents-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Demonstrates CrossInject attack with quantitative ASR metrics (97% vs 0%), includes ablation studies (18.7% and 24.8% component contributions), and tests on multimodal agent benchmarks/case studies—primary contribution is experimental findings on attack effectiveness." 
+} +\ No newline at end of file diff --git a/papers/many-ai-analysts-2026/paper_type.json b/papers/many-ai-analysts-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs 4,946 experimental trials across three datasets measuring quantitative outcomes (effect sizes, p-values, dispersion metrics) of autonomous AI analysts under different personas and models." +} +\ No newline at end of file diff --git a/papers/marshal-incentivizing-multiagent-2025/paper_type.json b/papers/marshal-incentivizing-multiagent-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes MARSHAL RL framework and validates it through quantitative experiments showing 28.7% improvement on held-out games and 10% gains on AIME, with ablation studies demonstrating critical components." +} +\ No newline at end of file diff --git a/papers/martingale-score-unsupervised-2025/paper_type.json b/papers/martingale-score-unsupervised-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The paper's primary contribution is proposing a new unsupervised metric (Martingale Score) for evaluating LLM reasoning; experiments across 6 models and 3 domains validate the metric rather than serving as the main contribution." +} +\ No newline at end of file diff --git a/papers/masked-hardattention-transformers-2023/paper_type.json b/papers/masked-hardattention-transformers-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "theoretical", + "reason": "Establishes exact characterizations of masked hard-attention transformers through formal language theory, proving they recognize precisely the star-free languages through mathematical analysis rather than empirical evaluation." 
+} +\ No newline at end of file diff --git a/papers/mathematical-methods-human-2026/paper_type.json b/papers/mathematical-methods-human-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "Argues a viewpoint on AI as evolution of human cognitive tools, proposes a conceptual Copernican framework for human-centered development, and makes prescriptive claims about responsible AI integration without experimental validation." +} +\ No newline at end of file diff --git a/papers/matplotagent-method-evaluation-2024/paper_type.json b/papers/matplotagent-method-evaluation-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The primary contribution is the MatPlotAgent framework, validated through quantitative experiments showing performance improvements across multiple LLMs; the benchmark and evaluation methodology are secondary contributions supporting the empirical findings." +} +\ No newline at end of file diff --git a/papers/may-i-have-2025/paper_type.json b/papers/may-i-have-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes ASTRA attack method and reports quantitative experimental results (72.5-96% success rates) comparing it against existing defenses and attack baselines." +} +\ No newline at end of file diff --git a/papers/maybe-we-need-2025/paper_type.json b/papers/maybe-we-need-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts a qualitative empirical study with 54 developers through paired interviews to identify behavioral and organizational factors driving GenAI tool adoption, reporting findings about developer mindsets and organizational dynamics." 
+} +\ No newline at end of file diff --git a/papers/mcp-safety-audit-2025/paper_type.json b/papers/mcp-safety-audit-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper demonstrates concrete security exploits on LLMs (Claude, Llama) through case studies and introduces a novel attack (RADE), making the primary contribution experimental findings rather than a benchmark, survey, position, or theoretical analysis." +} +\ No newline at end of file diff --git a/papers/mcp-security-bench-2025/paper_type.json b/papers/mcp-security-bench-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces MSB, a new security benchmark with 12 MCP-specific attack types and 2,000 instances for evaluating LLM agent vulnerabilities; evaluates baselines to validate the benchmark." +} +\ No newline at end of file diff --git a/papers/mcp-security-risks-governance-2025/paper_type.json b/papers/mcp-security-risks-governance-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "Proposes a defense-in-depth security framework and governance model for MCP with threat analysis and control mapping to standards, without experimental validation of the proposed controls." +} +\ No newline at end of file diff --git a/papers/mcp-security-sok-2025/paper_type.json b/papers/mcp-security-sok-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Systematization of Knowledge (SoK) papers are surveys that create taxonomies and organize existing knowledge; this paper provides the first academic taxonomy of security and safety risks in MCP without running new experiments or introducing a benchmark." 
+} +\ No newline at end of file diff --git a/papers/measuring-agents-production-2025/paper_type.json b/papers/measuring-agents-production-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts primary observational research combining 20 case studies and 306 practitioner surveys to report quantitative findings about production agent deployment practices." +} +\ No newline at end of file diff --git a/papers/measuring-ai-ability-2025/paper_type.json b/papers/measuring-ai-ability-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Measures AI model performance on long software tasks, reports quantitative results on task completion times and trends (50% horizon doubling ~every 7 months), and analyzes capability improvements across multiple models and task characteristics." +} +\ No newline at end of file diff --git a/papers/measuring-impact-programming-2023/paper_type.json b/papers/measuring-impact-programming-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The primary contribution is quantitative experimental findings about how balanced programming language distributions impact multilingual code model performance across multiple training scales and tasks, using BabelCode as the evaluation infrastructure." +} +\ No newline at end of file diff --git a/papers/measuring-mid2025-llmassistance-2026/paper_type.json b/papers/measuring-mid2025-llmassistance-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts a randomized controlled trial (n=153) measuring LLM effects on novice biology lab performance, reporting quantitative experimental findings with statistical analysis and Bayesian modeling." 
+} +\ No newline at end of file diff --git a/papers/measuring-technical-debt-2024/paper_type.json b/papers/measuring-technical-debt-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "A scoping review of 72 studies that systematizes existing knowledge into 18 categorized types of technical debt in AI systems; the primary contribution is synthesis and meta-analysis of the field, not experimental findings or benchmark creation." +} +\ No newline at end of file diff --git a/papers/measuring-what-matters-2025/paper_type.json b/papers/measuring-what-matters-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "Systematic review and meta-analysis of 445 LLM benchmark papers, synthesizing construct validity weaknesses across the field rather than introducing new experiments, benchmarks, or formal theory." +} +\ No newline at end of file diff --git a/papers/mechanistic-emergence-symbol-2025/paper_type.json b/papers/mechanistic-emergence-symbol-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Trains neural networks from scratch with controlled architecture variations (Transformers, Mamba-2, LSTMs) and reports quantitative measurements of symbol grounding emergence, layer-wise analysis, and information-theoretic metrics across model types." +} +\ No newline at end of file diff --git a/papers/mechanistic-exploration-backdoored-2025/paper_type.json b/papers/mechanistic-exploration-backdoored-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts experiments analyzing attention patterns in backdoored LLMs with quantitative findings on layer-specific effects and head counts needed to patch behaviors." 
+} +\ No newline at end of file diff --git a/papers/melon-provable-defense-2025/paper_type.json b/papers/melon-provable-defense-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces MELON defense method and validates it through experiments on AgentDojo benchmark across multiple models with quantitative results (attack success rates, utility metrics) and ablation studies." +} +\ No newline at end of file diff --git a/papers/memgpt-llms-as-2023/paper_type.json b/papers/memgpt-llms-as-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "MemGPT proposes a system and validates it through quantitative experiments on memory retrieval, document QA, and nested lookup tasks, demonstrating concrete performance improvements over baselines." +} +\ No newline at end of file diff --git a/papers/memorize-generalize-evaluating-2025/paper_type.json b/papers/memorize-generalize-evaluating-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes an evaluation methodology (code rewriting and Memorization Risk Index) to experimentally investigate how harmful memorization behaves in LLM code generation across different scales, training methods, and task difficulties, with the primary contribution being the empirical findings about memorization patterns rather than the methodology itself." +} +\ No newline at end of file diff --git a/papers/mentorcollab-selective-largetosmall-2026/paper_type.json b/papers/mentorcollab-selective-largetosmall-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a method (MentorCollab) and experimentally validates it across 15 generator-mentor pairs and 3 domains, reporting quantitative accuracy improvements and ablation analysis results." 
+} +\ No newline at end of file diff --git a/papers/mercury-code-efficiency-2024/paper_type.json b/papers/mercury-code-efficiency-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces Mercury, a novel code efficiency benchmark with 1,889 Python tasks and a new Beyond metric; baseline experiments demonstrate the benchmark's utility rather than constituting the primary contribution." +} +\ No newline at end of file diff --git a/papers/mercury-efficiency-benchmark-2024/paper_type.json b/papers/mercury-efficiency-benchmark-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces Mercury, a new code efficiency benchmark comprising 1,889 Python tasks with a novel runtime-percentile-based metric (Beyond), with baseline evaluations as secondary contributions." +} +\ No newline at end of file diff --git a/papers/mergerepair-exploratory-study-2024/paper_type.json b/papers/mergerepair-exploratory-study-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experiments merging task-specific LoRA adapters and reports quantitative results (pass@1/pass@10 improvements) on HumanEvalFix, with primary contribution being experimental findings about adapter merging strategies." +} +\ No newline at end of file diff --git a/papers/meta-secalign-secure-2025/paper_type.json b/papers/meta-secalign-secure-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Fine-tunes Llama-3.3-70B using SecAlign++ DPO and reports quantitative security/utility results on prompt injection benchmarks." 
+} +\ No newline at end of file diff --git a/papers/metacognitive-selfcorrection-multiagent-2025/paper_type.json b/papers/metacognitive-selfcorrection-multiagent-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Introduces the MASC method and validates it through quantitative experiments on six benchmarks, with reported improvements (8.47% AUC-ROC, 1.29% average gains)." +} +\ No newline at end of file diff --git a/papers/metagpt-multi-agent-framework-2023/paper_type.json b/papers/metagpt-multi-agent-framework-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The primary contribution is demonstrating that the MetaGPT framework achieves strong quantitative results through experiments on HumanEval, MBPP, and a custom benchmark, supported by ablation studies—not introducing new benchmarks or proposing a framework without validation." +} +\ No newline at end of file diff --git a/papers/metalearning-transformers-improve-2025/paper_type.json b/papers/metalearning-transformers-improve-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experiments comparing meta-learning training strategies on datasets, reporting quantitative findings about cross-domain generalization, sequential learning, and curriculum effects." +} +\ No newline at end of file diff --git a/papers/metarewarding-language-models-2024/paper_type.json b/papers/metarewarding-language-models-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes Meta-Rewarding method and runs experiments across 4 iterations, reporting quantitative improvements on AlpacaEval 2 (22.9%→39.4%) and Arena-Hard (20.6%→29.1%), with analysis of learned biases." 
+} +\ No newline at end of file diff --git a/papers/method-counteracting-manipulative-2025/paper_type.json b/papers/method-counteracting-manipulative-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes and experimentally validates a Multi-Head DistilBERT classifier for detecting prompt injection attacks, reporting quantitative results (0.99 Recall) against baselines on real-world holdout data." +} +\ No newline at end of file diff --git a/papers/metr-rct-2025/paper_type.json b/papers/metr-rct-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Randomized controlled trial with 16 developers and 246 tasks reporting quantitative productivity measurements as the primary contribution." +} +\ No newline at end of file diff --git a/papers/metrex-benchmark-verilog-2024/paper_type.json b/papers/metrex-benchmark-verilog-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The paper's primary contribution is MetRex, a large-scale benchmark of 25,868 annotated Verilog designs; the empirical work (fine-tuning, baseline comparisons) serves to validate and demonstrate the benchmark's utility." +} +\ No newline at end of file diff --git a/papers/metric-assessment-protocol-2025/paper_type.json b/papers/metric-assessment-protocol-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments evaluating MCQ metrics, reports quantitative results (R² correlations, performance comparisons), and proposes a novel metric with validated performance numbers." 
+} +\ No newline at end of file diff --git a/papers/microsaccadeinspired-probing-positional-2025/paper_type.json b/papers/microsaccadeinspired-probing-positional-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes the MIP method for detecting LLM misbehaviors through positional encoding perturbations and reports quantitative experimental results (AUC scores) across multiple detection tasks and models, with comparison to baselines." +} +\ No newline at end of file diff --git a/papers/mind2web-generalist-agent-2023/paper_type.json b/papers/mind2web-generalist-agent-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces Mind2Web, the first large-scale dataset for web agents, with experimental baselines to validate the benchmark; the dataset itself is the primary novel contribution." +} +\ No newline at end of file diff --git a/papers/minicodeprops-minimal-benchmark-2024/paper_type.json b/papers/minicodeprops-minimal-benchmark-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The primary contribution is introducing miniCodeProps, a new 201-property benchmark in Lean 4 for evaluating neural theorem provers; baseline experiments with GPT-4o and specialized models are secondary to demonstrating the benchmark's design and difficulty characteristics." +} +\ No newline at end of file diff --git a/papers/mitigating-indirect-prompt-2025/paper_type.json b/papers/mitigating-indirect-prompt-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Presents a novel defense framework (IntentGuard) with quantitative experimental results on benchmarks (Mind2Web) showing attack mitigation rates under specific adversarial conditions." 
+} +\ No newline at end of file diff --git a/papers/mixofgranularity-optimize-chunking-2024/paper_type.json b/papers/mixofgranularity-optimize-chunking-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes MoG method for dynamic chunking granularity selection and validates it experimentally with quantitative results across five medical QA benchmarks and multiple LLM backbones." +} +\ No newline at end of file diff --git a/papers/mixrevdetect-detecting-aigenerated-2025/paper_type.json b/papers/mixrevdetect-detecting-aigenerated-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes a detection method (MixRevDetect) and validates it experimentally on a constructed peer review dataset, achieving quantitative results (88.86% F1) and robustness comparisons against baseline detectors." +} +\ No newline at end of file diff --git a/papers/mixtureofmodels-unifying-heterogeneous-2026/paper_type.json b/papers/mixtureofmodels-unifying-heterogeneous-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments on AIME 2025, LiveCodeBench Hard, and DarkBench benchmarks, reports quantitative performance metrics for ensemble methods, and validates an empirical Efficiency-Fatigue model—making experimental findings the primary contribution." +} +\ No newline at end of file diff --git a/papers/mlebench-evaluating-machine-2024/paper_type.json b/papers/mlebench-evaluating-machine-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The paper introduces MLE-bench, a new evaluation framework with 75 Kaggle ML engineering competitions; while it reports experimental results on baseline agents, the primary contribution is the benchmark itself rather than novel findings about agent capabilities." 
+} +\ No newline at end of file diff --git a/papers/mmlucf-contaminationfree-multitask-2024/paper_type.json b/papers/mmlucf-contaminationfree-multitask-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Primary contribution is MMLU-CF, a new contamination-free benchmark with 20,000 questions and decontamination methodology; experiments validate the benchmark rather than being the primary finding." +} +\ No newline at end of file diff --git a/papers/mmrbench-comprehensive-benchmark-2026/paper_type.json b/papers/mmrbench-comprehensive-benchmark-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces MMR-Bench, a new offline benchmark with 11,000 instances for evaluating multimodal LLM routing strategies across 10 models and 7 vision-language benchmarks." +} +\ No newline at end of file diff --git a/papers/mmrlhf-next-step-2025/paper_type.json b/papers/mmrlhf-next-step-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper introduces MM-RLHF as a method with both a dataset and reward modeling approach, with the primary contribution being the experimental demonstration of SOTA performance and substantial alignment improvements across multiple benchmarks." +} +\ No newline at end of file diff --git a/papers/mobilityaware-cache-framework-2026/paper_type.json b/papers/mobilityaware-cache-framework-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes MobCache framework and validates it experimentally on Beijing and NYC datasets, reporting quantitative results (42-52% inference time reduction, 79-93% cost reduction) compared to direct LLM calls." 
+} +\ No newline at end of file diff --git a/papers/moco-onestop-shop-2026/paper_type.json b/papers/moco-onestop-shop-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "Introduces MoCo, a new benchmark framework with 25 datasets for evaluating model collaboration algorithms, serving as a unified resource for the community." +} +\ No newline at end of file diff --git a/papers/model-cascading-code-2024/paper_type.json b/papers/model-cascading-code-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Presents a cascaded multi-model framework and validates it through experiments quantifying cost reduction (26% average, up to 70%) and accuracy improvements on code completion benchmarks." +} +\ No newline at end of file diff --git a/papers/modeldriven-quantum-code-2025/paper_type.json b/papers/modeldriven-quantum-code-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs controlled experiments comparing prompt engineering and RAG configurations for quantum code generation, reporting quantitative results (CodeBLEU scores, precision/recall metrics) as primary contributions." +} +\ No newline at end of file diff --git a/papers/modular-layout-synthesis-2025/paper_type.json b/papers/modular-layout-synthesis-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper presents a framework (MLS) and reports quantitative experimental results comparing it against baselines across multiple metrics, though critically all experiments use synthetically generated data rather than real datasets, substantially undermining the validity of the empirical claims." 
+} +\ No newline at end of file diff --git a/papers/modular-pluralism-pluralistic-2024/paper_type.json b/papers/modular-pluralism-pluralistic-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes Modular Pluralism method and validates it with quantitative experimental results across multiple objectives (Overton, steerable, distributional) showing measurable improvements in value coverage, accuracy, and distributional alignment." +} +\ No newline at end of file diff --git a/papers/monitorguided-decoding-code-2023/paper_type.json b/papers/monitorguided-decoding-code-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Presents a novel Monitor-Guided Decoding method and reports quantitative experimental results showing 21-25% compilation rate improvements across models on the PRAGMATICCODE benchmark." +} +\ No newline at end of file diff --git a/papers/monte-carlo-tree-2026/paper_type.json b/papers/monte-carlo-tree-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs experiments on SWE-bench Lite with quantitative results and ablation studies; primary contribution is the experimental finding that MCTS improves program repair performance." +} +\ No newline at end of file diff --git a/papers/moral-alignment-llm-2024/paper_type.json b/papers/moral-alignment-llm-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs controlled experiments fine-tuning LLM agents with different moral reward schemes via PPO and reports quantitative behavioral results across multiple game-theoretic benchmarks." 
+} +\ No newline at end of file diff --git a/papers/moral-turing-test-2024/paper_type.json b/papers/moral-turing-test-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs a randomized controlled trial (N=230) reporting quantitative findings about LLM behavior in moral framing, human preferences for AI-generated justifications, and detection bias—the primary contribution is experimental results, not the evaluation framework itself." +} +\ No newline at end of file diff --git a/papers/more-bang-buck-2026/paper_type.json b/papers/more-bang-buck-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "theoretical", + "reason": "The primary contribution is a formal analysis using renewal theory to derive optimal strategies and prove that τ=1 is optimal, with experiments serving as validation of the theoretical predictions." +} +\ No newline at end of file diff --git a/papers/more-code-less-2025/paper_type.json b/papers/more-code-less-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts a survey of 868 scientists reporting quantitative adoption patterns and empirical findings about relationships between programming experience, development practices, and perceived productivity with AI tools." +} +\ No newline at end of file diff --git a/papers/more-llm-calls-2024/paper_type.json b/papers/more-llm-calls-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "theoretical", + "reason": "The paper derives analytical scaling models to predict compound AI system performance and explain non-monotonic scaling through formal analysis of query difficulty distributions." 
+} +\ No newline at end of file diff --git a/papers/mose-mixture-slimmable-2026/paper_type.json b/papers/mose-mixture-slimmable-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes MoSE and validates it through experiments on multiple model sizes and datasets, reporting quantitative results (FLOP savings, perplexity) as the primary contribution." +} +\ No newline at end of file diff --git a/papers/mpib-benchmark-medical-2026/paper_type.json b/papers/mpib-benchmark-medical-2026/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "MPIB is a new benchmark for evaluating prompt injection attacks and clinical safety across LLMs; while experiments are run on 12 models, the primary contribution is the evaluation framework and benchmark itself." +} +\ No newline at end of file diff --git a/papers/mtbench101-finegrained-benchmark-2024/paper_type.json b/papers/mtbench101-finegrained-benchmark-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "benchmark-creation", + "reason": "The primary contribution is MT-Bench-101, a new hierarchical multi-turn dialogue benchmark with 1388 dialogues and 4208 turns; experimental evaluation of 21 LLMs is conducted on this benchmark rather than being the main contribution." +} +\ No newline at end of file diff --git a/papers/multi-agent-byzantine-fault-2025/paper_type.json b/papers/multi-agent-byzantine-fault-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes a Byzantine fault tolerance mechanism (CP-WBFT) and validates it through experiments comparing LLM-based agents against traditional agents, reporting quantitative results (85.7% fault rate, 100% accuracy) across network topologies." 
+} +\ No newline at end of file diff --git a/papers/multi-agent-collaboration-survey-2025/paper_type.json b/papers/multi-agent-collaboration-survey-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "survey", + "reason": "The title explicitly identifies it as a survey, the methodology is meta-analysis, and the key findings describe a framework synthesizing existing work on multi-agent collaboration without reporting novel experimental results." +} +\ No newline at end of file diff --git a/papers/multi-agent-defense-prompt-injection-2025/paper_type.json b/papers/multi-agent-defense-prompt-injection-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper proposes defense architectures and reports quantitative experimental results (0% ASR, 20-30% baseline vulnerability) across 400 evaluations, with the primary contribution being the empirical findings about defense effectiveness." +} +\ No newline at end of file diff --git a/papers/multi-agent-trust-paradox-2025/paper_type.json b/papers/multi-agent-trust-paradox-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Empirically validates the Trust-Vulnerability Paradox across 1,488 agent-agent interaction chains with quantitative metrics spanning multiple LLM backends and orchestration frameworks." +} +\ No newline at end of file diff --git a/papers/multi-turn-jailbreak-2025/paper_type.json b/papers/multi-turn-jailbreak-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes an agent-driven jailbreak method and conducts controlled experiments on AdvBench, reporting quantitative attack success rates and comparative analysis against existing techniques, making the primary contribution experimental findings." 
+} +\ No newline at end of file diff --git a/papers/multiagent-codeorchestrated-generation-2025/paper_type.json b/papers/multiagent-codeorchestrated-generation-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes MACOG method and validates it through experiments on IaC-Eval, comparing against baselines and conducting ablations to quantify component contributions across 10 models." +} +\ No newline at end of file diff --git a/papers/multiagent-collaboration-harnessing-2023/paper_type.json b/papers/multiagent-collaboration-harnessing-2023/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "Proposes a formal framework for multi-agent systems without empirical validation or mathematical proofs, primarily contributing a conceptual structure for thinking about agent architectures." +} +\ No newline at end of file diff --git a/papers/multiagent-collaborative-fuzzing-2025/paper_type.json b/papers/multiagent-collaborative-fuzzing-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper presents SmartFuzz, a new LLM-driven system for smart contract vulnerability detection, and validates it through quantitative experiments showing percentage improvements over baselines and real-world detection rates; the primary contribution is experimental findings, not a benchmark dataset itself." +} +\ No newline at end of file diff --git a/papers/multiagent-evolve-llm-2025/paper_type.json b/papers/multiagent-evolve-llm-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Proposes MAE method and validates through experiments on 22 benchmarks with quantitative improvements and ablation studies." 
+} +\ No newline at end of file diff --git a/papers/multiagent-onboarding-assistant-2025/paper_type.json b/papers/multiagent-onboarding-assistant-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts a user study with 8 programmers reporting quantitative results (Likert scale ratings), though the evaluation is limited in rigor (small sample, no baselines, no formal statistics)." +} +\ No newline at end of file diff --git a/papers/multidataset-evaluation-models-2025/paper_type.json b/papers/multidataset-evaluation-models-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Runs comparative experiments evaluating existing models (CodeT5, CodeBERT) on multiple vulnerability repair datasets, reporting quantitative performance metrics across different data conditions and transfer scenarios." +} +\ No newline at end of file diff --git a/papers/multilanguage-perspective-robustness-2025/paper_type.json b/papers/multilanguage-perspective-robustness-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs systematic experiments evaluating LLM code generation robustness across multiple languages and perturbation types, reporting quantitative comparative results as the primary contribution." +} +\ No newline at end of file diff --git a/papers/multilevel-explanations-generative-2024/paper_type.json b/papers/multilevel-explanations-generative-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper extends LIME/SHAP methods and tests multiple instantiations across existing benchmarks (XSUM, CNN/DM, SQuAD) with quantitative evaluation metrics and a user study, making experimental findings the primary contribution." 
+} +\ No newline at end of file diff --git a/papers/multilingual-blending-llm-2024/paper_type.json b/papers/multilingual-blending-llm-2024/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs experiments testing multilingual blending on LLMs and reports quantitative results on safety bypass rates across different language combinations, with the primary contribution being the experimental findings." +} +\ No newline at end of file diff --git a/papers/multimodal-prompt-injection-2025/paper_type.json b/papers/multimodal-prompt-injection-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "The paper runs systematic experiments testing four prompt injection attack types against eight commercial LLMs, reporting quantitative results on which models resist which attacks; the primary contribution is the experimental vulnerability findings." +} +\ No newline at end of file diff --git a/papers/multiple-llm-agents-2025/paper_type.json b/papers/multiple-llm-agents-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "empirical", + "reason": "Conducts experiments on the NORMAD-ETI benchmark to quantify how multi-agent debate improves cultural alignment accuracy, reporting specific performance metrics and comparative results across model combinations." +} +\ No newline at end of file diff --git a/papers/multistakeholder-alignment-llmpowered-2025/paper_type.json b/papers/multistakeholder-alignment-llmpowered-2025/paper_type.json @@ -0,0 +1,4 @@ +{ + "paper_type": "position", + "reason": "Proposes a conceptual governance framework (Advisory Governance Layer) and policy taxonomy for LLM-powered systems with no implementation or empirical validation." +} +\ No newline at end of file diff --git a/scripts/classify-paper-type.py b/scripts/classify-paper-type.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +""" +Preliminary paper type classification using Haiku. 
"""
Preliminary paper type classification using Haiku.

Reads existing scan.json (title + key_findings + methodology_tags) and
classifies into: empirical, benchmark-creation, survey, position, theoretical.

Writes result to papers/{slug}/paper_type.json (separate file, non-destructive).

Usage:
    python3 scripts/classify-paper-type.py                      # All unclassified
    python3 scripts/classify-paper-type.py --limit 50           # First N
    python3 scripts/classify-paper-type.py --parallel 8         # Concurrent (Haiku is fast+cheap)
    python3 scripts/classify-paper-type.py --id metr-rct-2025   # Specific paper
    python3 scripts/classify-paper-type.py --force              # Re-classify all
    python3 scripts/classify-paper-type.py --papers-dir DIR     # Use another papers root
"""

import argparse
import json
import subprocess
import sys
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

ROOT = Path(__file__).resolve().parent.parent
PAPERS_DIR = ROOT / "papers"

PROMPT = """Classify this research paper into exactly ONE category.

Categories:
1. **empirical** — runs experiments, reports quantitative results on benchmarks or datasets. The primary contribution is experimental findings.
2. **benchmark-creation** — introduces a new benchmark, dataset, or evaluation framework. May run baselines, but the primary contribution is the benchmark itself.
3. **survey** — reviews, surveys, or meta-analyzes existing work. Primary contribution is synthesis of the field.
4. **position** — argues a viewpoint, proposes a conceptual framework, or makes prescriptive claims without experimental validation. Includes vision papers and opinion pieces.
5. **theoretical** — proves something mathematically or analyzes properties formally. Primary contribution is theorems, proofs, or formal analysis.

Paper information:
Title: {title}
Methodology tags: {tags}
Key findings: {key_findings}

Respond with ONLY a JSON object:
{{"paper_type": "<one of: empirical, benchmark-creation, survey, position, theoretical>", "reason": "<one sentence>"}}"""

# The only labels accepted from the model; anything else is reported as a failure.
VALID_TYPES = ("empirical", "benchmark-creation", "survey", "position", "theoretical")

# Upper bound applied to key_findings before it is put into the prompt.
MAX_KEY_FINDINGS = 500

# Seconds to wait for one `claude` invocation before giving up on the paper.
MODEL_TIMEOUT_SECONDS = 30


def _load_json(path):
    """Parse *path* as UTF-8 JSON.

    Raises OSError if the file cannot be read and ValueError (which covers
    json.JSONDecodeError and UnicodeDecodeError) if it cannot be parsed.
    """
    # Explicit UTF-8: the files are written with ensure_ascii=False, so relying
    # on the locale's default encoding would make reads platform-dependent.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _scan_is_v2(scan):
    """Return True when the scan dict declares scan_version >= 2 (default is 1)."""
    version = scan.get("scan_version", 1)
    # A missing/garbled version is treated like a v1 scan instead of raising
    # TypeError on the comparison.
    return isinstance(version, (int, float)) and version >= 2


def build_prompt(scan):
    """Render PROMPT for one parsed scan.json dict.

    The scan.json schema is not defined in this file, so every field is read
    defensively: missing or null values become empty text instead of raising.
    """
    paper = scan.get("paper")
    if not isinstance(paper, dict):
        paper = {}

    tags = scan.get("methodology_tags")
    if tags is None:
        tags = []
    elif not isinstance(tags, (list, tuple)):
        # A lone value is treated as a single tag rather than iterated.
        tags = [tags]

    findings = scan.get("key_findings")
    if findings is None:
        findings = ""
    elif not isinstance(findings, (str, list, tuple)):
        findings = str(findings)

    return PROMPT.format(
        title=paper.get("title", ""),
        tags=", ".join(str(tag) for tag in tags),
        # For a string this caps the characters; for a list it caps the items.
        key_findings=findings[:MAX_KEY_FINDINGS],
    )


def parse_response(output):
    """Extract ``(paper_type, reason)`` from raw model output.

    The outermost ``{...}`` span is parsed so that any text the model emits
    around the JSON object is ignored.

    Raises:
        ValueError: no JSON object was found, it did not parse, or its
            paper_type is not one of VALID_TYPES. The message is suitable for
            reporting as the failure reason.
    """
    text = output.strip()
    start = text.find("{")
    end = text.rfind("}") + 1
    if start == -1 or end <= start:
        raise ValueError("no JSON in output")

    try:
        parsed = json.loads(text[start:end])
    except json.JSONDecodeError as exc:
        raise ValueError(f"JSON parse error: {exc}") from exc
    if not isinstance(parsed, dict):
        raise ValueError("no JSON in output")

    paper_type = parsed.get("paper_type", "")
    if isinstance(paper_type, str):
        # Tolerate harmless casing/whitespace differences in the label.
        paper_type = paper_type.strip().lower()
    if paper_type not in VALID_TYPES:
        raise ValueError(f"invalid type: {paper_type}")

    reason = parsed.get("reason", "")
    return paper_type, "" if reason is None else str(reason)


def _write_result(type_path, paper_type, reason):
    """Write the classification to *type_path* as UTF-8 JSON.

    The data goes to a sibling ``.tmp`` file first and is then renamed over the
    target, so an interrupted run does not leave a half-written
    paper_type.json behind.
    """
    tmp_path = type_path.with_name(type_path.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump({"paper_type": paper_type, "reason": reason}, f, ensure_ascii=False, indent=2)
    tmp_path.replace(type_path)


def classify_one(paper_id, force=False, papers_dir=None):
    """Classify one paper and persist the result next to its scan.json.

    Args:
        paper_id: Directory name of the paper under the papers root.
        force: Re-classify even when paper_type.json already exists.
        papers_dir: Papers root to use; defaults to PAPERS_DIR.

    Returns:
        ``(paper_id, paper_type, reason)`` on success, or
        ``(paper_id, None, error)`` when the paper could not be classified.
        An existing classification is returned with the reason
        ``"already classified"``. Unreadable input files, model failures and
        write failures are reported through the return value rather than
        raised, so one bad paper cannot abort a batch.
    """
    base_dir = PAPERS_DIR if papers_dir is None else Path(papers_dir)
    paper_dir = base_dir / paper_id
    scan_path = paper_dir / "scan.json"
    type_path = paper_dir / "paper_type.json"

    if type_path.exists() and not force:
        try:
            existing = _load_json(type_path)
        except (OSError, ValueError) as exc:
            return paper_id, None, f"unreadable paper_type.json: {exc}"
        existing_type = existing.get("paper_type") if isinstance(existing, dict) else None
        return paper_id, existing_type, "already classified"

    if not scan_path.exists():
        return paper_id, None, "no scan.json"

    try:
        scan = _load_json(scan_path)
    except (OSError, ValueError) as exc:
        return paper_id, None, f"unreadable scan.json: {exc}"
    if not isinstance(scan, dict):
        return paper_id, None, "scan.json is not a JSON object"
    if not _scan_is_v2(scan):
        return paper_id, None, "v1 scan"

    prompt = build_prompt(scan)

    try:
        # The prompt is fed on stdin ("-p -"); the command is passed as a list,
        # so no shell is involved.
        result = subprocess.run(
            ["claude", "-p", "-", "--model", "haiku", "--max-turns", "1"],
            input=prompt,
            capture_output=True,
            encoding="utf-8",
            errors="replace",
            timeout=MODEL_TIMEOUT_SECONDS,
            cwd=str(ROOT),
        )
    except subprocess.TimeoutExpired:
        return paper_id, None, "timeout"
    except OSError as exc:
        # Covers a missing `claude` executable, among other launch failures.
        return paper_id, None, f"error: {exc}"

    if result.returncode != 0:
        return paper_id, None, f"claude exit {result.returncode}"

    try:
        paper_type, reason = parse_response(result.stdout)
    except ValueError as exc:
        return paper_id, None, str(exc)

    # Write separate file (non-destructive)
    try:
        _write_result(type_path, paper_type, reason)
    except OSError as exc:
        return paper_id, None, f"error: {exc}"

    return paper_id, paper_type, reason


def _parse_args(argv):
    """Parse the command-line options listed in the module docstring."""
    parser = argparse.ArgumentParser(
        description="Classify scanned papers into one of five paper types using Haiku.",
    )
    parser.add_argument("--limit", type=int, default=None, metavar="N",
                        help="classify at most the first N candidates")
    parser.add_argument("--parallel", type=int, default=1, metavar="N",
                        help="number of concurrent classifications (default: 1)")
    parser.add_argument("--id", dest="paper_id", default=None, metavar="SLUG",
                        help="only consider this paper")
    parser.add_argument("--force", action="store_true",
                        help="re-classify papers that already have paper_type.json")
    parser.add_argument("--papers-dir", default=None, metavar="DIR",
                        help="papers root to scan (default: the repository's papers/ directory)")
    args = parser.parse_args(argv)
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be >= 0")
    return args


def _collect_candidates(papers_dir, specific_id, force):
    """Return the sorted paper ids under *papers_dir* that should be classified.

    Only v2 scans are considered. Papers that already have paper_type.json are
    skipped unless *force* is set or the paper was requested via *specific_id*.
    An unreadable scan.json is reported and skipped instead of aborting.
    """
    candidates = []
    for scan_path in sorted(papers_dir.glob("*/scan.json")):
        pid = scan_path.parent.name
        if specific_id and pid != specific_id:
            continue
        try:
            scan = _load_json(scan_path)
        except (OSError, ValueError) as exc:
            print(f" SKIP: {pid} — unreadable scan.json: {exc}")
            continue
        if not isinstance(scan, dict) or not _scan_is_v2(scan):
            continue
        already_typed = (scan_path.parent / "paper_type.json").exists()
        if already_typed and not force and not specific_id:
            continue
        candidates.append(pid)
    return candidates


def _iter_results(candidates, force, parallel, papers_dir):
    """Yield classify_one() results, concurrently when *parallel* > 1.

    In concurrent mode results arrive in completion order, not input order.
    """
    if parallel > 1:
        with ThreadPoolExecutor(max_workers=parallel) as executor:
            futures = [
                executor.submit(classify_one, pid, force, papers_dir)
                for pid in candidates
            ]
            for future in as_completed(futures):
                yield future.result()
    else:
        for pid in candidates:
            yield classify_one(pid, force, papers_dir)


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.

    Prints every failure as it happens, a progress line every 50 papers, and a
    final summary with the distribution of paper types. Papers that were
    already classified count towards the "Classified" total.
    """
    args = _parse_args(sys.argv[1:] if argv is None else argv)
    papers_dir = PAPERS_DIR if args.papers_dir is None else Path(args.papers_dir)

    candidates = _collect_candidates(papers_dir, args.paper_id, args.force)
    if args.limit is not None:
        candidates = candidates[:args.limit]

    if not candidates:
        print("No papers to classify.")
        return

    parallel = args.parallel
    mode_note = f" (parallel={parallel})" if parallel > 1 else ""
    print(f"Classifying {len(candidates)} papers{mode_note}:\n")

    type_counts = Counter()
    failures = 0
    results = _iter_results(candidates, args.force, parallel, papers_dir)
    for done, (pid, ptype, reason) in enumerate(results, start=1):
        if ptype:
            type_counts[ptype] += 1
        else:
            failures += 1
            print(f" FAIL: {pid} — {reason}")
        if done % 50 == 0:
            print(f" ... {done}/{len(candidates)} done")

    total = sum(type_counts.values())
    print(f"\nDone. Classified: {total}, Failed: {failures}")
    print("Distribution:")
    # total is non-zero whenever this loop body runs, so the division is safe.
    for ptype, count in type_counts.most_common():
        print(f" {ptype:20s} {count:>4d} ({count/total*100:.0f}%)")


if __name__ == "__main__":
    main()