scan-v5.json (25702B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "LimAgents: Multi-Agent LLMs for Generating Research Limitations", 6 "authors": [ 7 "Ibrahim Al Azher", 8 "Zhishuai Guo", 9 "Hamed Alhoori" 10 ], 11 "year": 2025, 12 "venue": "arXiv.org", 13 "arxiv_id": "2601.11578", 14 "doi": "10.48550/arXiv.2601.11578" 15 }, 16 "checklist": { 17 "claims_and_evidence": { 18 "abstract_claims_supported": { 19 "applies": true, 20 "answer": true, 21 "justification": "The +15.51% coverage gain for GPT-4o mini (64.94% vs 49.43% zero-shot) and +4.41% for Llama 3 8B (66.45% vs 62.04%) are directly confirmed by Table I.", 22 "source": "haiku" 23 }, 24 "causal_claims_justified": { 25 "applies": true, 26 "answer": true, 27 "justification": "The paper makes causal claims about agent decomposition improving performance, supported by sequential ablation studies (Section VIII, Tables VI–VII) that add agents one at a time and measure incremental impact.", 28 "source": "haiku" 29 }, 30 "generalization_bounded": { 31 "applies": true, 32 "answer": false, 33 "justification": "The abstract and conclusion present findings broadly ('LLM agents consistently outperform zero-shot prompting') without adequately bounding claims to NeurIPS 2022–2023 papers and two models; the limitations section acknowledges this only after the fact.", 34 "source": "haiku" 35 }, 36 "alternative_explanations_discussed": { 37 "applies": true, 38 "answer": false, 39 "justification": "The paper does not consider alternative explanations for coverage gains, such as whether improvements stem simply from generating more limitations (quantity effect) rather than agent specialization; only the agent-design hypothesis is explored.", 40 "source": "haiku" 41 }, 42 "proxy_outcome_distinction": { 43 "applies": true, 44 "answer": true, 45 "justification": "The paper explicitly distinguishes between Ground Truth Coverage (recall of a reference set) and output quality, and discusses why CGT is preferred over 
BLEU/ROUGE as a proxy for the broader goal of comprehensive limitation generation.", 46 "source": "haiku" 47 } 48 }, 49 "limitations_and_scope": { 50 "limitations_section_present": { 51 "applies": true, 52 "answer": true, 53 "justification": "Section XI 'Limitations and Future Work' is a dedicated section listing specific methodological constraints.", 54 "source": "haiku" 55 }, 56 "threats_to_validity_specific": { 57 "applies": true, 58 "answer": true, 59 "justification": "Specific threats are stated: dataset restricted to NeurIPS, two models only, input truncation for Llama 3 8B, human evaluation confined to extraction task, and prompt incompatibility with DeepSeek.", 60 "source": "haiku" 61 }, 62 "scope_boundaries_stated": { 63 "applies": true, 64 "answer": true, 65 "justification": "The paper explicitly limits scope to NeurIPS 2022–2023 and two models (Llama 3 8B and GPT-4o mini), stating these restrict generalizability.", 66 "source": "haiku" 67 } 68 }, 69 "conflicts_of_interest": { 70 "funding_disclosed": { 71 "applies": true, 72 "answer": false, 73 "justification": "No funding acknowledgment or grant information appears anywhere in the paper.", 74 "source": "haiku" 75 }, 76 "affiliations_disclosed": { 77 "applies": true, 78 "answer": true, 79 "justification": "All three authors list 'Department of Computer Science, Northern Illinois University' with email addresses on the title page.", 80 "source": "haiku" 81 }, 82 "funder_independent_of_outcome": { 83 "applies": false, 84 "answer": false, 85 "justification": "No funding is disclosed, so this criterion is not applicable.", 86 "source": "haiku" 87 }, 88 "financial_interests_declared": { 89 "applies": true, 90 "answer": false, 91 "justification": "No competing interests or financial interests statement is present anywhere in the paper.", 92 "source": "haiku" 93 } 94 }, 95 "scope_and_framing": { 96 "key_terms_defined": { 97 "applies": true, 98 "answer": true, 99 "justification": "'LLM agents' is explicitly 
defined as 'LLM instances configured with specific responsibilities that collaborate within a multi-agent workflow'; Ground Truth Coverage is formally defined with notation in Section V.", 100 "source": "haiku" 101 }, 102 "intended_contribution_clear": { 103 "applies": true, 104 "answer": true, 105 "justification": "The paper clearly states it contributes: (1) the LimAgents multi-agent framework, (2) a dataset of 2,700 NeurIPS papers with limitations, and (3) a pointwise LLM-as-Judge evaluation protocol.", 106 "source": "haiku" 107 }, 108 "engagement_with_prior_work": { 109 "applies": true, 110 "answer": true, 111 "justification": "The related work section engages specifically with OpenReviewer, DeepReview, ReviewRobot, AgentReview, and the BAGELS benchmark, explicitly positioning LimAgents as the first agent-based approach to limitation generation.", 112 "source": "haiku" 113 } 114 } 115 }, 116 "type_checklist": { 117 "empirical": { 118 "artifacts": { 119 "code_released": { 120 "applies": true, 121 "answer": true, 122 "justification": "A GitHub URL is provided in the abstract: https://github.com/IbrahimAlAzhar/LimAgents_limitation_generation_with_LLM_Agents.", 123 "source": "haiku" 124 }, 125 "data_released": { 126 "applies": true, 127 "answer": true, 128 "justification": "A HuggingFace dataset URL is provided in the abstract for the full 51,300-limitation dataset.", 129 "source": "haiku" 130 }, 131 "environment_specified": { 132 "applies": true, 133 "answer": false, 134 "justification": "No requirements.txt, Dockerfile, or dependency specifications are mentioned; tools referenced (ScienceParse, OpenAlex, FAISS, BM25) are not versioned.", 135 "source": "haiku" 136 }, 137 "reproduction_instructions": { 138 "applies": true, 139 "answer": false, 140 "justification": "No step-by-step reproduction instructions are included in the paper; code is released but pipeline setup is not described in actionable detail.", 141 "source": "haiku" 142 } 143 }, 144 
"statistical_methodology": { 145 "confidence_intervals_or_error_bars": { 146 "applies": true, 147 "answer": false, 148 "justification": "All results in Tables I–VII are point estimates; no confidence intervals or error bars are reported for any comparison.", 149 "source": "haiku" 150 }, 151 "significance_tests": { 152 "applies": true, 153 "answer": false, 154 "justification": "No statistical significance tests are used; comparative claims (e.g., +15.51% coverage improvement) are made without p-values or hypothesis testing.", 155 "source": "haiku" 156 }, 157 "effect_sizes_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "Percentage improvements with baseline context are reported throughout (e.g., +15.51%, +4.41%, -12.62 CGT from feedback), providing interpretable effect sizes.", 161 "source": "haiku" 162 }, 163 "sample_size_justified": { 164 "applies": true, 165 "answer": false, 166 "justification": "The 2,700-paper corpus and 500-sample human evaluation are not accompanied by power analysis or justification for adequacy.", 167 "source": "haiku" 168 }, 169 "variance_reported": { 170 "applies": true, 171 "answer": false, 172 "justification": "No standard deviations, variance, or results across multiple runs are reported; each configuration appears to have been run once.", 173 "source": "haiku" 174 } 175 }, 176 "evaluation_design": { 177 "baselines_included": { 178 "applies": true, 179 "answer": true, 180 "justification": "Zero-shot baselines for both Llama 3 8B and GPT-4o mini are included in Table I for direct comparison.", 181 "source": "haiku" 182 }, 183 "baselines_contemporary": { 184 "applies": true, 185 "answer": true, 186 "justification": "GPT-4o mini and Llama 3 8B are contemporary (2024) zero-shot baselines; the paper also references BAGELS [1] as a related benchmark baseline.", 187 "source": "haiku" 188 }, 189 "ablation_study": { 190 "applies": true, 191 "answer": true, 192 "justification": "Section VIII provides a comprehensive 
ablation study testing citation agent context, agent quantity, input granularity, and sequential agent contributions with detailed tables.", 193 "source": "haiku" 194 }, 195 "multiple_metrics": { 196 "applies": true, 197 "answer": true, 198 "justification": "Five metrics are used: Ground Truth Coverage (CGT), ROUGE-L, BLEU, Cosine Similarity, and Jaccard Similarity.", 199 "source": "haiku" 200 }, 201 "human_evaluation": { 202 "applies": true, 203 "answer": true, 204 "justification": "Three graduate students with ML/NLP expertise evaluated 500 samples for extraction faithfulness, and two independent annotators validated LLM-as-Judge agreement (0.98 and 0.95).", 205 "source": "haiku" 206 }, 207 "held_out_test_set": { 208 "applies": false, 209 "answer": false, 210 "justification": "The framework uses pre-trained LLMs in inference mode with no training step, so a held-out test set distinction is not applicable.", 211 "source": "haiku" 212 }, 213 "per_category_breakdown": { 214 "applies": true, 215 "answer": true, 216 "justification": "Results are broken down per model, per agent configuration, and per individual agent (Tables IV, V, VI, VII) with separate quality metrics per agent.", 217 "source": "haiku" 218 }, 219 "failure_cases_discussed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Gemini 1.5 Flash's failure in the Extractor role, DeepSeek's prompt incompatibility, Graph Agent's 50-point performance collapse, and Llama 3 1B's zero-shot failure are all discussed.", 223 "source": "haiku" 224 }, 225 "negative_results_reported": { 226 "applies": true, 227 "answer": true, 228 "justification": "Feedback reducing CGT by 12.62 points for Llama 3 8B, the 9-agent configuration underperforming 3-agent by 16.64 points, and second-round feedback degrading GPT-4o mini results are all reported.", 229 "source": "haiku" 230 } 231 }, 232 "setup_transparency": { 233 "model_versions_specified": { 234 "applies": true, 235 "answer": false, 236 "justification": 
"'GPT-4o mini' and 'Llama 3 8B' are named without snapshot/API version dates; 'DeepSeek R1 Qwen Distil' is partially versioned but the GPT-4o mini snapshot used is unspecified.", 237 "source": "haiku" 238 }, 239 "prompts_provided": { 240 "applies": true, 241 "answer": true, 242 "justification": "Full prompt text for seven agent roles (Extractor, Analyzer, Reviewer, Citation, Master, Judge, Evaluation) is provided in Figures 3–9 of the appendix.", 243 "source": "haiku" 244 }, 245 "hyperparameters_reported": { 246 "applies": true, 247 "answer": false, 248 "justification": "Temperature is mentioned only for the data extraction step ('zero temperature for consistency'); temperature and other hyperparameters for the main agent runs are not reported.", 249 "source": "haiku" 250 }, 251 "scaffolding_described": { 252 "applies": true, 253 "answer": true, 254 "justification": "The multi-agent workflow is described in detail in Section IV with a framework diagram (Figure 1), including the sequential/parallel execution, Self-Feedback Agent threshold (8/10), and Master Agent consolidation steps.", 255 "source": "haiku" 256 }, 257 "data_preprocessing_documented": { 258 "applies": true, 259 "answer": true, 260 "justification": "Section III documents the full preprocessing pipeline: ScienceParse for text extraction, keyword scanning for limitations, LLM refinement, Selenium-based OpenReview scraping, and LLM-based deduplication merger.", 261 "source": "haiku" 262 } 263 }, 264 "data_integrity": { 265 "raw_data_available": { 266 "applies": true, 267 "answer": true, 268 "justification": "The 51,300-limitation dataset is released on HuggingFace with a direct URL in the abstract.", 269 "source": "haiku" 270 }, 271 "data_collection_described": { 272 "applies": true, 273 "answer": true, 274 "justification": "Section III describes NeurIPS 2022–2023 corpus assembly via ScienceParse, OpenReview scraping, two-stream extraction for author vs. 
reviewer limitations, and the LLM merger step.", 275 "source": "haiku" 276 }, 277 "recruitment_methods_described": { 278 "applies": true, 279 "answer": true, 280 "justification": "Three graduate students with ML/NLP expertise were recruited for human evaluation; two independent annotators validated LLM-as-Judge outputs on 100 random pairs.", 281 "source": "haiku" 282 }, 283 "data_pipeline_documented": { 284 "applies": true, 285 "answer": true, 286 "justification": "The full pipeline from PDF collection through ScienceParse parsing, limitation extraction, OpenReview scraping, LLM refinement, and ground-truth construction is documented in Section III.", 287 "source": "haiku" 288 } 289 }, 290 "contamination": { 291 "training_cutoff_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "Training data cutoffs for GPT-4o mini and Llama 3 8B are not stated, despite the evaluation corpus consisting of NeurIPS 2022–2023 papers that fall within those training windows.", 295 "source": "haiku" 296 }, 297 "train_test_overlap_discussed": { 298 "applies": true, 299 "answer": false, 300 "justification": "The possibility that NeurIPS 2022–2023 papers and their OpenReview comments appeared in the LLMs' training data is never discussed, which is a meaningful threat to evaluation validity.", 301 "source": "haiku" 302 }, 303 "benchmark_contamination_addressed": { 304 "applies": true, 305 "answer": false, 306 "justification": "NeurIPS 2022–2023 papers are clearly within the training window of GPT-4o mini and Llama 3 8B; the models may have memorized limitations or review comments from these papers, which is not addressed.", 307 "source": "haiku" 308 } 309 }, 310 "human_studies": { 311 "pre_registered": { 312 "applies": false, 313 "answer": false, 314 "justification": "Human annotators are used for validation tasks, not as study participants in a human subjects research sense; pre-registration is not applicable.", 315 "source": "haiku" 316 }, 317 
"irb_or_ethics_approval": { 318 "applies": false, 319 "answer": false, 320 "justification": "Graduate student annotators performing NLP annotation tasks do not require IRB approval under standard research protocols.", 321 "source": "haiku" 322 }, 323 "demographics_reported": { 324 "applies": false, 325 "answer": false, 326 "justification": "Not a human subjects study; annotator demographics beyond 'graduate students with ML/NLP expertise' are not required.", 327 "source": "haiku" 328 }, 329 "inclusion_exclusion_criteria": { 330 "applies": false, 331 "answer": false, 332 "justification": "Not applicable for annotation tasks performed by expert graduate students.", 333 "source": "haiku" 334 }, 335 "randomization_described": { 336 "applies": false, 337 "answer": false, 338 "justification": "Not applicable; random sampling of 500 evaluation examples is noted but this is not a randomized controlled trial.", 339 "source": "haiku" 340 }, 341 "blinding_described": { 342 "applies": false, 343 "answer": false, 344 "justification": "Not applicable to this annotation validation task.", 345 "source": "haiku" 346 }, 347 "attrition_reported": { 348 "applies": false, 349 "answer": false, 350 "justification": "Not applicable; no attrition risk in annotation tasks.", 351 "source": "haiku" 352 } 353 }, 354 "cost_and_practicality": { 355 "inference_cost_reported": { 356 "applies": true, 357 "answer": false, 358 "justification": "The paper claims the framework avoids 'significant computational expense' but provides no actual cost or latency figures for running the multi-agent pipeline on 2,700 papers.", 359 "source": "haiku" 360 }, 361 "compute_budget_stated": { 362 "applies": true, 363 "answer": false, 364 "justification": "No total compute budget, API call counts, or wall-clock time estimates are stated for the experiments.", 365 "source": "haiku" 366 } 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "RAG + multi-agent GPT-4o mini achieves +15.51% Ground Truth Coverage gain 
over zero-shot baseline", 373 "evidence": "Table I: GPT-4o mini 4-agent 64.94% vs zero-shot 49.43% on CGT metric across 2,700 NeurIPS papers", 374 "supported": "strong" 375 }, 376 { 377 "claim": "Llama 3 8B multi-agent (3 agents) achieves +4.41% improvement over its zero-shot baseline", 378 "evidence": "Table I: Llama 3 8B 3-agent 66.45% vs zero-shot 62.04%; confirmed by secondary Full Text Coverage metric in Table III", 379 "supported": "strong" 380 }, 381 { 382 "claim": "Optimal agent configuration is model-dependent: smaller models perform best with 3 agents, larger models with 4", 383 "evidence": "Tables I and VII show 3-agent Llama 3 8B is optimal while Citation Agent hurts it (-12.61 CGT when added), but GPT-4o mini benefits from the Citation Agent (+7.09 points)", 384 "supported": "moderate" 385 }, 386 { 387 "claim": "Self-feedback refinement trades breadth (coverage) for depth (quality): feedback decreases CGT by 12.62 for Llama 3 8B while improving LLM-generated coverage", 388 "evidence": "Table II: CGT drops from 66.45 to 53.83 with feedback while CLLM improves from 36.59 to 44.77", 389 "supported": "strong" 390 }, 391 { 392 "claim": "Pointwise LLM-as-Judge evaluation is more reliable than traditional NLP metrics for measuring limitation coverage", 393 "evidence": "Human annotator agreement with LLM judge scores of 0.98 and 0.95 on 100 random pairs; ROUGE/cosine fail on valid paraphrasing as argued in Section V", 394 "supported": "moderate" 395 }, 396 { 397 "claim": "3-agent Llama 3 8B outperforms zero-shot GPT-4o mini by +17.02 coverage points at lower cost", 398 "evidence": "Table I: Llama 3 8B 3-agent 66.45% vs GPT-4o mini zero-shot 49.43%; cost efficiency claimed but not quantified", 399 "supported": "moderate" 400 } 401 ], 402 "methodology_tags": [ 403 "benchmark-eval", 404 "empirical" 405 ], 406 "key_findings": "LimAgents demonstrates that decomposing scientific limitation generation into specialized LLM agents (Extractor, Analyzer, Reviewer, 
Citation) substantially outperforms zero-shot prompting, with optimal configurations yielding +4.41% (Llama 3 8B) and +15.51% (GPT-4o mini) Ground Truth Coverage gains. The optimal agent count depends on model capacity: smaller models are overwhelmed by diverse cited-paper context, while larger models leverage it effectively. A fundamental quality-breadth trade-off is identified: iterative self-feedback produces more polished limitations but reduces coverage, with diminishing returns after one feedback round. The paper introduces a pointwise LLM-as-Judge evaluation protocol validated by human annotators (agreement 0.95–0.98) as superior to n-gram metrics for this task.", 407 "red_flags": [ 408 { 409 "flag": "Contamination unaddressed", 410 "detail": "NeurIPS 2022–2023 papers and their OpenReview comments are almost certainly in GPT-4o mini and Llama 3 8B training data; model knowledge of the papers' actual limitations is never discussed and could inflate all performance metrics." 411 }, 412 { 413 "flag": "No statistical significance testing", 414 "detail": "All comparisons between configurations are point estimates without confidence intervals, error bars, or significance tests; it is unclear whether +4.41 or similar gains are reliable given single-run measurements." 415 }, 416 { 417 "flag": "No variance across runs", 418 "detail": "Every configuration appears to have been executed once; LLM-based pipelines have non-negligible output variance, and no repeated runs or standard deviations are reported." 419 }, 420 { 421 "flag": "Circular evaluation", 422 "detail": "GPT-4o mini serves simultaneously as the Judge agent, the Evaluation agent, and one of the evaluated agent models; this creates a self-favorability risk in quality scoring." 
423 }, 424 { 425 "flag": "Incomplete ground truth by design", 426 "detail": "The authors acknowledge the ground truth (author + OpenReview limitations) is 'inherently incomplete,' which means CGT is measuring recall against an unknown fraction of true limitations, making absolute numbers uninterpretable." 427 }, 428 { 429 "flag": "Funding not disclosed", 430 "detail": "No funding source or competing interests statement is present, which is non-standard for published academic work." 431 } 432 ], 433 "cited_papers": [ 434 { 435 "title": "BAGELS: Benchmarking the Automated Generation and Extraction of Limitations from Scholarly Text", 436 "relevance": "Direct predecessor benchmark for the same task; provides the baseline evaluation framework and zero-shot results that LimAgents builds upon" 437 }, 438 { 439 "title": "Are we there yet? Revealing the risks of utilizing large language models in scholarly peer review", 440 "relevance": "Documents LLM peer review failures (hallucination, institution bias) that motivate the structured agent approach" 441 }, 442 { 443 "title": "AgentReview: Exploring peer review dynamics with LLM agents", 444 "relevance": "Prior multi-agent peer review system modeling reviewer/author/area-chair roles; direct methodological predecessor" 445 }, 446 { 447 "title": "OpenReviewer: A specialized large language model for generating critical scientific paper reviews", 448 "relevance": "Fine-tuned LLM baseline for structured review generation, compared as an alternative approach" 449 }, 450 { 451 "title": "DeepReview: Improving LLM-based paper review with human-like deep thinking process", 452 "relevance": "Multi-stage reasoning framework for reducing hallucinations in LLM reviews, related system" 453 }, 454 { 455 "title": "Can large language models provide useful feedback on research papers? 
A large-scale empirical analysis", 456 "relevance": "Large-scale evaluation of LLM feedback quality vs humans; establishes context for LLM review capabilities" 457 }, 458 { 459 "title": "Why do multi-agent LLM systems fail?", 460 "relevance": "Catalogs failure modes of multi-agent LLM systems including specification issues and inter-agent misalignment" 461 }, 462 { 463 "title": "LimTopic: LLM-based topic modeling and text summarization for analyzing scientific articles limitations", 464 "relevance": "Prior work by same first author on limitation analysis; baseline for LLM-based limitation discovery" 465 } 466 ], 467 "engagement_factors": { 468 "practical_relevance": { 469 "score": 2, 470 "justification": "Directly addresses a real pain point in peer review and manuscript preparation; code and dataset are released for practitioners to use." 471 }, 472 "surprise_contrarian": { 473 "score": 1, 474 "justification": "Multi-agent outperforming zero-shot is expected; the interesting but unsurprising finding is the quality-breadth trade-off from feedback loops." 475 }, 476 "fear_safety": { 477 "score": 0, 478 "justification": "No AI safety or risk concerns; the application is benign scientific writing assistance." 479 }, 480 "drama_conflict": { 481 "score": 1, 482 "justification": "Challenges the adequacy of BLEU/ROUGE for evaluating generation quality, which is a recurring debate in NLP evaluation." 483 }, 484 "demo_ability": { 485 "score": 2, 486 "justification": "Code is released on GitHub and the dataset is on HuggingFace; a practitioner could run the pipeline on their own paper." 487 }, 488 "brand_recognition": { 489 "score": 0, 490 "justification": "Northern Illinois University is not a high-profile AI lab; no industry collaboration mentioned." 
491 } 492 }, 493 "hn_data": { 494 "threads": [ 495 { 496 "hn_id": "46806618", 497 "title": "ARM MTE Performance in Practice (Extended Version)", 498 "points": 3, 499 "comments": 0, 500 "url": "https://news.ycombinator.com/item?id=46806618", 501 "created_at": "2026-01-29T06:39:14Z" 502 }, 503 { 504 "hn_id": "46977450", 505 "title": "ARM MTE Performance in Practice (Extended Version)", 506 "points": 2, 507 "comments": 0, 508 "url": "https://news.ycombinator.com/item?id=46977450", 509 "created_at": "2026-02-11T16:57:57Z" 510 } 511 ], 512 "top_points": 3, 513 "total_points": 5, 514 "total_comments": 0 515 } 516 }