scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25702B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "LimAgents: Multi-Agent LLMs for Generating Research Limitations",
      6     "authors": [
      7       "Ibrahim Al Azher",
      8       "Zhishuai Guo",
      9       "Hamed Alhoori"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2601.11578",
     14     "doi": "10.48550/arXiv.2601.11578"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The +15.51 percentage-point coverage gain for GPT-4o mini (64.94% vs 49.43% zero-shot) and the +4.41 percentage-point gain for Llama 3 8B (66.45% vs 62.04%) are directly confirmed by Table I.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper makes causal claims about agent decomposition improving performance, supported by sequential ablation studies (Section VIII, Tables VI–VII) that add agents one at a time and measure incremental impact.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The abstract and conclusion present findings broadly ('LLM agents consistently outperform zero-shot prompting') without adequately bounding claims to NeurIPS 2022–2023 papers and two models; the limitations section acknowledges this only after the fact.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper does not consider alternative explanations for coverage gains, such as whether improvements stem simply from generating more limitations (quantity effect) rather than agent specialization; only the agent-design hypothesis is explored.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper explicitly distinguishes between Ground Truth Coverage (recall of a reference set) and output quality, and discusses why CGT is preferred over BLEU/ROUGE as a proxy for the broader goal of comprehensive limitation generation.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section XI 'Limitations and Future Work' is a dedicated section listing specific methodological constraints.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Specific threats are stated: dataset restricted to NeurIPS, two models only, input truncation for Llama 3 8B, human evaluation confined to extraction task, and prompt incompatibility with DeepSeek.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The paper explicitly limits scope to NeurIPS 2022–2023 and two models (Llama 3 8B and GPT-4o mini), stating these restrict generalizability.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding acknowledgment or grant information appears anywhere in the paper.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors list 'Department of Computer Science, Northern Illinois University' with email addresses on the title page.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No funding is disclosed, so this criterion is not applicable.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial interests statement is present anywhere in the paper.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "'LLM agents' is explicitly defined as 'LLM instances configured with specific responsibilities that collaborate within a multi-agent workflow'; Ground Truth Coverage is formally defined with notation in Section V.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper clearly states it contributes: (1) the LimAgents multi-agent framework, (2) a dataset of 2,700 NeurIPS papers with limitations, and (3) a pointwise LLM-as-Judge evaluation protocol.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The related work section engages specifically with OpenReviewer, DeepReview, ReviewRobot, AgentReview, and the BAGELS benchmark, explicitly positioning LimAgents as the first agent-based approach to limitation generation.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "A GitHub URL is provided in the abstract: https://github.com/IbrahimAlAzhar/LimAgents_limitation_generation_with_LLM_Agents.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "A HuggingFace dataset URL is provided in the abstract for the full 51,300-limitation dataset.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "No requirements.txt, Dockerfile, or dependency specifications are mentioned; tools referenced (ScienceParse, OpenAlex, FAISS, BM25) are not versioned.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No step-by-step reproduction instructions are included in the paper; code is released but pipeline setup is not described in actionable detail.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "All results in Tables I–VII are point estimates; no confidence intervals or error bars are reported for any comparison.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No statistical significance tests are used; comparative claims (e.g., +15.51% coverage improvement) are made without p-values or hypothesis testing.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Percentage improvements with baseline context are reported throughout (e.g., +15.51%, +4.41%, -12.62 CGT from feedback), providing interpretable effect sizes.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The 2,700-paper corpus and 500-sample human evaluation are not accompanied by power analysis or justification for adequacy.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No standard deviations, variance, or results across multiple runs are reported; each configuration appears to have been run once.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Zero-shot baselines for both Llama 3 8B and GPT-4o mini are included in Table I for direct comparison.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "GPT-4o mini and Llama 3 8B are contemporary (2024) zero-shot baselines; the paper also references BAGELS [1] as a related benchmark baseline.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Section VIII provides a comprehensive ablation study testing citation agent context, agent quantity, input granularity, and sequential agent contributions with detailed tables.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Five metrics are used: Ground Truth Coverage (CGT), ROUGE-L, BLEU, Cosine Similarity, and Jaccard Similarity.",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "Three graduate students with ML/NLP expertise evaluated 500 samples for extraction faithfulness, and two independent annotators validated LLM-as-Judge agreement (0.98 and 0.95).",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": false,
    209           "answer": false,
    210           "justification": "The framework uses pre-trained LLMs in inference mode with no training step, so a held-out test set distinction is not applicable.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Results are broken down per model, per agent configuration, and per individual agent (Tables IV, V, VI, VII) with separate quality metrics per agent.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Gemini 1.5 Flash's failure in the Extractor role, DeepSeek's prompt incompatibility, Graph Agent's 50-point performance collapse, and Llama 3 1B's zero-shot failure are all discussed.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Feedback reducing CGT by 12.62 points for Llama 3 8B, the 9-agent configuration underperforming 3-agent by 16.64 points, and second-round feedback degrading GPT-4o mini results are all reported.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": false,
    236           "justification": "'GPT-4o mini' and 'Llama 3 8B' are named without snapshot/API version dates; 'DeepSeek R1 Qwen Distil' is partially versioned but the GPT-4o mini snapshot used is unspecified.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Full prompt text for all seven agents (Extractor, Analyzer, Reviewer, Citation, Master, Judge, Evaluation) is provided in Figures 3–9 of the appendix.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "Temperature is mentioned only for the data extraction step ('zero temperature for consistency'); temperature and other hyperparameters for the main agent runs are not reported.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "The multi-agent workflow is described in detail in Section IV with a framework diagram (Figure 1), including the sequential/parallel execution, Self-Feedback Agent threshold (8/10), and Master Agent consolidation steps.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Section III documents the full preprocessing pipeline: ScienceParse for text extraction, keyword scanning for limitations, LLM refinement, Selenium-based OpenReview scraping, and LLM-based deduplication merger.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "The 51,300-limitation dataset is released on HuggingFace with a direct URL in the abstract.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Section III describes NeurIPS 2022–2023 corpus assembly via ScienceParse, OpenReview scraping, two-stream extraction for author vs. reviewer limitations, and the LLM merger step.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": true,
    279           "answer": true,
    280           "justification": "Three graduate students with ML/NLP expertise were recruited for human evaluation; two independent annotators validated LLM-as-Judge outputs on 100 random pairs.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The full pipeline from PDF collection through ScienceParse parsing, limitation extraction, OpenReview scraping, LLM refinement, and ground-truth construction is documented in Section III.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "Training data cutoffs for GPT-4o mini and Llama 3 8B are not stated, despite the evaluation corpus consisting of NeurIPS 2022–2023 papers that fall within those training windows.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "The possibility that NeurIPS 2022–2023 papers and their OpenReview comments appeared in the LLMs' training data is never discussed, which is a meaningful threat to evaluation validity.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "NeurIPS 2022–2023 papers are clearly within the training window of GPT-4o mini and Llama 3 8B; the models may have memorized limitations or review comments from these papers, which is not addressed.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "Human annotators are used for validation tasks, not as study participants in a human subjects research sense; pre-registration is not applicable.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "Graduate student annotators performing NLP annotation tasks do not require IRB approval under standard research protocols.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "Not a human subjects study; annotator demographics beyond 'graduate students with ML/NLP expertise' are not required.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "Not applicable for annotation tasks performed by expert graduate students.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "Not applicable; random sampling of 500 evaluation examples is noted but this is not a randomized controlled trial.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "Not applicable to this annotation validation task.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "Not applicable; no attrition risk in annotation tasks.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "The paper claims the framework avoids 'significant computational expense' but provides no actual cost or latency figures for running the multi-agent pipeline on 2,700 papers.",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "No total compute budget, API call counts, or wall-clock time estimates are stated for the experiments.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "RAG + multi-agent GPT-4o mini achieves +15.51% Ground Truth Coverage gain over zero-shot baseline",
    373       "evidence": "Table I: GPT-4o mini 4-agent 64.94% vs zero-shot 49.43% on CGT metric across 2,700 NeurIPS papers",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Llama 3 8B multi-agent (3 agents) achieves +4.41% improvement over its zero-shot baseline",
    378       "evidence": "Table I: Llama 3 8B 3-agent 66.45% vs zero-shot 62.04%; confirmed by secondary Full Text Coverage metric in Table III",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Optimal agent configuration is model-dependent: smaller models perform best with 3 agents, larger models with 4",
    383       "evidence": "Tables I and VII show 3-agent Llama 3 8B is optimal while Citation Agent hurts it (-12.61 CGT when added), but GPT-4o mini benefits from the Citation Agent (+7.09 points)",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Self-feedback refinement trades breadth (coverage) for depth (quality): feedback decreases CGT by 12.62 for Llama 3 8B while improving LLM-generated coverage",
    388       "evidence": "Table II: CGT drops from 66.45 to 53.83 with feedback while CLLM improves from 36.59 to 44.77",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Pointwise LLM-as-Judge evaluation is more reliable than traditional NLP metrics for measuring limitation coverage",
    393       "evidence": "Human annotator agreement with LLM judge scores of 0.98 and 0.95 on 100 random pairs; ROUGE/cosine fail on valid paraphrasing as argued in Section V",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "3-agent Llama 3 8B outperforms zero-shot GPT-4o mini by +17.02 coverage points at lower cost",
    398       "evidence": "Table I: Llama 3 8B 3-agent 66.45% vs GPT-4o mini zero-shot 49.43%; cost efficiency claimed but not quantified",
    399       "supported": "moderate"
    400     }
    401   ],
    402   "methodology_tags": [
    403     "benchmark-eval",
    404     "empirical"
    405   ],
    406   "key_findings": "LimAgents demonstrates that decomposing scientific limitation generation into specialized LLM agents (Extractor, Analyzer, Reviewer, Citation) substantially outperforms zero-shot prompting, with optimal configurations yielding +4.41% (Llama 3 8B) and +15.51% (GPT-4o mini) Ground Truth Coverage gains. The optimal agent count depends on model capacity: smaller models are overwhelmed by diverse cited-paper context, while larger models leverage it effectively. A fundamental quality-breadth trade-off is identified: iterative self-feedback produces more polished limitations but reduces coverage, with diminishing returns after one feedback round. The paper introduces a pointwise LLM-as-Judge evaluation protocol validated by human annotators (agreement 0.95–0.98) as superior to n-gram metrics for this task.",
    407   "red_flags": [
    408     {
    409       "flag": "Contamination unaddressed",
    410       "detail": "NeurIPS 2022–2023 papers and their OpenReview comments are almost certainly in GPT-4o mini and Llama 3 8B training data; model knowledge of the papers' actual limitations is never discussed and could inflate all performance metrics."
    411     },
    412     {
    413       "flag": "No statistical significance testing",
    414       "detail": "All comparisons between configurations are point estimates without confidence intervals, error bars, or significance tests; it is unclear whether +4.41 or similar gains are reliable given single-run measurements."
    415     },
    416     {
    417       "flag": "No variance across runs",
    418       "detail": "Every configuration appears to have been executed once; LLM-based pipelines have non-negligible output variance, and no repeated runs or standard deviations are reported."
    419     },
    420     {
    421       "flag": "Circular evaluation",
    422       "detail": "GPT-4o mini serves simultaneously as the Judge agent, the Evaluation agent, and one of the evaluated agent models; this creates a risk of self-preference bias in quality scoring."
    423     },
    424     {
    425       "flag": "Incomplete ground truth by design",
    426       "detail": "The authors acknowledge the ground truth (author + OpenReview limitations) is 'inherently incomplete,' which means CGT is measuring recall against an unknown fraction of true limitations, making absolute numbers uninterpretable."
    427     },
    428     {
    429       "flag": "Funding not disclosed",
    430       "detail": "No funding source or competing interests statement is present, which is non-standard for published academic work."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "BAGELS: Benchmarking the Automated Generation and Extraction of Limitations from Scholarly Text",
    436       "relevance": "Direct predecessor benchmark for the same task; provides the baseline evaluation framework and zero-shot results that LimAgents builds upon"
    437     },
    438     {
    439       "title": "Are we there yet? Revealing the risks of utilizing large language models in scholarly peer review",
    440       "relevance": "Documents LLM peer review failures (hallucination, institution bias) that motivate the structured agent approach"
    441     },
    442     {
    443       "title": "AgentReview: Exploring peer review dynamics with LLM agents",
    444       "relevance": "Prior multi-agent peer review system modeling reviewer/author/area-chair roles; direct methodological predecessor"
    445     },
    446     {
    447       "title": "OpenReviewer: A specialized large language model for generating critical scientific paper reviews",
    448       "relevance": "Fine-tuned LLM baseline for structured review generation, compared as an alternative approach"
    449     },
    450     {
    451       "title": "DeepReview: Improving LLM-based paper review with human-like deep thinking process",
    452       "relevance": "Multi-stage reasoning framework for reducing hallucinations in LLM reviews, related system"
    453     },
    454     {
    455       "title": "Can large language models provide useful feedback on research papers? A large-scale empirical analysis",
    456       "relevance": "Large-scale evaluation of LLM feedback quality vs humans; establishes context for LLM review capabilities"
    457     },
    458     {
    459       "title": "Why do multi-agent LLM systems fail?",
    460       "relevance": "Catalogs failure modes of multi-agent LLM systems including specification issues and inter-agent misalignment"
    461     },
    462     {
    463       "title": "LimTopic: LLM-based topic modeling and text summarization for analyzing scientific articles limitations",
    464       "relevance": "Prior work by same first author on limitation analysis; baseline for LLM-based limitation discovery"
    465     }
    466   ],
    467   "engagement_factors": {
    468     "practical_relevance": {
    469       "score": 2,
    470       "justification": "Directly addresses a real pain point in peer review and manuscript preparation; code and dataset are released for practitioners to use."
    471     },
    472     "surprise_contrarian": {
    473       "score": 1,
    474       "justification": "Multi-agent outperforming zero-shot is expected; the interesting but unsurprising finding is the quality-breadth trade-off from feedback loops."
    475     },
    476     "fear_safety": {
    477       "score": 0,
    478       "justification": "No AI safety or risk concerns; the application is benign scientific writing assistance."
    479     },
    480     "drama_conflict": {
    481       "score": 1,
    482       "justification": "Challenges the adequacy of BLEU/ROUGE for evaluating generation quality, which is a recurring debate in NLP evaluation."
    483     },
    484     "demo_ability": {
    485       "score": 2,
    486       "justification": "Code is released on GitHub and the dataset is on HuggingFace; a practitioner could run the pipeline on their own paper."
    487     },
    488     "brand_recognition": {
    489       "score": 0,
    490       "justification": "Northern Illinois University is not a high-profile AI lab; no industry collaboration mentioned."
    491     }
    492   },
    493   "hn_data": {
    494     "threads": [
    495       {
    496         "hn_id": "46806618",
    497         "title": "ARM MTE Performance in Practice (Extended Version)",
    498         "points": 3,
    499         "comments": 0,
    500         "url": "https://news.ycombinator.com/item?id=46806618",
    501         "created_at": "2026-01-29T06:39:14Z"
    502       },
    503       {
    504         "hn_id": "46977450",
    505         "title": "ARM MTE Performance in Practice (Extended Version)",
    506         "points": 2,
    507         "comments": 0,
    508         "url": "https://news.ycombinator.com/item?id=46977450",
    509         "created_at": "2026-02-11T16:57:57Z"
    510       }
    511     ],
    512     "top_points": 3,
    513     "total_points": 5,
    514     "total_comments": 0
    515   }
    516 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs