scan.json (23647B)
1 { 2 "paper": { 3 "title": "Towards Temporal Knowledge-Base Creation for Fine-Grained Opinion Analysis with Language Models", 4 "authors": ["Gaurav Negi", "Atul Kr. Ojha", "Omnia Zayed", "Paul Buitelaar"], 5 "year": 2025, 6 "venue": "KBC-LM @ ISWC 2025", 7 "arxiv_id": "2509.02363", 8 "doi": "10.48550/arXiv.2509.02363" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "The paper proposes a DSPy-based LLM annotation pipeline for creating temporal opinion knowledge bases using three fine-grained opinion data models (ACOS, SSA, UOC). Best F1 scores on human-annotated test sets are 59.92 (ACOS), 45.91 (SSA), and 58.17 (UOC) using 8B parameter models. Inter-LLM annotation agreement varies widely across opinion components, with classification labels (sentiment polarity) showing higher agreement than span-based elements (entity, aspect terms).", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "A GitHub URL is provided for the annotated dataset: https://github.com/ANON-1221/KBC-LM-Temporal-Opinions. However, this appears to be an anonymized URL, which may not resolve. The paper states 'The annotated dataset is available publicly' at that URL." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper uses publicly available datasets (ACOS, SSA/SemEval 2022 Task 10, UOC, StockEmotions, Politifact) and claims to release the annotated knowledge base at the GitHub URL." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper mentions using SGLang for serving and an NVIDIA RTX 4090, but provides no requirements.txt, Dockerfile, or library versions beyond the model names." 
30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided. The pipeline description is high-level (DSPy programs, MIPRO optimizer) without runnable commands or scripts." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All results in Tables 3 and 4 are point estimates (P, R, F1) with no confidence intervals or error bars." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper compares models and settings (e.g., Llama vs Ministral, COT vs no COT) but no statistical significance tests are applied. Differences are discussed informally." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": false, 51 "justification": "Raw F1 scores are reported but no effect sizes (e.g., Cohen's d) or contextualized magnitude measures are provided for comparisons between settings." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The training sample sizes (150, 152, 30 examples for DSPy) and test sample sizes are stated but not justified. No power analysis or discussion of whether these sizes are adequate." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No variance, standard deviation, or spread measures across runs are reported. Temperature is set to 0.0 for determinism, but DSPy optimization involves stochastic elements (MIPRO) and no repeated-run variance is shown." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Zero-shot performance serves as a baseline, compared against MIPRO-optimized settings with varying ICL examples (5, 10, 15 shot). Two models are also compared against each other." 
69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": false, 73 "justification": "The paper does not compare against any prior LLM-based annotation methods or state-of-the-art opinion extraction systems. The only baselines are zero-shot versions of the same models." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper systematically varies ICL examples (0, 5, 10, 15) and COT (yes/no) across both models, which functions as an ablation of pipeline components (Table 3)." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Precision, recall, and F1 are all reported for each configuration in Table 3." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation of the LLM-generated annotations is performed beyond comparison to existing human-annotated test sets. The temporal KB annotations are only evaluated via inter-LLM agreement, not human judgment." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "The paper explicitly separates training samples (for DSPy/ICL configuration) from test samples for evaluation. Table 1 shows separate train/eval/test splits." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Table 3 breaks down by data model (SSA, ACOS, UOC). Table 4 provides per-component agreement (Target, Holder, Aspect Term, Polarity, etc.) across data models and datasets." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 6.1 discusses low-agreement components (entity, aspect terms) and Section 6.2 provides qualitative examples showing where different data models produce different or inconsistent annotations." 
104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper reports that COT reasoning 'does not seem to have a definitive effect' and shows cases where more ICL examples hurt performance (e.g., SSA with 15-shot COT at 39.52 vs 10-shot COT at 45.91)." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims a 'scalable method' for temporal KB creation with 'rigorous quantitative evaluation' and 'inter-annotator agreement.' Tables 3 and 4 support these claims. The claims are appropriately scoped." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper's causal claims are modest — the ablation design (varying ICL count, COT) with controlled single-variable manipulation is adequate for the claims made about pipeline configuration effects." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper explicitly limits scope to two 8B models and acknowledges in Limitations (Section 8) that results may improve with larger or proprietary models. Claims are bounded to the tested setting." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper does not discuss alternative explanations for observed results. For example, low inter-LLM agreement on spans could be due to model architecture differences, training data differences, or inherent task ambiguity — none of these are explored." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper uses F1 between two LLMs as a 'proxy for IRR' without discussing the gap between inter-LLM agreement and actual annotation quality. Two models could agree on wrong answers. This proxy limitation is not acknowledged." 
136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Exact model identifiers are provided: 'mistralai/Ministral-8B-Instruct-2410' and 'meta-llama/Llama-3.1-8B-Instruct' (Section 5, footnotes 3-4)." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper uses DSPy for prompt optimization but does not provide the actual optimized prompts or the DSPy signatures used. Only the pipeline structure is described at a high level." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 5 reports temperature=0.0, context window=128K, output length=4096, and training sample sizes for DSPy compilation (150, 152, 30)." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The pipeline is a straightforward DSPy program with LLM API calls, not an agentic system." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 4.2 describes the outlier estimation and stratified sampling protocol in detail, including IQR-based upper bounds and the rationale for excluding outliers. Figure 5 visualizes the distributions." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 8 is titled 'Limitations and Future Work' and discusses three specific limitations." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 8 mentions specific limitations: only 8B parameter models used, only open-weight models tested, and lack of annotation ensembling. These are specific to this study." 
175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 8 explicitly states what was not tested: larger models, proprietary models, and annotation ensembling methods. The scope is bounded to the two tested models." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "The annotated knowledge base is claimed to be publicly available at the GitHub URL. The source datasets (StockEmotions, Politifact, ACOS, SSA, UOC) are also publicly available." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Table 2 provides temporal coverage and sizes of the KB datasets. Table 1 describes the data model datasets with domain and split sizes. The source datasets are referenced with citations." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. All data comes from existing public datasets (StockEmotions from StockTwits, Politifact from fact-checking)." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "Figure 4 documents the three-stage pipeline (configuration → evaluation → annotation). Section 4.2 details the sampling protocol with outlier exclusion criteria and stratified sampling." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "Section 9 (Acknowledgments) discloses SFI Grant Number SFI/12/RC/2289_P2 (Insight_2) and Research Ireland Postdoctoral Fellowship GOIPD/2023/1556." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "All authors are affiliated with Insight SFI Research Ireland Centre for Data Analytics, University of Galway, clearly stated in the header." 
214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": true, 218 "justification": "The funder is Science Foundation Ireland, a public research funding agency with no commercial interest in the paper's outcome." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is provided in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper evaluates Llama-3.1-8B and Ministral-8B on opinion extraction tasks using existing annotated datasets but does not state the training data cutoff dates for either model." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "The ACOS and SSA datasets are publicly available and could be in the training data of both models. This overlap risk is not discussed." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "The test datasets (ACOS from ACL 2021, SSA from SemEval 2022) predate both models' training and were likely available online. Contamination risk is not addressed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 
263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference cost, latency, or tokens consumed are reported despite running annotation pipelines over ~31K examples across multiple settings." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "The paper mentions using an NVIDIA RTX 4090 but provides no GPU hours, wall-clock time, or total compute budget for pipeline configuration, evaluation, or annotation." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No mention of multiple random seeds. Temperature is 0.0 for LLM inference, but DSPy MIPRO optimization has stochastic components and no seed sensitivity is reported." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of experimental runs is not stated. It appears results are from single runs." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "The MIPRO optimizer is used to select prompts and ICL examples, but the search budget (number of configurations explored, compute spent) is not reported." 
307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "The paper systematically reports all tested configurations in Table 3 (zero-shot, 5/10/15-shot, with/without COT) and selects the best for temporal annotation based on test set F1." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "Multiple comparisons are made across models, settings, and data models with no statistical tests at all, let alone corrections." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors implement and evaluate their own pipeline without acknowledging potential bias in their implementation or evaluation choices." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "Different ICL settings have different compute costs (more examples = more tokens) but performance is not analyzed as a function of compute budget." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "The paper uses component-level exact match F1 as its metric and discusses why tuple-level exact match is too harsh (citing Schaeffer et al.), but does not discuss whether F1 on these benchmarks actually measures annotation quality for downstream temporal analysis tasks." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved; the DSPy pipeline is the method being tested, not a scaffold wrapping a model." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "The ACOS (2021) and SSA (2022) benchmark datasets predate both models' training. The paper does not discuss whether models may have memorized these datasets." 
344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "ICL examples are drawn from the same dataset distribution as test examples. No discussion of whether this introduces information leakage." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "Train and test splits come from the same datasets. No discussion of potential structural similarities or non-independence between training examples for DSPy and the test set." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention method is applied." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "The DSPy-based declarative annotation pipeline achieves best F1 of 59.92 on ACOS, 45.91 on SSA, and 58.17 on UOC against human-annotated test sets.", 365 "evidence": "Table 3 reports these scores: ACOS best is Llama-3.1-8B with 15-shot no COT (59.92), SSA best is Llama-3.1-8B 10-shot with COT (45.91), UOC best is Ministral-8B 5-shot with COT (58.17).", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "The framework eliminates the need for manual prompt engineering when annotating fine-grained opinions with LLMs.", 370 "evidence": "Section 4 describes the DSPy pipeline where MIPRO automatically optimizes prompts and selects ICL examples. 
However, DSPy signatures and program structures still require manual design.", 371 "supported": "weak" 372 }, 373 { 374 "claim": "Classification labels (sentiment polarity, intensity) show higher inter-LLM agreement than span-based components (entity, aspect terms).", 375 "evidence": "Table 4 shows sentiment polarity agreement at 67-78% across models/datasets, while entity agreement is 14-21% and aspect terms 14-37%.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "COT reasoning does not have a definitive effect on performance across the tested models.", 380 "evidence": "Table 3 shows mixed results: COT helps in zero-shot settings on average but sometimes hurts in few-shot settings (e.g., ACOS Llama 15-shot: 59.92 no-COT vs 56.69 with COT).", 381 "supported": "moderate" 382 } 383 ], 384 "red_flags": [ 385 { 386 "flag": "No statistical rigor in comparisons", 387 "detail": "All comparisons between models, ICL settings, and COT configurations are made by comparing point estimates with no significance tests, confidence intervals, or variance measures. Small F1 differences (e.g., 45.91 vs 45.69) are discussed as meaningful." 388 }, 389 { 390 "flag": "Inter-LLM agreement used as quality proxy", 391 "detail": "For the temporal KB annotations, quality is assessed only by agreement between two 8B LLMs. Two models could systematically agree on incorrect annotations. No human validation of the temporal KB is performed." 392 }, 393 { 394 "flag": "Benchmark contamination risk unaddressed", 395 "detail": "Both models were likely trained on the publicly available ACOS and SSA datasets. Without decontamination analysis, the pipeline evaluation results may be inflated." 396 }, 397 { 398 "flag": "Anonymized GitHub URL", 399 "detail": "The dataset release URL (github.com/ANON-1221/KBC-LM-Temporal-Opinions) appears to be an anonymized placeholder that may not resolve to an actual repository." 
400 } 401 ], 402 "cited_papers": [ 403 { 404 "title": "DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines", 405 "authors": ["Omar Khattab", "Arnav Singhvi", "Paridhi Maheshwari"], 406 "year": 2024, 407 "relevance": "Core framework used for the LLM annotation pipeline; relevant to LLM programming methodology." 408 }, 409 { 410 "title": "Language Models are Few-Shot Learners", 411 "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"], 412 "year": 2020, 413 "relevance": "Foundational work on LLM few-shot capabilities that motivates the annotation approach." 414 }, 415 { 416 "title": "Large Language Models for Data Annotation and Synthesis: A Survey", 417 "authors": ["Zhen Tan", "Dawei Li", "Song Wang"], 418 "year": 2024, 419 "doi": "10.18653/v1/2024.emnlp-main.54", 420 "relevance": "Survey of LLM-based data annotation methods directly relevant to AI-assisted annotation quality assessment." 421 }, 422 { 423 "title": "Are Emergent Abilities of Large Language Models a Mirage?", 424 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 425 "year": 2023, 426 "relevance": "Discusses discontinuous evaluation metrics and emergent abilities — relevant to LLM capability evaluation methodology." 427 }, 428 { 429 "title": "Optimizing Instructions and Demonstrations for Multi-Stage Language Model Programs", 430 "authors": ["Krista Opsahl-Ong", "Michael J. Ryan", "Josh Purtell"], 431 "year": 2024, 432 "doi": "10.18653/v1/2024.emnlp-main.525", 433 "relevance": "MIPRO optimizer used in the annotation pipeline; relevant to LLM prompt optimization methodology." 434 }, 435 { 436 "title": "ProSA: Assessing and Understanding the Prompt Sensitivity of LLMs", 437 "authors": ["Jingming Zhuo", "Songyang Zhang", "Xinyu Fang"], 438 "year": 2024, 439 "doi": "10.18653/v1/2024.findings-emnlp.108", 440 "relevance": "Addresses LLM prompt sensitivity, which motivates the declarative DSPy approach used in this work." 
441 }, 442 { 443 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 444 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"], 445 "year": 2022, 446 "relevance": "COT prompting technique evaluated as a pipeline component in this work." 447 } 448 ] 449 }