scan-v5.json (23721B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Evaluating Embeddable Language Models in Verbalizing Rule-based Inferences through Justifications", 6 "authors": [ 7 "Bastien Dussard", 8 "Aurélie Clodic", 9 "Guillaume Sarthou" 10 ], 11 "year": 2025, 12 "venue": "IEEE RO-MAN 2025", 13 "arxiv_id": null, 14 "doi": "10.1109/RO-MAN63969.2025.11217601" 15 }, 16 "checklist": { 17 "claims_and_evidence": { 18 "abstract_claims_supported": { 19 "applies": true, 20 "answer": true, 21 "justification": "All major claims in the abstract are supported: token sensitivity is discussed throughout; order effects are validated with p<3.6e-10 (Figure 6); rule context improves performance +10.0% (Figure 7).", 22 "source": "haiku" 23 }, 24 "causal_claims_justified": { 25 "applies": true, 26 "answer": true, 27 "justification": "Causal claims ('order decreases performance', 'rule improves performance') are tested via controlled conditions (baseline vs. shuffle vs. rule) with ANOVA, supporting causal inference.", 28 "source": "haiku" 29 }, 30 "generalization_bounded": { 31 "applies": true, 32 "answer": true, 33 "justification": "Scope bounded to robotic action-oriented ontologies with four SWRL rules; authors note results 'should be comparable' to other semantically similar ontologies but acknowledge domain specificity.", 34 "source": "haiku" 35 }, 36 "alternative_explanations_discussed": { 37 "applies": true, 38 "answer": true, 39 "justification": "Alternative explanations offered for order effect (SWRL reasoner exploration methods vary), rule effect (structure guides linking), and mistral anomaly (compact outputs increase spurious correlations).", 40 "source": "haiku" 41 }, 42 "proxy_outcome_distinction": { 43 "applies": true, 44 "answer": true, 45 "justification": "Paper clearly distinguishes measured metrics (correctness/completeness) from claimed value (explainability); acknowledges that technical correctness is prerequisite, not proof of human 
understanding.", 46 "source": "haiku" 47 } 48 }, 49 "limitations_and_scope": { 50 "limitations_section_present": { 51 "applies": true, 52 "answer": false, 53 "justification": "No dedicated limitations or threats-to-validity section. Discussion is embedded in conclusion (e.g., 'evaluation conducted on robotic action-oriented ontology'), which does not count per criteria.", 54 "source": "haiku" 55 }, 56 "threats_to_validity_specific": { 57 "applies": true, 58 "answer": false, 59 "justification": "Single expert annotator mentioned passively ('to ensure consistency') but no systematic discussion of inter-rater reliability, annotation bias, or sample size limitations.", 60 "source": "haiku" 61 }, 62 "scope_boundaries_stated": { 63 "applies": true, 64 "answer": false, 65 "justification": "Scope is described implicitly (four rules, robotic actions, six models) but not explicitly stated as boundaries of what the results do NOT show.", 66 "source": "haiku" 67 } 68 }, 69 "conflicts_of_interest": { 70 "funding_disclosed": { 71 "applies": true, 72 "answer": true, 73 "justification": "Explicitly supported by ELSA (ANR-21-CE33-0019) and HumFleet (ANR-23-CE33-0003) projects, stated in footnote.", 74 "source": "haiku" 75 }, 76 "affiliations_disclosed": { 77 "applies": true, 78 "answer": true, 79 "justification": "All authors affiliated with LAAS-CNRS, Université de Toulouse. No evaluated models or systems are author-affiliated products.", 80 "source": "haiku" 81 }, 82 "funder_independent_of_outcome": { 83 "applies": true, 84 "answer": true, 85 "justification": "ANR (French national research agency) is independent funder; paper evaluates open-source models with no proprietary bias.", 86 "source": "haiku" 87 }, 88 "financial_interests_declared": { 89 "applies": true, 90 "answer": false, 91 "justification": "No competing interests statement provided. 
Patents, equity, or consulting arrangements not declared (absence of declaration treated as NO per criteria).", 92 "source": "haiku" 93 } 94 }, 95 "scope_and_framing": { 96 "key_terms_defined": { 97 "applies": true, 98 "answer": true, 99 "justification": "Key terms defined in context: 'embeddable' (locally runnable on robotic GPU), SWRL rules (background section), ontologies (explained with RDF triples example), 'justifications' (subset of semantic facts supporting inference).", 100 "source": "haiku" 101 }, 102 "intended_contribution_clear": { 103 "applies": true, 104 "answer": true, 105 "justification": "Abstract explicitly states: 'reference evaluation of embeddable language models on a task of translation'; contribution is (1) dataset, (2) baseline evaluation, (3) factor analysis (order, complexity, rule context).", 106 "source": "haiku" 107 }, 108 "engagement_with_prior_work": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section III systematically covers ontology verbalization (ACE, NaturalOWL), NLG refinement (SWAT), and LLM approaches (Hao et al., Zaitoun et al.); explicitly positions this as 'first baseline evaluation' of this task.", 112 "source": "haiku" 113 } 114 } 115 }, 116 "type_checklist": { 117 "empirical": { 118 "artifacts": { 119 "code_released": { 120 "applies": true, 121 "answer": true, 122 "justification": "GitHub link provided: 'https://github.com/RIS-WITH/inference_explanation_benchmark' with explicit statement 'Our code and dataset are available online'.", 123 "source": "haiku" 124 }, 125 "data_released": { 126 "applies": true, 127 "answer": true, 128 "justification": "Same GitHub link claims dataset is available online; synthetic dataset generation process fully documented (4 rules × 3 complexity × 20 variations × 3 conditions = 720 examples).", 129 "source": "haiku" 130 }, 131 "environment_specified": { 132 "applies": true, 133 "answer": false, 134 "justification": "Ollama tool and model versions (llama3.2:3b, etc.) 
specified, but no requirements.txt, Dockerfile, or Python version provided. Sampling parameters not documented.", 135 "source": "haiku" 136 }, 137 "reproduction_instructions": { 138 "applies": true, 139 "answer": false, 140 "justification": "Methodology is detailed (Section IV), prompts fully shown (Figure 2), but no step-by-step reproduction instructions in paper itself. GitHub repo may contain them, but paper alone is insufficient.", 141 "source": "haiku" 142 } 143 }, 144 "statistical_methodology": { 145 "confidence_intervals_or_error_bars": { 146 "applies": true, 147 "answer": true, 148 "justification": "Standard deviations reported in Table I and visualized as probability distributions in Figures 4-7 via kernel density estimation.", 149 "source": "haiku" 150 }, 151 "significance_tests": { 152 "applies": true, 153 "answer": true, 154 "justification": "Three-way ANOVA performed; p-values reported throughout (p < 6.7e-14, p < 2.0e-16, p < 3.6e-10, p < 5.6e-6, p = 0.31, p < 1.4e-2).", 155 "source": "haiku" 156 }, 157 "effect_sizes_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "Mean differences reported as percentages: complexity -11.9% (medium) and -18.1% (hard); shuffle -8.9% completeness/-20.0% correctness; rule +10.0% correctness.", 161 "source": "haiku" 162 }, 163 "sample_size_justified": { 164 "applies": true, 165 "answer": false, 166 "justification": "720 total examples (4 rules × 3 complexity × 20 variations × 3 conditions) but no power analysis or justification for this configuration provided.", 167 "source": "haiku" 168 }, 169 "variance_reported": { 170 "applies": true, 171 "answer": true, 172 "justification": "Standard deviations reported for each condition (Table I); distributions visualized with spread in Figures 4-7.", 173 "source": "haiku" 174 } 175 }, 176 "evaluation_design": { 177 "baselines_included": { 178 "applies": true, 179 "answer": false, 180 "justification": "No comparison with prior systems (ACE, NaturalOWL, 
SWAT mentioned in related work). Baseline/shuffle/rule are conditions, not baseline methods.", 181 "source": "haiku" 182 }, 183 "baselines_contemporary": { 184 "applies": false, 185 "answer": false, 186 "justification": "No baseline systems compared.", 187 "source": "haiku" 188 }, 189 "ablation_study": { 190 "applies": true, 191 "answer": true, 192 "justification": "Experimental conditions test effect of input structure: baseline (logical order) vs. shuffle (random order) vs. rule (additional context); measures impact of each factor.", 193 "source": "haiku" 194 }, 195 "multiple_metrics": { 196 "applies": true, 197 "answer": true, 198 "justification": "Two metrics: correctness (binary semantic validity) and completeness (% of concepts translated).", 199 "source": "haiku" 200 }, 201 "human_evaluation": { 202 "applies": true, 203 "answer": true, 204 "justification": "Single expert annotator manually evaluated all 720 model outputs for correctness and completeness per stated guidelines.", 205 "source": "haiku" 206 }, 207 "held_out_test_set": { 208 "applies": false, 209 "answer": false, 210 "justification": "Pre-trained models evaluated; no train/test split. All 720 synthetic examples are evaluation examples.", 211 "source": "haiku" 212 }, 213 "per_category_breakdown": { 214 "applies": true, 215 "answer": true, 216 "justification": "Results broken down by model, model family, complexity level, and condition (baseline/shuffle/rule). 
Table I and Figures 4-7 show per-condition and per-model performance.", 217 "source": "haiku" 218 }, 219 "failure_cases_discussed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Figure 3 shows annotated failure example; text discusses why mistral models spuriously correlate concepts; incorrect handling of individual names and causal links identified.", 223 "source": "haiku" 224 }, 225 "negative_results_reported": { 226 "applies": true, 227 "answer": true, 228 "justification": "Completeness metric shows no significant improvement when rule added (p = 0.31); this null finding is reported in Figure 7 and discussion.", 229 "source": "haiku" 230 } 231 }, 232 "setup_transparency": { 233 "model_versions_specified": { 234 "applies": true, 235 "answer": true, 236 "justification": "Exact model versions: llama3.2:3b, llama3.1:8b, gemma2:2b, gemma2:9b, mistral-nemo:12b, mistral-small:22b with snapshot dates implicit in version numbers.", 237 "source": "haiku" 238 }, 239 "prompts_provided": { 240 "applies": true, 241 "answer": true, 242 "justification": "Full task prompt provided in Figure 2 (green section); four in-context examples shown with one displayed in red; exact inference/justification pair in blue.", 243 "source": "haiku" 244 }, 245 "hyperparameters_reported": { 246 "applies": true, 247 "answer": false, 248 "justification": "Only mentions Ollama tool and 'truncated at first newline'; temperature, top-p, frequency penalty, other sampling parameters not reported.", 249 "source": "haiku" 250 }, 251 "scaffolding_described": { 252 "applies": true, 253 "answer": true, 254 "justification": "Chain-of-Thought prompting (4-shot) described; examples show structure; no unrelated concepts in examples per design.", 255 "source": "haiku" 256 }, 257 "data_preprocessing_documented": { 258 "applies": true, 259 "answer": true, 260 "justification": "Dataset generation fully documented: complexity levels (10/14/17 triples), token variations (concept synonyms, anonymous 
IDs, random values), conditions (baseline/shuffle/rule) all specified.", 261 "source": "haiku" 262 } 263 }, 264 "data_integrity": { 265 "raw_data_available": { 266 "applies": true, 267 "answer": true, 268 "justification": "GitHub link claims both code and dataset available online; synthetic dataset fully reproducible from documented generation process.", 269 "source": "haiku" 270 }, 271 "data_collection_described": { 272 "applies": true, 273 "answer": true, 274 "justification": "Section IV.A fully describes dataset generation: four SWRL rules designed, complexity levels introduced via axiom chains, variations created via concept/ID/value randomization.", 275 "source": "haiku" 276 }, 277 "recruitment_methods_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants recruited; single expert annotator is evaluator, not study subject. N/A.", 281 "source": "haiku" 282 }, 283 "data_pipeline_documented": { 284 "applies": true, 285 "answer": true, 286 "justification": "Pipeline documented: rule design → complexity variation → token variation → condition application → annotation (correctness + completeness) → ANOVA analysis.", 287 "source": "haiku" 288 } 289 }, 290 "contamination": { 291 "training_cutoff_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "Training cutoff dates for Llama 3.2/3.1, Gemma 2, Mistral models not explicitly stated in paper.", 295 "source": "haiku" 296 }, 297 "train_test_overlap_discussed": { 298 "applies": true, 299 "answer": false, 300 "justification": "SWRL and ontologies are standard formats unlikely in training data; synthetic examples reduce overlap risk; but no explicit discussion of potential contamination.", 301 "source": "haiku" 302 }, 303 "benchmark_contamination_addressed": { 304 "applies": true, 305 "answer": false, 306 "justification": "Synthetic task with standard ontology/SWRL format; no discussion of whether robotics papers in training data could enable prior 
knowledge of similar inferences.", 307 "source": "haiku" 308 } 309 }, 310 "human_studies": { 311 "pre_registered": { 312 "applies": false, 313 "answer": false, 314 "justification": "No human subjects; N/A.", 315 "source": "haiku" 316 }, 317 "irb_or_ethics_approval": { 318 "applies": false, 319 "answer": false, 320 "justification": "No human subjects; N/A.", 321 "source": "haiku" 322 }, 323 "demographics_reported": { 324 "applies": false, 325 "answer": false, 326 "justification": "No human subjects; N/A.", 327 "source": "haiku" 328 }, 329 "inclusion_exclusion_criteria": { 330 "applies": false, 331 "answer": false, 332 "justification": "No human subjects; N/A.", 333 "source": "haiku" 334 }, 335 "randomization_described": { 336 "applies": false, 337 "answer": false, 338 "justification": "No human subjects; N/A.", 339 "source": "haiku" 340 }, 341 "blinding_described": { 342 "applies": false, 343 "answer": false, 344 "justification": "No human subjects; N/A.", 345 "source": "haiku" 346 }, 347 "attrition_reported": { 348 "applies": false, 349 "answer": false, 350 "justification": "No human subjects; N/A.", 351 "source": "haiku" 352 } 353 }, 354 "cost_and_practicality": { 355 "inference_cost_reported": { 356 "applies": true, 357 "answer": false, 358 "justification": "No inference time, latency, or token cost reported. Relevant for embeddable models on robotic platforms but not discussed.", 359 "source": "haiku" 360 }, 361 "compute_budget_stated": { 362 "applies": true, 363 "answer": false, 364 "justification": "Total computational budget not stated. Could infer from 720 examples × 6 models but requires external calculation.", 365 "source": "haiku" 366 } 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "Shuffling the order of justifications significantly decreases model performance", 373 "evidence": "Figure 6 shows 8.9% decrease in completeness (p < 3.6e-10) and 20.0% decrease in correctness (p < 5.6e-6) when justifications are shuffled vs. 
baseline logical order.", 374 "supported": "strong" 375 }, 376 { 377 "claim": "Model size correlates with better performance", 378 "evidence": "Figures 4-7 and Table I show larger versions (9b, 12b, 22b) consistently outperform smaller versions (2b, 3b, 8b) on both metrics across all conditions.", 379 "supported": "strong" 380 }, 381 { 382 "claim": "Adding SWRL rule as context significantly improves correctness", 383 "evidence": "Figure 7 shows +10.0% improvement in correctness (p < 1.4e-2) when rule provided vs. baseline; completeness unchanged (p = 0.31).", 384 "supported": "strong" 385 }, 386 { 387 "claim": "Justification complexity degrades completeness", 388 "evidence": "Figure 5 shows medium complexity decreases completeness by 11.9% (p < 6.7e-14) and hard by 18.1% (p < 2.0e-16) vs. easy baseline.", 389 "supported": "strong" 390 }, 391 { 392 "claim": "Models are sensitive to token variations in justifications", 393 "evidence": "Figure 4 shows different concept sets (20 variations per inference) produce different completeness scores for same semantic content, visible as histogram spread.", 394 "supported": "moderate" 395 }, 396 { 397 "claim": "Embeddable language models can reliably translate ontology inferences", 398 "evidence": "Table I: baseline correctness ranges 36-77% across models; best model (mistral-small:22b) achieves 77.1% correctness and 87.5% completeness on baseline.", 399 "supported": "moderate" 400 } 401 ], 402 "methodology_tags": [ 403 "benchmark-eval" 404 ], 405 "key_findings": "The paper evaluates six embeddable language models on translating formal SWRL ontology inferences into natural language explanations using a synthetic dataset of 720 examples (4 rules × 3 complexity levels × 20 variations × 3 conditions). 
Key findings: (1) justification ordering significantly impacts both correctness (-20.0%, p<5.6e-6) and completeness (-8.9%, p<3.6e-10), with shuffled order degrading performance; (2) model size correlates with better performance, though not uniformly across families; (3) providing the SWRL rule as additional context improves correctness by 10.0% (p<1.4e-2) without affecting completeness; (4) increased justification complexity (hard vs. easy) reduces completeness by 18.1% (p<2.0e-16). The largest model (Mistral-Small 22B) achieved 77.1% correctness on baseline, while the smallest Llama model (Llama 3.2 3B) achieved only 36.2%, suggesting practical feasibility depends on model selection and input structuring.", 406 "red_flags": [ 407 { 408 "flag": "Single annotator", 409 "detail": "Only one expert evaluated the outputs for all 720 examples. No inter-rater reliability check; potential annotation bias not assessed." 410 }, 411 { 412 "flag": "No baseline system comparison", 413 "detail": "Prior ontology verbalizers (ACE, NaturalOWL, SWAT) mentioned in related work but not empirically compared against." 414 }, 415 { 416 "flag": "Limited evaluation scope", 417 "detail": "Only 4 SWRL rules, all robotic action-oriented. Generalization to other ontology types and domains uncertain." 418 }, 419 { 420 "flag": "Synthetic dataset", 421 "detail": "All 720 examples synthetically generated. Real-world ontologies may have different complexity patterns, semantic noise, or redundancies." 422 }, 423 { 424 "flag": "Unspecified sampling parameters", 425 "detail": "Temperature, top-p, and other LLM sampling parameters not reported. Reproducibility depends on Ollama defaults." 426 }, 427 { 428 "flag": "No actual human explainability study", 429 "detail": "Paper claims models improve 'explainability to non-experts' but only measures technical correctness/completeness. No user study validating whether translations actually improve human understanding." 
430 }, 431 { 432 "flag": "Binary correctness metric acknowledged as limiting", 433 "detail": "Authors note: 'it would be interesting to design a finer version of the correctness metric than just a binary metric' — metric may miss nuanced correctness degradation." 434 } 435 ], 436 "cited_papers": [ 437 { 438 "title": "Large language models for robotics: Opportunities, challenges, and perspectives", 439 "relevance": "Context for using LLMs in robotic systems and explainability needs." 440 }, 441 { 442 "title": "Do as I can, not as I say: Grounding language in robotic affordances", 443 "relevance": "Robotics task planning with language models; grounding formal knowledge in natural language." 444 }, 445 { 446 "title": "Attempto Controlled English for knowledge representation", 447 "relevance": "Prior approach to ontology verbalization using controlled natural language." 448 }, 449 { 450 "title": "Generating natural language descriptions from OWL ontologies: the NaturalOWL system", 451 "relevance": "Prior NLG-based ontology verbalization system; discussed in related work but not empirically compared against in this paper." 452 }, 453 { 454 "title": "Analyzing llama 3-based approach for axiom translation from ontologies", 455 "relevance": "Recent work on LLM-based ontology verbalization; direct precedent." 456 }, 457 { 458 "title": "A peek into token bias: Large language models are not yet genuine reasoners", 459 "relevance": "Explains token sensitivity phenomenon observed in this paper's results." 460 }, 461 { 462 "title": "Premise order matters in reasoning with large language models", 463 "relevance": "Direct prior evidence that input order affects LLM reasoning, supporting hypothesis tested here." 464 } 465 ], 466 "engagement_factors": { 467 "practical_relevance": { 468 "score": 2, 469 "justification": "Running embeddable models on robots is practically relevant, but the domain is highly specialized (SWRL rule translation); limited transferability to other tasks." 
470 }, 471 "surprise_contrarian": { 472 "score": 1, 473 "justification": "Findings confirm intuitions: larger models better, ordering matters, context helps. No surprising reversals or counterintuitive results." 474 }, 475 "fear_safety": { 476 "score": 0, 477 "justification": "No safety, alignment, or risk concerns raised. Evaluation of formal reasoning translation is orthogonal to LLM safety." 478 }, 479 "drama_conflict": { 480 "score": 0, 481 "justification": "Straightforward technical evaluation; no controversy, no competing approaches with ideological stakes." 482 }, 483 "demo_ability": { 484 "score": 1, 485 "justification": "Could demonstrate with Ollama locally, but requires synthetic ontology setup; not an immediately accessible demo." 486 }, 487 "brand_recognition": { 488 "score": 1, 489 "justification": "Evaluates well-known open models (Llama, Gemma, Mistral), but published at a second-tier venue (RO-MAN); not flagship AI research." 490 } 491 }, 492 "hn_data": { 493 "threads": [], 494 "top_points": 0, 495 "total_points": 0, 496 "total_comments": 0 497 } 498 }