ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (23721B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Evaluating Embeddable Language Models in Verbalizing Rule-based Inferences through Justifications",
      6     "authors": [
      7       "Bastien Dussard",
      8       "Aurélie Clodic",
      9       "Guillaume Sarthou"
     10     ],
     11     "year": 2025,
     12     "venue": "IEEE RO-MAN 2025",
     13     "arxiv_id": null,
     14     "doi": "10.1109/RO-MAN63969.2025.11217601"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "All major claims in the abstract are supported: token sensitivity is discussed throughout; order effects are validated with p<3.6e-10 (Figure 6); rule context improves correctness by 10.0% (Figure 7).",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Causal claims ('order decreases performance', 'rule improves performance') are tested via controlled conditions (baseline vs. shuffle vs. rule) with ANOVA, supporting causal inference.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Scope bounded to robotic action-oriented ontologies with four SWRL rules; authors note results 'should be comparable' to other semantically similar ontologies but acknowledge domain specificity.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Alternative explanations offered for order effect (SWRL reasoner exploration methods vary), rule effect (structure guides linking), and mistral anomaly (compact outputs increase spurious correlations).",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Paper clearly distinguishes measured metrics (correctness/completeness) from claimed value (explainability); acknowledges that technical correctness is prerequisite, not proof of human understanding.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No dedicated limitations or threats-to-validity section. Discussion is embedded in conclusion (e.g., 'evaluation conducted on robotic action-oriented ontology'), which does not count per criteria.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "Single expert annotator mentioned passively ('to ensure consistency') but no systematic discussion of inter-rater reliability, annotation bias, or sample size limitations.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Scope is described implicitly (four rules, robotic actions, six models) but not explicitly stated as boundaries of what the results do NOT show.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Explicitly supported by ELSA (ANR-21-CE33-0019) and HumFleet (ANR-23-CE33-0003) projects, stated in footnote.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All authors affiliated with LAAS-CNRS, Université de Toulouse. No evaluated models or systems are author-affiliated products.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "ANR (French national research agency) is independent funder; paper evaluates open-source models with no proprietary bias.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement provided. Patents, equity, or consulting arrangements not declared (absence of declaration treated as NO per criteria).",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms defined in context: 'embeddable' (locally runnable on robotic GPU), SWRL rules (background section), ontologies (explained with RDF triples example), 'justifications' (subset of semantic facts supporting inference).",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Abstract explicitly states: 'reference evaluation of embeddable language models on a task of translation'; contribution is (1) dataset, (2) baseline evaluation, (3) factor analysis (order, complexity, rule context).",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section III systematically covers ontology verbalization (ACE, NaturalOWL), NLG refinement (SWAT), and LLM approaches (Hao et al., Zaitoun et al.); explicitly positions this as 'first baseline evaluation' of this task.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "GitHub link provided: 'https://github.com/RIS-WITH/inference_explanation_benchmark' with explicit statement 'Our code and dataset are available online'.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "Same GitHub link claims dataset is available online; synthetic dataset generation process fully documented (4 rules × 3 complexity × 20 variations × 3 conditions = 720 examples).",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "Ollama tool and model versions (llama3.2:3b, etc.) specified, but no requirements.txt, Dockerfile, or Python version provided. Sampling parameters not documented.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "Methodology is detailed (Section IV), prompts fully shown (Figure 2), but no step-by-step reproduction instructions in paper itself. GitHub repo may contain them, but paper alone is insufficient.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "Standard deviations reported in Table I and visualized as probability distributions in Figures 4-7 via kernel density estimation.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "Three-way ANOVA performed; p-values reported throughout (p < 6.7e-14, p < 2.0e-16, p < 3.6e-10, p < 5.6e-6, p = 0.31, p < 1.4e-2).",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Mean differences reported as percentages: complexity -11.9% (medium) and -18.1% (hard); shuffle -8.9% completeness/-20.0% correctness; rule +10.0% correctness.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "720 total examples (4 rules × 3 complexity × 20 variations × 3 conditions) but no power analysis or justification for this configuration provided.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "Standard deviations reported for each condition (Table I); distributions visualized with spread in Figures 4-7.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "No comparison with prior systems (ACE, NaturalOWL, SWAT mentioned in related work). Baseline/shuffle/rule are conditions, not baseline methods.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": false,
    185           "answer": false,
    186           "justification": "No baseline systems compared.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Experimental conditions test effect of input structure: baseline (logical order) vs. shuffle (random order) vs. rule (additional context); measures impact of each factor.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Two metrics: correctness (binary semantic validity) and completeness (% of concepts translated).",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "Single expert annotator manually evaluated all 720 model outputs for correctness and completeness per stated guidelines.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": false,
    209           "answer": false,
    210           "justification": "Pre-trained models evaluated; no train/test split. All 720 synthetic examples are evaluation examples.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Results broken down by model, model family, complexity level, and condition (baseline/shuffle/rule). Table I and Figures 4-7 show per-condition and per-model performance.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Figure 3 shows annotated failure example; text discusses why mistral models spuriously correlate concepts; incorrect handling of individual names and causal links identified.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Completeness metric shows no significant improvement when rule added (p = 0.31); this null finding is reported in Figure 7 and discussion.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Exact model versions: llama3.2:3b, llama3.1:8b, gemma2:2b, gemma2:9b, mistral-nemo:12b, mistral-small:22b with snapshot dates implicit in version numbers.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Full task prompt provided in Figure 2 (green section); four in-context examples shown with one displayed in red; exact inference/justification pair in blue.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "Only mentions Ollama tool and 'truncated at first newline'; temperature, top-p, frequency penalty, other sampling parameters not reported.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "Chain-of-Thought prompting (4-shot) described; examples show structure; no unrelated concepts in examples per design.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Dataset generation fully documented: complexity levels (10/14/17 triples), token variations (concept synonyms, anonymous IDs, random values), conditions (baseline/shuffle/rule) all specified.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "GitHub link claims both code and dataset available online; synthetic dataset fully reproducible from documented generation process.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Section IV.A fully describes dataset generation: four SWRL rules designed, complexity levels introduced via axiom chains, variations created via concept/ID/value randomization.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants recruited; single expert annotator is evaluator, not study subject. N/A.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "Pipeline documented: rule design → complexity variation → token variation → condition application → annotation (correctness + completeness) → ANOVA analysis.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "Training cutoff dates for Llama 3.2/3.1, Gemma 2, Mistral models not explicitly stated in paper.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "SWRL and ontologies are standard formats unlikely in training data; synthetic examples reduce overlap risk; but no explicit discussion of potential contamination.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "Synthetic task with standard ontology/SWRL format; no discussion of whether robotics papers in training data could enable prior knowledge of similar inferences.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human subjects; N/A.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human subjects; N/A.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human subjects; N/A.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human subjects; N/A.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human subjects; N/A.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human subjects; N/A.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human subjects; N/A.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "No inference time, latency, or token cost reported. Relevant for embeddable models on robotic platforms but not discussed.",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "Total computational budget not stated. Could infer from 720 examples × 6 models but requires external calculation.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Order of justifications significantly decreases model performance",
    373       "evidence": "Figure 6 shows 8.9% decrease in completeness (p < 3.6e-10) and 20.0% decrease in correctness (p < 5.6e-6) when justifications shuffled vs. baseline logical order.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Model size correlates with better performance",
    378       "evidence": "Figures 4-7 and Table I show larger versions (9b, 12b, 22b) consistently outperform smaller versions (2b, 3b, 8b) on both metrics across all conditions.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Adding SWRL rule as context significantly improves correctness",
    383       "evidence": "Figure 7 shows +10.0% improvement in correctness (p < 1.4e-2) when rule provided vs. baseline; completeness unchanged (p = 0.31).",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Justification complexity degrades completeness",
    388       "evidence": "Figure 5 shows medium complexity decreases completeness by 11.9% (p < 6.7e-14) and hard by 18.1% (p < 2.0e-16) vs. easy baseline.",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Models are sensitive to token variations in justifications",
    393       "evidence": "Figure 4 shows different concept sets (20 variations per inference) produce different completeness scores for same semantic content, visible as histogram spread.",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "Embeddable language models can reliably translate ontology inferences",
    398       "evidence": "Table I: baseline correctness ranges 36-77% across models; best model (mistral-small:22b) achieves 77.1% correctness and 87.5% completeness on baseline.",
    399       "supported": "moderate"
    400     }
    401   ],
    402   "methodology_tags": [
    403     "benchmark-eval"
    404   ],
    405   "key_findings": "The paper evaluates six embeddable language models on translating formal SWRL ontology inferences into natural language explanations using a synthetic dataset of 720 examples (4 rules × 3 complexity levels × 20 variations × 3 conditions). Key findings: (1) justification ordering significantly impacts both correctness (-20.0%, p<5.6e-6) and completeness (-8.9%, p<3.6e-10), with shuffled order degrading performance; (2) model size correlates with better performance, though not uniformly across families; (3) providing the SWRL rule as additional context improves correctness by 10.0% (p<1.4e-2) without affecting completeness; (4) increased justification complexity (hard vs. easy) reduces completeness by 18.1% (p<2.0e-16). The largest model (Mistral-Small 22B) achieved 77.1% correctness on baseline, while the lowest-scoring model (Llama 3.2 3B, one of the smallest) achieved only 36.2%, suggesting practical feasibility depends on model selection and input structuring.",
    406   "red_flags": [
    407     {
    408       "flag": "Single annotator",
    409       "detail": "Only one expert evaluated all 720 outputs. No inter-rater reliability check; potential annotation bias not assessed."
    410     },
    411     {
    412       "flag": "No baseline system comparison",
    413       "detail": "Prior ontology verbalizers (ACE, NaturalOWL, SWAT) mentioned in related work but not empirically compared against."
    414     },
    415     {
    416       "flag": "Limited evaluation scope",
    417       "detail": "Only 4 SWRL rules, all robotic action-oriented. Generalization to other ontology types and domains uncertain."
    418     },
    419     {
    420       "flag": "Synthetic dataset",
    421       "detail": "All 720 examples synthetically generated. Real-world ontologies may have different complexity patterns, semantic noise, or redundancies."
    422     },
    423     {
    424       "flag": "Unspecified sampling parameters",
    425       "detail": "Temperature, top-p, and other LLM sampling parameters not reported. Reproducibility depends on Ollama defaults."
    426     },
    427     {
    428       "flag": "No actual human explainability study",
    429       "detail": "Paper claims models improve 'explainability to non-experts' but only measures technical correctness/completeness. No user study validating whether translations actually improve human understanding."
    430     },
    431     {
    432       "flag": "Binary correctness metric acknowledged as limiting",
    433       "detail": "Authors note: 'it would be interesting to design a finer version of the correctness metric than just a binary metric' — metric may miss nuanced correctness degradation."
    434     }
    435   ],
    436   "cited_papers": [
    437     {
    438       "title": "Large language models for robotics: Opportunities, challenges, and perspectives",
    439       "relevance": "Context for using LLMs in robotic systems and explainability needs."
    440     },
    441     {
    442       "title": "Do as I can, not as I say: Grounding language in robotic affordances",
    443       "relevance": "Robotics task planning with language models; grounding formal knowledge in natural language."
    444     },
    445     {
    446       "title": "Attempto Controlled English for knowledge representation",
    447       "relevance": "Prior approach to ontology verbalization using controlled natural language."
    448     },
    449     {
    450       "title": "Generating natural language descriptions from OWL ontologies: the NaturalOWL system",
    451       "relevance": "Prior NLG-based ontology verbalization system; baseline for comparison."
    452     },
    453     {
    454       "title": "Analyzing llama 3-based approach for axiom translation from ontologies",
    455       "relevance": "Recent work on LLM-based ontology verbalization; direct precedent."
    456     },
    457     {
    458       "title": "A peek into token bias: Large language models are not yet genuine reasoners",
    459       "relevance": "Explains token sensitivity phenomenon observed in this paper's results."
    460     },
    461     {
    462       "title": "Premise order matters in reasoning with large language models",
    463       "relevance": "Direct prior evidence that input order affects LLM reasoning, supporting hypothesis tested here."
    464     }
    465   ],
    466   "engagement_factors": {
    467     "practical_relevance": {
    468       "score": 2,
    469       "justification": "Running embeddable models on robots is practically relevant, but the domain is highly specialized (SWRL rule translation); limited transferability to other tasks."
    470     },
    471     "surprise_contrarian": {
    472       "score": 1,
    473       "justification": "Findings confirm intuitions: larger models better, ordering matters, context helps. No surprising reversals or counterintuitive results."
    474     },
    475     "fear_safety": {
    476       "score": 0,
    477       "justification": "No safety, alignment, or risk concerns raised. Evaluation of formal reasoning translation is orthogonal to LLM safety."
    478     },
    479     "drama_conflict": {
    480       "score": 0,
    481       "justification": "Straightforward technical evaluation; no controversy, no competing approaches with ideological stakes."
    482     },
    483     "demo_ability": {
    484       "score": 1,
    485       "justification": "Could demonstrate with Ollama locally, but requires synthetic ontology setup; not immediately accessible demo."
    486     },
    487     "brand_recognition": {
    488       "score": 1,
    489       "justification": "Evaluates well-known open models (Llama, Gemma, Mistral), but is published at a second-tier venue (RO-MAN); not flagship AI research."
    490     }
    491   },
    492   "hn_data": {
    493     "threads": [],
    494     "top_points": 0,
    495     "total_points": 0,
    496     "total_comments": 0
    497   }
    498 }

Impressum · Datenschutz