scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24831B)
      1 {
      2   "paper": {
      3     "title": "Evaluating Embeddable Language Models in Verbalizing Rule-based Inferences through Justifications",
      4     "authors": ["Bastien Dussard", "Aurélie Clodic", "Guillaume Sarthou"],
      5     "year": 2025,
      6     "venue": "IEEE RO-MAN 2025",
      7     "doi": "10.1109/RO-MAN63969.2025.11217601"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "Evaluates 6 embeddable language models on translating ontology-formatted inferences into natural language. Shuffling justification order significantly decreases both correctness (-20.0%, p<5.6e-6) and completeness (-8.9%, p<3.6e-10). Adding the SWRL rule as context significantly improves correctness (+10.0%, p<1.4e-2) but does not significantly improve completeness (p=0.31). Larger models generally perform better, but model family matters as much as size.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "GitHub repository URL provided in Section IV.A: 'Our code and dataset are available online' with link to https://github.com/RIS-WITH/inference_explanation_benchmark."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The generated dataset is stated to be available at the same GitHub repository referenced in Section IV.A."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment specifications, requirements.txt, or dependency versions mentioned beyond stating Ollama was used to run models locally."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions provided in the paper. Only states that code and dataset are available online."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Standard deviations are reported for completeness in Appendix Table I, but no confidence intervals or error bars are provided for correctness or for the ANOVA estimates."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Three-way ANOVA performed with p-values reported for all main effects. Completeness: complexity medium (-11.9%, p<6.7e-14), complexity hard (-18.1%, p<2.0e-16), shuffle (-8.9%, p<3.6e-10), rule (not significant, p=0.31). Correctness: shuffle (-20.0%, p<5.6e-6), rule (+10.0%, p<1.4e-2)."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Effect sizes reported as percentage point changes with baselines: e.g., shuffle decreases completeness by 8.9% and correctness by 20.0%; rule improves correctness by 10.0%. Provides enough context to interpret magnitude."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No justification for why 20 variations per inference were chosen, no power analysis. The dataset size (720 inferences) is stated but not justified."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Standard deviations reported for completeness across variations in Appendix Table I (e.g., llama3.2:3b baseline: 63.7 ± 15.2%). Distributions also shown visually in histogram figures."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The baseline condition (ordered justifications) serves as the reference against which shuffle and rule conditions are compared. Six models compared against each other."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Models selected are contemporary at time of study: Llama 3.1/3.2, Gemma 2, Mistral-Nemo/Small. Paper notes deepseek-r1:7b did not exist at time of study."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The three conditions (baseline, shuffle, rule) and three complexity levels function as ablation-like manipulations, systematically varying factors to measure their individual impact."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Two metrics used: correctness (boolean, whether the explanation is logically valid) and completeness (percentage of concepts translated)."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "A single expert annotator manually evaluated all 720 × 6 = 4320 model outputs for correctness and completeness, following detailed annotation guidelines (Section IV.B)."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No held-out test set design. All 720 inferences are used for evaluation; the 4-shot CoT prompt examples are taken from outside the evaluated set and are stated not to overlap with the evaluated questions, but there is no train/dev/test split."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results broken down per model (6 models), per condition (3 conditions), per complexity level (3 levels), and per rule type (4 SWRL rules). Full table in Appendix Table I."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Qualitative failure modes discussed: models correlating unrelated concepts, Mistral creating links between unrelated semantic triples, models including individual names. Example in Fig. 3 shows a failed explanation with annotations."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Several negative results reported: shuffle condition significantly hurts performance, adding rule does NOT significantly improve completeness (p=0.31), gemma2:9b correctness drops with rule condition, smaller models barely affected by shuffle because they already perform poorly."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims — order significantly decreases performance, adding rule significantly improves performance — are supported by ANOVA results with p-values in the results section."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Causal claims ('shuffling decreases performance', 'adding rule improves performance') are justified by controlled experimental manipulation: the conditions are the only variables changed while holding others constant, and statistical tests confirm significance."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Paper explicitly bounds scope: evaluates only embeddable models (≤14GB), only action-related SWRL rules, and states 'the conclusions derived from this study could reasonably be applied to larger models' as a hypothesis rather than a claim. Conclusion notes evaluation was on 'robotic action-oriented ontology but results should be comparable with other ontology knowledge bases.'"
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Paper discusses alternative explanations: Mistral models' unexpected correctness increase with complexity attributed to more compact explanations; gemma2:9b's correctness drop with rule condition discussed. Shuffle effect attributed to proximity-induced concept misattribution."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper clearly defines what it measures (correctness and completeness of verbalized explanations) and does not inflate these into broader claims about LM reasoning ability or general verbalization quality. Claims match measurement granularity."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Exact model identifiers provided with sizes: llama3.2:3b, llama3.1:8b, gemma2:2b, gemma2:9b, mistral-nemo:12b, mistral-small:22b. These are Ollama model tags which specify the version and quantization."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Full task prompt and one of four CoT examples shown in Fig. 2. The system prompt, output requirements, and example format are all provided verbatim."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "No mention of temperature, top-p, or other generation hyperparameters. Only mentions that answers were truncated at the first newline character."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding used. Models are queried directly via Ollama with a single prompt per inference."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section IV.A documents the full dataset generation process: 4 rules × 3 complexity levels × 20 variations × 3 conditions = 720 inferences. Variation generation process (anonymization, random numbers, manual assessment of identifiers) is described step-by-step."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "No dedicated limitations section. Some limitations are mentioned in the conclusion (e.g., binary correctness metric limitations, single ontology domain) but not in a structured section."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No specific threats to validity discussed. The conclusion mentions the correctness metric could be finer and the ontology domain could differ, but these are more future work than threat analysis."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Scope explicitly bounded: embeddable models only (≤14GB), action-oriented robotic ontology, SWRL rule-based inferences only. Conclusion states: 'This evaluation was conducted on a robotic action-oriented ontology but the results should be comparable with other ontology knowledge bases.'"
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Code and dataset available at GitHub repository. The generated dataset of 720 inference/justification pairs and model outputs should be verifiable."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section IV.A describes the full dataset generation process: 4 SWRL rules designed with specific antecedent structures, 3 complexity levels with specified triple counts (10, 14, 17), 20 variations with anonymization, 3 conditions."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants recruited for the study. The dataset is synthetically generated and the single annotator is an expert (likely an author). No standard benchmark used."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Full pipeline documented: rule design → complexity levels → variation generation (concept substitution, anonymization, numerical randomization) → condition application → query construction with CoT prompting → output truncation at first newline."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Funding acknowledged: 'This work was supported by the ELSA (ANR-21-CE33-0019) and the HumFleet (ANR-23-CE33-0003) projects.' These are French national research grants."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "All authors affiliated with LAAS-CNRS, Université de Toulouse, CNRS, Toulouse, France. No conflicts with evaluated models (all are open-source models from Meta, Google, Mistral)."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "ANR (French National Research Agency) grants have no financial interest in the performance of any specific language model evaluated."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement or financial interests declaration present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No training data cutoff dates stated for any of the 6 evaluated models. This matters because the ontology concepts used could overlap with training data."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of whether the generated dataset concepts or ontology structures might overlap with model training data. The related work section notes data contamination risk in prior work [18][19] but does not address it for their own evaluation."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "The dataset is synthetically generated with anonymized identifiers (random strings replacing meaningful names) and randomized numerical values, which substantially mitigates contamination risk. The paper explicitly notes prior work's contamination issues and designs their evaluation to avoid evaluating on well-known ontologies."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in the study. The single expert annotator is part of the research team, not a study participant."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants. The study evaluates language models on a synthetic dataset."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No inference cost, latency, or time-per-query reported despite running 4320 model queries (720 × 6 models) locally."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No mention of hardware used, GPU type, total compute time, or computational budget. Only states models were run via Ollama locally and must not exceed 14GB."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No mention of random seeds or seed sensitivity. The 20 variations per inference test token sensitivity but not model randomness across seeds."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No explicit statement of how many times each query was run. Appears to be single-run per query, but not stated."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No hyperparameter search mentioned. Default Ollama settings apparently used but not stated."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": false,
    309         "answer": false,
    310         "justification": "No configuration selection — all models evaluated in their default configuration. No best-config selection needed."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "Multiple statistical comparisons made (complexity levels, conditions, models) via ANOVA but no mention of multiple comparison correction (e.g., Bonferroni, Tukey HSD)."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "The paper evaluates existing open-source models, not their own system. No self-comparison bias applies."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "Model sizes range from 2b to 22b (11x difference) but performance is not explicitly plotted or discussed as a function of compute budget. Size effects are discussed qualitatively but not quantified in compute terms."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "The paper explicitly discusses what the benchmark measures (ability to translate ontology inferences to natural language) and the limitations of its correctness metric as binary. Related work positions the benchmark as a 'reference evaluation' filling a gap in baseline assessments."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding involved. All models receive the same prompt directly via Ollama."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "No discussion of whether models may have seen similar ontology structures or SWRL rules during training, despite training data cutoffs not being stated."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": true,
    347         "justification": "The anonymization of identifiers (replacing meaningful names with random strings) and randomization of numerical values explicitly addresses feature leakage by removing semantic cues that models might exploit from training knowledge."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "The 20 variations per inference share the same underlying structure (same rule, same complexity). Non-independence of these structurally similar examples is not discussed, though this could inflate statistical significance."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No formal leakage detection method applied. Anonymization is a prevention strategy but no detection test was run."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "Shuffling the order of justifications significantly decreases completeness by 8.9% (p < 3.6e-10)",
    364       "evidence": "Section V.C, ANOVA results comparing baseline vs shuffle conditions across all models",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "Shuffling the order of justifications significantly decreases correctness by 20.0% (p < 5.6e-6)",
    369       "evidence": "Section V.C, ANOVA results and visual comparison in Fig. 6",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Adding the SWRL rule as context significantly improves correctness by +10.0% (p < 1.4e-2)",
    374       "evidence": "Section V.D, ANOVA results comparing baseline vs rule conditions",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Adding the SWRL rule does not significantly improve completeness (p = 0.31)",
    379       "evidence": "Section V.D, ANOVA results showing non-significant effect on completeness",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Increased complexity (number of semantic triples) significantly decreases completeness: medium -11.9% (p < 6.7e-14), hard -18.1% (p < 2.0e-16)",
    384       "evidence": "Section V.B, ANOVA results with p-values",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Larger models generally perform better in both correctness and completeness",
    389       "evidence": "Section V.A, comparison across model pairs within families shown in Fig. 4 and Appendix Table I",
    390       "supported": "moderate"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "Single annotator",
    396       "detail": "All 4320 model outputs evaluated by a single expert annotator. No inter-rater reliability assessed. The paper acknowledges this was for 'consistency between evaluations' but this also means no validation of annotation objectivity."
    397     },
    398     {
    399       "flag": "No generation hyperparameters reported",
    400       "detail": "Temperature, top-p, and other sampling parameters not stated for any model. These can significantly affect output quality and reproducibility."
    401     },
    402     {
    403       "flag": "Small synthetic dataset",
    404       "detail": "Only 4 SWRL rules with highly similar structures (all action-capability rules for a robot). Results may not generalize to structurally different ontology patterns. The 20 variations per inference are surface-level (concept swaps) not structural."
    405     }
    406   ],
    407   "cited_papers": [
    408     {
    409       "title": "Large language models for robotics: Opportunities, challenges, and perspectives",
    410       "authors": ["J. Wang", "E. Shi", "H. Hu"],
    411       "year": 2024,
    412       "relevance": "Survey on LLM applications in robotics, directly relevant to LLM capability assessment."
    413     },
    414     {
    415       "title": "Do as I can, not as I say: Grounding language in robotic affordances",
    416       "authors": ["M. Ahn", "A. Brohan", "N. Brown"],
    417       "year": 2022,
    418       "arxiv_id": "2204.01691",
    419       "relevance": "Foundational work on grounding LLMs for robotic task planning."
    420     },
    421     {
    422       "title": "A peek into token bias: Large language models are not yet genuine reasoners",
    423       "authors": ["B. Jiang", "Y. Xie", "Z. Hao"],
    424       "year": 2024,
    425       "arxiv_id": "2406.11050",
    426       "relevance": "Evaluates LLM reasoning limitations and token sensitivity, directly relevant to AI capability assessment."
    427     },
    428     {
    429       "title": "GSM-Symbolic: Understanding the limitations of mathematical reasoning in large language models",
    430       "authors": ["I. Mirzadeh", "K. Alizadeh", "H. Shahrokhi"],
    431       "year": 2024,
    432       "arxiv_id": "2410.05229",
    433       "relevance": "Documents LLM limitations in reasoning tasks with sensitivity to surface-level variations."
    434     },
    435     {
    436       "title": "Are emergent abilities of large language models a mirage?",
    437       "authors": ["R. Schaeffer", "B. Miranda", "S. Koyejo"],
    438       "year": 2023,
    439       "relevance": "Questions emergent reasoning capabilities of LLMs, relevant to LLM capability evaluation."
    440     },
    441     {
    442       "title": "Premise order matters in reasoning with large language models",
    443       "authors": ["X. Chen", "R. A. Chi", "X. Wang", "D. Zhou"],
    444       "year": 2024,
    445       "arxiv_id": "2402.08939",
    446       "relevance": "Directly related finding that premise order affects LLM reasoning performance."
    447     },
    448     {
    449       "title": "Beyond instruction following: Evaluating inferential rule following of large language models",
    450       "authors": ["W. Sun", "C. Zhang", "X. Zhang"],
    451       "year": 2024,
    452       "arxiv_id": "2407.08440",
    453       "relevance": "Evaluates LLM rule-following capabilities, directly relevant to LLM evaluation methodology."
    454     },
    455     {
    456       "title": "FOLIO: Natural language reasoning with first-order logic",
    457       "authors": ["S. Han", "H. Schoelkopf", "Y. Zhao"],
    458       "year": 2022,
    459       "arxiv_id": "2209.00840",
    460       "relevance": "Benchmark for LLM reasoning with formal logic, relevant to LLM capability evaluation."
    461     }
    462   ]
    463 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs