scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (29908B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Evaluating Embeddable Language Models in Verbalizing Rule-based Inferences through Justifications",
      6     "authors": [
      7       "Bastien Dussard",
      8       "Aurélie Clodic",
      9       "Guillaume Sarthou"
     10     ],
     11     "year": 2025,
     12     "venue": "IEEE RO-MAN 2025",
     13     "arxiv_id": null,
     14     "doi": "10.1109/RO-MAN63969.2025.11217601"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "Abstract claims — order significantly decreases performance, adding rule significantly improves performance — are supported by ANOVA results with p-values in the results section.",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Causal claims ('shuffling decreases performance', 'adding rule improves performance') are justified by controlled experimental manipulation: the conditions are the only variables changed while holding others constant, and statistical tests confirm significance.",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Paper explicitly bounds scope: evaluates only embeddable models (≤14GB), only action-related SWRL rules, and states 'the conclusions derived from this study could reasonably be applied to larger models' as a hypothesis rather than a claim. Conclusion notes evaluation was on 'robotic action-oriented ontology but results should be comparable with other ontology knowledge bases.'",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Paper discusses alternative explanations: Mistral models' unexpected correctness increase with complexity attributed to more compact explanations; gemma2:9b's correctness drop with rule condition discussed. Shuffle effect attributed to proximity-induced concept misattribution.",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper clearly defines what it measures (correctness and completeness of verbalized explanations) and does not inflate these into broader claims about LM reasoning ability or general verbalization quality. Claims match measurement granularity.",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No dedicated limitations section. Some limitations are mentioned in the conclusion (e.g., binary correctness metric limitations, single ontology domain) but not in a structured section.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No specific threats to validity discussed. The conclusion mentions the correctness metric could be finer and the ontology domain could differ, but these are more future work than threat analysis.",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Scope explicitly bounded: embeddable models only (≤14GB), action-oriented robotic ontology, SWRL rule-based inferences only. Conclusion states: 'This evaluation was conducted on a robotic action-oriented ontology but the results should be comparable with other ontology knowledge bases.'",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Funding acknowledged: 'This work was supported by the ELSA (ANR-21-CE33-0019) and the HumFleet (ANR-23-CE33-0003) projects.' These are French national research grants.",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All authors affiliated with LAAS-CNRS, Université de Toulouse, CNRS, Toulouse, France. No conflicts with evaluated models (all are open-source models from Meta, Google, Mistral).",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "ANR (French National Research Agency) grants have no financial interest in the performance of any specific language model evaluated.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement or financial interests declaration present in the paper.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms defined: embeddable models (offline, <14GB), SWRL rules, ontologies, justifications, correctness (logically valid explanation), completeness (% concepts translated).",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Stated explicitly: 'main contribution is an evaluation of embeddable Language Models on a translation task from ontology-formatted inferences to natural language'—contribution as reference benchmark clear.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section III engages with ontology verbalization (CNL vs LLM approaches) and justification explanation work, positioning contribution as first 'baseline case' evaluation without fine-tuning.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "GitHub repository URL provided in Section IV.A: 'Our code and dataset are available online' with link to https://github.com/RIS-WITH/inference_explanation_benchmark.",
    123           "source": "opus"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "The generated dataset is stated to be available at the same GitHub repository referenced in Section IV.A.",
    129           "source": "opus"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "No environment specifications, requirements.txt, or dependency versions mentioned beyond stating Ollama was used to run models locally.",
    135           "source": "opus"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No step-by-step reproduction instructions provided in the paper. Only states that code and dataset are available online.",
    141           "source": "opus"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "Standard deviations are reported for completeness in Appendix Table I, but no confidence intervals or error bars are provided for correctness or for the ANOVA estimates.",
    149           "source": "opus"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "Three-way ANOVA performed with p-values reported for all main effects: complexity medium (-11.9%, p<6.7e-14), complexity hard (-18.1%, p<2.0e-16), shuffle (-8.9%, p<3.6e-10), rule (+10.0%, p<1.4e-2).",
    155           "source": "opus"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Effect sizes reported as percentage point changes relative to the baseline condition: e.g., shuffle decreases completeness by 8.9% and correctness by 20.0%; rule improves correctness by 10.0%. Provides enough context to interpret magnitude.",
    161           "source": "opus"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "No justification for why 20 variations per inference were chosen, no power analysis. The dataset size (720 inferences) is stated but not justified.",
    167           "source": "opus"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "Standard deviations reported for completeness across variations in Appendix Table I (e.g., llama3.2:3b baseline: 63.7 ± 15.2%). Distributions also shown visually in histogram figures.",
    173           "source": "opus"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "The baseline condition (ordered justifications) serves as the reference against which shuffle and rule conditions are compared. Six models compared against each other.",
    181           "source": "opus"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Models selected are contemporary at time of study: Llama 3.1/3.2, Gemma 2, Mistral-Nemo/Small. Paper notes deepseek-r1:7b did not exist at time of study.",
    187           "source": "opus"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "The three conditions (baseline, shuffle, rule) and three complexity levels function as ablation-like manipulations, systematically varying factors to measure their individual impact.",
    193           "source": "opus"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Two metrics used: correctness (boolean, whether the explanation is logically valid) and completeness (percentage of concepts translated).",
    199           "source": "opus"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "A single expert annotator manually evaluated all 720 × 6 = 4320 model outputs for correctness and completeness, following detailed annotation guidelines (Section IV.B).",
    205           "source": "opus"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": false,
    210           "justification": "No held-out test set design. All 720 inferences are used for evaluation; the 4-shot CoT prompting examples are stated to come from outside the evaluated set and not to overlap with the evaluated questions, but there is no train/dev/test split.",
    211           "source": "opus"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Results broken down per model (6 models), per condition (3 conditions), per complexity level (3 levels), and per rule type (4 SWRL rules). Full table in Appendix I.",
    217           "source": "opus"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Qualitative failure modes discussed: models correlating unrelated concepts, Mistral creating links between unrelated semantic triples, models including individual names. Example in Fig. 3 shows a failed explanation with annotations.",
    223           "source": "opus"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Several negative results reported: shuffle condition significantly hurts performance, adding rule does NOT significantly improve completeness (p=0.31), gemma2:9b correctness drops with rule condition, smaller models barely affected by shuffle because they already perform poorly.",
    229           "source": "opus"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Exact model identifiers provided with sizes: llama3.2:3b, llama3.1:8b, gemma2:2b, gemma2:9b, mistral-nemo:12b, mistral-small:22b. These are Ollama model tags which specify the version and quantization.",
    237           "source": "opus"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Full task prompt and one of four CoT examples shown in Fig. 2. The system prompt, output requirements, and example format are all provided verbatim.",
    243           "source": "opus"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "No mention of temperature, top-p, or other generation hyperparameters. Only mentions that answers were truncated at the first newline character.",
    249           "source": "opus"
    250         },
    251         "scaffolding_described": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "No agentic scaffolding used. Models are queried directly via Ollama with a single prompt per inference.",
    255           "source": "opus"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Section IV.A documents the full dataset generation process: 4 rules × 3 complexity levels × 20 variations × 3 conditions = 720 inferences. Variation generation process (anonymization, random numbers, manual assessment of identifiers) is described step-by-step.",
    261           "source": "opus"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "Code and dataset available at GitHub repository. The generated dataset of 720 inference/justification pairs and model outputs should be verifiable.",
    269           "source": "opus"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Section IV.A describes the full dataset generation process: 4 SWRL rules designed with specific antecedent structures, 3 complexity levels with specified triple counts (10, 14, 17), 20 variations with anonymization, 3 conditions.",
    275           "source": "opus"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants recruited for the study. The dataset is synthetically generated and the single annotator is an expert (likely an author). No standard benchmark used.",
    281           "source": "opus"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "Full pipeline documented: rule design → complexity levels → variation generation (concept substitution, anonymization, numerical randomization) → condition application → query construction with CoT prompting → output truncation at first newline.",
    287           "source": "opus"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "No training data cutoff dates stated for any of the 6 evaluated models. This matters because the ontology concepts used could overlap with training data.",
    295           "source": "opus"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "No discussion of whether the generated dataset concepts or ontology structures might overlap with model training data. The related work section notes data contamination risk in prior work [18][19] but does not address it for their own evaluation.",
    301           "source": "opus"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": true,
    306           "justification": "The dataset is synthetically generated with anonymized identifiers (random strings replacing meaningful names) and randomized numerical values, which substantially mitigates contamination risk. The paper explicitly notes prior work's contamination issues and designs their evaluation to avoid evaluating on well-known ontologies.",
    307           "source": "opus"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants in the study. The single expert annotator is part of the research team, not a study participant.",
    315           "source": "opus"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants. The study evaluates language models on a synthetic dataset.",
    321           "source": "opus"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants.",
    327           "source": "opus"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants.",
    333           "source": "opus"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants.",
    339           "source": "opus"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants.",
    345           "source": "opus"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants.",
    351           "source": "opus"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "No inference cost, latency, or time-per-query reported despite running 4320 model queries (720 × 6 models) locally.",
    359           "source": "opus"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "No mention of hardware used, GPU type, total compute time, or computational budget. Only states models were run via Ollama locally and must not exceed 14GB.",
    365           "source": "opus"
    366         }
    367       },
    368       "experimental_rigor": {
    369         "seed_sensitivity_reported": {
    370           "applies": true,
    371           "answer": false,
    372           "justification": "No mention of random seeds or seed sensitivity. The 20 variations per inference test token sensitivity but not model randomness across seeds.",
    373           "source": "opus"
    374         },
    375         "number_of_runs_stated": {
    376           "applies": true,
    377           "answer": false,
    378           "justification": "No explicit statement of how many times each query was run. Appears to be single-run per query, but not stated.",
    379           "source": "opus"
    380         },
    381         "hyperparameter_search_budget": {
    382           "applies": true,
    383           "answer": false,
    384           "justification": "No hyperparameter search mentioned. Default Ollama settings apparently used but not stated.",
    385           "source": "opus"
    386         },
    387         "best_config_selection_justified": {
    388           "applies": false,
    389           "answer": false,
    390           "justification": "No configuration selection — all models evaluated in their default configuration. No best-config selection needed.",
    391           "source": "opus"
    392         },
    393         "multiple_comparison_correction": {
    394           "applies": true,
    395           "answer": false,
    396           "justification": "Multiple statistical comparisons made (complexity levels, conditions, models) via ANOVA but no mention of multiple comparison correction (e.g., Bonferroni, Tukey HSD).",
    397           "source": "opus"
    398         },
    399         "self_comparison_bias_addressed": {
    400           "applies": false,
    401           "answer": false,
    402           "justification": "The paper evaluates existing open-source models, not their own system. No self-comparison bias applies.",
    403           "source": "opus"
    404         },
    405         "compute_budget_vs_performance": {
    406           "applies": true,
    407           "answer": false,
    408           "justification": "Model sizes range from 2b to 22b (11x difference) but performance is not explicitly plotted or discussed as a function of compute budget. Size effects are discussed qualitatively but not quantified in compute terms.",
    409           "source": "opus"
    410         },
    411         "benchmark_construct_validity": {
    412           "applies": true,
    413           "answer": true,
    414           "justification": "The paper explicitly discusses what the benchmark measures (ability to translate ontology inferences to natural language) and the limitations of its correctness metric as binary. Related work positions the benchmark as a 'reference evaluation' filling a gap in baseline assessments.",
    415           "source": "opus"
    416         },
    417         "scaffold_confound_addressed": {
    418           "applies": false,
    419           "answer": false,
    420           "justification": "No scaffolding involved. All models receive the same prompt directly via Ollama.",
    421           "source": "opus"
    422         }
    423       },
    424       "data_leakage": {
    425         "temporal_leakage_addressed": {
    426           "applies": true,
    427           "answer": false,
    428           "justification": "No discussion of whether models may have seen similar ontology structures or SWRL rules during training, despite training data cutoffs not being stated.",
    429           "source": "opus"
    430         },
    431         "feature_leakage_addressed": {
    432           "applies": true,
    433           "answer": true,
    434           "justification": "The anonymization of identifiers (replacing meaningful names with random strings) and randomization of numerical values explicitly addresses feature leakage by removing semantic cues that models might exploit from training knowledge.",
    435           "source": "opus"
    436         },
    437         "non_independence_addressed": {
    438           "applies": true,
    439           "answer": false,
    440           "justification": "The 20 variations per inference share the same underlying structure (same rule, same complexity). Non-independence of these structurally similar examples is not discussed, though this could inflate statistical significance.",
    441           "source": "opus"
    442         },
    443         "leakage_detection_method": {
    444           "applies": true,
    445           "answer": false,
    446           "justification": "No formal leakage detection method applied. Anonymization is a prevention strategy but no detection test was run.",
    447           "source": "opus"
    448         }
    449       }
    450     }
    451   },
    452   "claims": [
    453     {
    454       "claim": "Shuffling the order of justifications significantly decreases model performance on the verbalization task",
    455       "evidence": "Fig. 6 shows shuffle condition shifts distributions left (worse); -8.9% median completeness drop (p<3.6e-10), -20.0% correctness drop (p<5.6e-6)",
    456       "supported": "strong"
    457     },
    458     {
    459       "claim": "Adding the SWRL rule as additional context significantly improves correctness",
    460       "evidence": "Fig. 7 shows +10.0% correctness improvement (p<1.4e-2); most models show correctness dashed lines shifted right; completeness not significantly affected (p=0.31)",
    461       "supported": "strong"
    462     },
    463     {
    464       "claim": "Longer justifications (higher complexity) reduce both correctness and completeness",
    465       "evidence": "Fig. 5 shows complexity impact: medium -11.9% (p<6.7e-14), hard -18.1% (p<2.0e-16) completeness; correctness drops for smaller models",
    466       "supported": "strong"
    467     },
    468     {
    469       "claim": "Larger models perform better on verbalization than smaller models",
    470       "evidence": "Fig. 4 shows larger versions have higher mean completeness, lower SD, and better correctness dashed lines; systematic improvement across model pairs (llama, gemma, mistral)",
    471       "supported": "moderate"
    472     },
    473     {
    474       "claim": "Model family affects performance independent of size",
    475       "evidence": "Mistral models show different patterns than Llama/Gemma of same/similar size; mistral-nemo scores similarly to gemma2:9b despite being 3B larger",
    476       "supported": "moderate"
    477     },
    478     {
    479       "claim": "Language models are sensitive to token variations in ontology verbalization",
    480       "evidence": "Fig. 4 shows histogram distributions (not single values) for completeness across 20 semantic-equivalent variations, confirming sensitivity to token changes",
    481       "supported": "strong"
    482     }
    483   ],
    484   "methodology_tags": [
    485     "benchmark-eval",
    486     "experimental"
    487   ],
    488   "key_findings": "Embeddable language models' performance on translating ontology inferences to natural language is significantly degraded (8.9% median completeness drop) when justification order is randomized, and their correctness is significantly improved (10.0%) when the SWRL rule is provided as context. Larger models achieve higher completeness (concept coverage) with less variability, though correctness gains don't uniformly follow model size across families. Longer justifications reduce completeness by up to 18.1%, while correctness drops for smaller models, indicating that length sensitivity affects small and large models differently.",
    489   "red_flags": [
    490     {
    491       "flag": "Single annotator",
    492       "detail": "No inter-rater reliability check; a single expert annotator evaluated all 4320 model outputs (720 inferences × 6 models) with subjective metrics (correctness/completeness), risking systematic bias."
    493     },
    494     {
    495       "flag": "No hyperparameter specification",
    496       "detail": "No temperature, top-p, max_tokens, or other LLM hyperparameters reported; implicit Ollama defaults used and not documented, limiting reproducibility."
    497     },
    498     {
    499       "flag": "No confidence intervals",
    500       "detail": "Only standard deviations reported; no CIs computed for main metrics, limiting precision of uncertainty estimates."
    501     },
    502     {
    503       "flag": "Synthetic dataset limits generalizability",
    504       "detail": "Evaluation on four simplified robot action rules may not reflect real-world ontology complexity, diversity, or verbalization patterns."
    505     },
    506     {
    507       "flag": "No cost analysis for 'embeddable' claims",
    508       "detail": "Despite title emphasizing embeddable models, no inference time, latency, or GPU/CPU memory-in-use reported; practical impact unclear."
    509     },
    510     {
    511       "flag": "Unexplained mistral behavior",
    512       "detail": "Mistral-nemo shows increasing correctness with increasing complexity (opposite of hypothesis) without adequate explanation beyond 'compact explanations.'"
    513     },
    514     {
    515       "flag": "Binary correctness metric",
    516       "detail": "Authors acknowledge correctness could be more nuanced than binary; no graded correctness or semantic similarity scoring used."
    517     }
    518   ],
    519   "cited_papers": [
    520     {
    521       "title": "Large language models for robotics: Opportunities, challenges, and perspectives",
    522       "relevance": "Motivates use of LLMs in robotic systems and discusses reasoning limitations."
    523     },
    524     {
    525       "title": "Do as I can, not as I say: Grounding language in robotic affordances",
    526       "relevance": "Relevant to grounding LLM outputs in robot capabilities and decision-making."
    527     },
    528     {
    529       "title": "FOLIO: Natural language reasoning with first-order logic",
    530       "relevance": "Addresses LM limitations in formal reasoning, supporting motivation for knowledge-based systems."
    531     },
    532     {
    533       "title": "Knowledge graph and knowledge reasoning: A systematic review",
    534       "relevance": "Surveys knowledge-based systems and reasoning approaches evaluated in this work."
    535     },
    536     {
    537       "title": "A peek into token bias: Large language models are not yet genuine reasoners",
    538       "relevance": "Directly supports hypothesis that LMs are sensitive to token variations in reasoning tasks."
    539     },
    540     {
    541       "title": "Premise order matters in reasoning with large language models",
    542       "relevance": "Prior work demonstrating order sensitivity in LM reasoning, motivates shuffle condition."
    543     },
    544     {
    545       "title": "Beyond instruction following: Evaluating inferential rule following of large language models",
    546       "relevance": "Evaluates LM ability to follow logical rules with explicit context, similar to rule condition tested."
    547     },
    548     {
    549       "title": "Verbalizing OWL in Attempto Controlled English",
    550       "relevance": "Prior work on ontology verbalization using controlled natural language."
    551     }
    552   ],
    553   "engagement_factors": {
    554     "practical_relevance": {
    555       "score": 2,
    556       "justification": "Verbalization of ontology justifications useful for robotics HRI systems but niche application; most ML practitioners won't need this."
    557     },
    558     "surprise_contrarian": {
    559       "score": 1,
    560       "justification": "Findings largely confirm expected hypotheses (order matters, rules help, larger models better); no counterintuitive discoveries beyond mistral anomaly."
    561     },
    562     "fear_safety": {
    563       "score": 0,
    564       "justification": "No AI safety or risk concerns raised; the task is a neutral translation problem without misalignment or capability risks."
    565     },
    566     "drama_conflict": {
    567       "score": 0,
    568       "justification": "No controversy, competing claims, or dramatic results; straightforward evaluation paper."
    569     },
    570     "demo_ability": {
    571       "score": 2,
    572       "justification": "Code/dataset released; can reproduce results locally; synthetic task lacks visual or user-facing demo appeal."
    573     },
    574     "brand_recognition": {
    575       "score": 0,
    576       "justification": "LAAS-CNRS is a solid research institution but not a flagship lab; no famous authors or institutions that would draw attention."
    577     }
    578   },
    579   "hn_data": {
    580     "threads": [],
    581     "top_points": 0,
    582     "total_points": 0,
    583     "total_comments": 0
    584   }
    585 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs