scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24767B)
      1 {
      2   "paper": {
      3     "title": "Beyond Semantic Entropy: Boosting LLM Uncertainty Quantification with Pairwise Semantic Similarity",
      4     "authors": ["Dang Nguyen", "Ali Payani", "Baharan Mirzasoleiman"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2506.00245"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The abstract states 'Our code is available at https://github.com/BigML-CS-UCLA/SNNE' — a concrete repository URL is provided."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses publicly available datasets: SQuAD, TriviaQA, NaturalQuestion, Svamp, BioASQ, XSUM, AESLC, and WMT-14, all downloaded from Hugging Face. No proprietary data was collected."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "Appendix B mentions 'NVIDIA RTX A6000 GPUs' and links to Google's ROUGE implementation and Hugging Face, but no requirements.txt, Dockerfile, or detailed library versions are provided in the paper."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper. The code repository link is given, but the paper itself does not contain a 'Reproducing Results' section or specific commands to run."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Tables 5 and 6 in the appendix report results with ± notation (e.g., '0.830 ± 0.003'). However, the main results in Tables 2-4 and Figures 2-3 report only point estimates without error bars or confidence intervals."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims SNNE/WSNNE 'consistently outperform' baselines across multiple settings, but no statistical significance tests (p-values, t-tests, etc.) are reported to support these comparative claims."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports absolute AUROC, AUARC, and PRR scores for all methods with baseline context (e.g., Table 2 shows WSNNE at 0.83 vs SE at 0.79 on Llama-3.1-8B). Figure 1 right shows 'AUROC(SNNE) - AUROC(SE)' differences explicitly. This provides enough context to assess effect magnitude."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is given for why 5 QA datasets, 2 summarization datasets, and 2 translation datasets were chosen, nor is there a power analysis or discussion of whether this sample is sufficient for the claims being made."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Appendix B states 'We conduct each experiment three times using NVIDIA RTX A6000 GPUs,' and Tables 5 and 6 report standard deviations across runs (e.g., '0.830 ± 0.003'). However, the main tables (2-4) omit variance, which is a partial gap."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares against numerous baselines including white-box methods (KLEfull, SE, NE, Eigenscore, SAR) and black-box methods (KLEheat, DSE, pTrue, NumSet, LexSim, SumEigv, Deg, Eccen, LUQ-Pair)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include recent methods: KLE (2024), SAR (2023), LUQ (2024), graph-based methods (Lin et al., 2024), and the original SE (2024, published in Nature). These represent the current state of the art in LLM uncertainty quantification."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Appendix C.2 contains ablation studies on (1) the choice of similarity function (ROUGE-L, entail, embed) in Table 5, (2) the scale factor τ in Table 6, and (3) the effect of number of generated answers and temperature in Figure 4."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper uses AUROC and AUARC for QA tasks, and PRR with both ROUGE-L and BERTScore as correctness measures for summarization and translation tasks."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is a method for uncertainty quantification evaluated on automated metrics (AUROC, AUARC, PRR). Human evaluation of the system's outputs is not relevant to the claims — the method produces uncertainty scores, not generated text."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper uses standard benchmark test splits (SQuAD, TriviaQA, NaturalQuestion, Svamp, BioASQ, XSUM, AESLC, WMT-14) following established protocols from SE (Farquhar et al., 2024) and LM-Polygraph. The hyperparameter τ is tuned separately (selected from {0.1, 1, 10, 100}), though the paper does not explicitly describe a separate validation set for this selection."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Tables 2-4 provide per-model breakdowns across 6 models. Figure 1 right shows per-cluster-count breakdowns. Tables 5-6 provide per-task breakdowns for summarization and translation. The main figures (2-3) show aggregated averages, but the appendix provides detailed per-setting results."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 7 (Limitations) discusses specific scenarios where the method may fail: multi-sentence/paragraph generation, non-text data formats (math expressions, LaTeX, code), and the additional inference cost of sampling multiple answers."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The ablation in Table 5 shows that different similarity functions (entail, embed) sometimes perform worse than ROUGE-L. Figure 4 shows performance degradation at extreme temperatures (0.5 and 2.0). The paper also notes that pTrue performs poorly in several settings (e.g., 0.56 on Phi-3-mini in Table 2)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims (1) SNNE generalizes SE (supported by Theorems 4.1, 4.2), (2) effectiveness across Phi3 and Llama3 on QA, summarization, and translation (supported by Figures 2-3 and Tables 2-4), and (3) code availability (GitHub link provided). All claims are supported by results in the paper."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper claims that accounting for intra- and inter-cluster similarity improves uncertainty estimation. This is supported by ablation studies (Table 5 on similarity functions, Table 6 on τ, Figure 4 on generation parameters) and theoretical analysis (Theorems 4.1-4.2 showing SE is a special case). The ablation design provides controlled single-variable manipulation."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper bounds its claims to the tested LLMs and tasks. The Limitations section (Section 7) explicitly states the method was not tested on multi-sentence generation, mathematical expressions, LaTeX, or code. Claims are framed as improvements over SE 'across question answering, summarization, and translation' — the specific tasks tested."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for why SNNE outperforms SE. For example, the improvement could be partly due to ROUGE-L being a better similarity metric than NLI-based clustering, rather than the SNNE formulation itself. The ablation on similarity functions (Table 5) partially addresses this but is not framed as ruling out alternatives."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper specifies 'Llama-3.1-8B', 'Phi-3-mini-4k-instruct', 'Llama2-7B', 'Llama2-13B', 'gemma-2-2b-it', and 'Mistral-Nemo-Instruct-2407' in Appendix B. These include specific size and variant identifiers sufficient to identify exact model weights on Hugging Face, though no snapshot dates are given."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Table 1 provides the instruction prompts used for each task (QA, XSUM, AESLC, WMT-14 de-en, WMT-14 fr-en). The paper also states '5-shot in-context demonstrations for QA tasks' following SE's protocol, and '0-shot setting' for TS/MT tasks."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper reports temperature (T=0.1 for correctness, T=1.0 for uncertainty sampling), number of generated answers (10), scale factor τ (searched over {0.1, 1, 10, 100}, default 1), correctness threshold (F1 > 50% for QA), and the specific similarity function (ROUGE-L). See Appendix B."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The method is a post-hoc uncertainty quantification technique that operates on sampled model outputs."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix B describes the generation pipeline: one answer at T=0.1 for correctness assessment, 10 answers at T=1.0 for uncertainty estimation, with 5-shot demonstrations for QA and 0-shot for TS/MT. Correctness measures and thresholds are specified. Datasets are described with references to their original sources."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 7 is titled 'Limitations' and provides substantive discussion of three specific limitations: multi-sentence generation, non-text data formats, and inference cost of sampling multiple answers."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 7 discusses specific threats: (1) the method is untested on multi-sentence or paragraph-length outputs, (2) different data formats like math expressions and code require new similarity functions, and (3) the method inherits the inference cost limitation of all sampling-based UQ methods. These are specific to this study, not generic disclaimers."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7 explicitly states what the results do NOT show: 'we did not investigate uncertainty estimation in cases where the model generates multiple sentences or an entire paragraph' and 'for different data formats such as mathematical expressions, LaTeX equations, or code, our method requires further considerations.'"
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The paper does not release raw experimental data (e.g., per-question uncertainty scores, generated answers, or intermediate computations). Only aggregated results are reported in tables and figures."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The data collection procedure is well-described: standard benchmark datasets are used (SQuAD, TriviaQA, NaturalQuestion, Svamp, BioASQ, XSUM, AESLC, WMT-14), downloaded from Hugging Face. The generation procedure (temperatures, number of samples) is documented in Appendix B."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. All data comes from standard public benchmarks."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from data to results is documented: (1) download datasets from Hugging Face, (2) generate one answer at T=0.1 for correctness, (3) sample 10 answers at T=1.0, (4) compute similarity using ROUGE-L, (5) calculate SNNE/WSNNE, (6) evaluate using AUROC/AUARC/PRR. See Section 5.1 and Appendix B."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The Acknowledgements section states: 'This research was partially supported by the National Science Foundation CAREER Award 2146492 and Cisco Systems.'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: Dang Nguyen and Baharan Mirzasoleiman at UCLA CS, Ali Payani at Cisco Systems Inc. The Cisco affiliation is prominently displayed."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "NSF is an independent funder with no stake in the outcome. Cisco Systems is both a funder and employer of one author (Ali Payani), but the paper does not evaluate any Cisco product. The method is model-agnostic and evaluated on open-source LLMs (Llama, Phi, Gemma, Mistral), so Cisco has no direct financial interest in the specific results."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is included in the paper. While one author is employed by Cisco (a funder), there is no explicit declaration addressing potential conflicts."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper uses pre-trained LLMs (Llama-3.1-8B, Phi-3-mini, etc.) evaluated on public benchmarks (SQuAD, TriviaQA, etc.) but does not state the training data cutoff dates for any of these models."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "The paper does not discuss whether the benchmark datasets (SQuAD, TriviaQA, NaturalQuestion, etc.) may have been in the training data of the evaluated models. These are well-known public datasets that pre-date the models."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "All benchmark datasets used (SQuAD 2018, TriviaQA 2017, NaturalQuestion 2019, Svamp 2021, BioASQ 2023, XSUM 2018, AESLC 2019, WMT-14 2014) were published before the training cutoffs of the evaluated models (Llama 3.1, Phi-3, etc.). Contamination risk is not discussed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study. It is a benchmark evaluation of uncertainty quantification methods."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "The paper acknowledges the inference cost of sampling multiple answers in Section 7 (Limitations) but does not quantify the actual cost (e.g., wall-clock time, tokens consumed, or API costs per example). Figure 4 explores the trade-off between number of generated answers and performance but does not report concrete cost figures."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "The paper mentions using 'NVIDIA RTX A6000 GPUs' and running experiments 3 times, but does not state the total GPU hours, training time, or computational budget."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "SNNE and WSNNE consistently outperform existing white-box and black-box UQ baselines across QA, summarization, and translation tasks.",
    286       "evidence": "Tables 2-4 show SNNE/WSNNE achieving highest AUROC, AUARC, and PRR scores across 6 models and 9 datasets. E.g., on Llama-3.1-8B QA, WSNNE achieves 0.83 AUROC vs 0.80 for KLEfull and 0.79 for SE (Table 2). On Phi-3-mini summarization with BERTScore, WSNNE achieves 0.65 PRR vs 0.54 for SE (Table 4).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "SNNE generalizes semantic entropy — (D)SE is a special case of (W)SNNE under specific similarity metrics.",
    291       "evidence": "Theorems 4.1 and 4.2 with proofs in Appendix A formally show that DSE and SE are recovered by SNNE and WSNNE respectively when specific similarity functions are used (constant intra-cluster + zero inter-cluster).",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "SE becomes less effective as modern LLMs generate longer one-sentence responses, because it overlooks intra-cluster and inter-cluster similarity.",
    296       "evidence": "Figure 1 left shows Spearman correlation of 0.83 between output length and number of semantic clusters on SQuAD with Llama-3.1-8B. Figure 1 right shows SNNE's advantage over SE increases with number of clusters. Section 3 explains theoretically why DSE produces constant entropy when M approaches n.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "ROUGE-L is the best similarity function for SNNE across most tasks.",
    301       "evidence": "Table 5 in Appendix C shows ROUGE-L outperforms or ties with embedding-based and entailment-based similarity functions across summarization and translation tasks on Phi-3-mini.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "SNNE is robust to the choice of scale factor τ.",
    306       "evidence": "Table 6 shows minimal variation in AUROC and PRR scores across τ ∈ {0.1, 1, 10, 100} for different tasks and models (e.g., AUROC ranges from 0.830 to 0.833 on Llama-3.1-8B QA).",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval", "theoretical"],
    311   "key_findings": "The paper introduces Semantic Nearest Neighbor Entropy (SNNE), a black-box uncertainty quantification method for LLMs that accounts for both intra- and inter-cluster semantic similarities between generated answers, addressing limitations of Semantic Entropy (SE) for longer outputs. Theoretically, SNNE generalizes SE as a special case. Empirically, SNNE and its white-box variant WSNNE consistently outperform SE and other baselines across 6 LLMs, 9 datasets, and 3 text generation tasks (QA, summarization, translation), with the advantage being most pronounced when models generate longer responses that produce more semantic clusters.",
    312   "red_flags": [
    313     {
    314       "flag": "No statistical significance tests",
    315       "detail": "The paper claims SNNE/WSNNE 'consistently outperform' baselines but reports no significance tests. Many improvements are small (e.g., 0.01-0.03 AUROC points in Table 2), and standard deviations in Tables 5-6 suggest these differences may not all be statistically significant."
    316     },
    317     {
    318       "flag": "Main results lack error bars",
    319       "detail": "While Tables 5 and 6 in the appendix report ± values, the main result tables (2-4) and figures (2-3) report only point estimates. This selective reporting of uncertainty measures makes it harder to assess whether differences are meaningful."
    320     },
    321     {
    322       "flag": "Benchmark contamination not addressed",
    323       "detail": "All benchmark datasets predate the models' training cutoffs, creating contamination risk. Although the paper evaluates uncertainty calibration rather than task accuracy, contamination could still affect the relationship between model confidence and correctness, potentially biasing the AUROC/AUARC/PRR comparisons."
    324     }
    325   ],
    326   "cited_papers": [
    327     {
    328       "title": "Detecting hallucinations in large language models using semantic entropy",
    329       "authors": ["Sebastian Farquhar", "Jannik Kossen", "Lorenz Kuhn", "Yarin Gal"],
    330       "year": 2024,
    331       "relevance": "The primary baseline and motivation for this work — the Semantic Entropy method published in Nature that SNNE generalizes."
    332     },
    333     {
    334       "title": "Kernel language entropy: Fine-grained uncertainty quantification for LLMs from semantic similarities",
    335       "authors": ["Alexander Nikitin", "Jannik Kossen", "Yarin Gal", "Pekka Marttinen"],
    336       "year": 2024,
    337       "arxiv_id": "2405.20003",
    338       "relevance": "An earlier approach to fine-grained uncertainty quantification using kernel methods, serving as a key baseline."
    339     },
    340     {
    341       "title": "Generating with confidence: Uncertainty quantification for black-box large language models",
    342       "authors": ["Zhen Lin", "Shubhendu Trivedi", "Jimeng Sun"],
    343       "year": 2024,
    344       "relevance": "Graph-based UQ methods for black-box LLMs that serve as baselines (NumSet, SumEigv, Deg, Eccen)."
    345     },
    346     {
    347       "title": "Shifting attention to relevance: Towards the predictive uncertainty quantification of free-form large language models",
    348       "authors": ["Jinhao Duan"],
    349       "year": 2023,
    350       "arxiv_id": "2307.01379",
    351       "relevance": "SAR method that uses soft aggregation of word/sentence-level probabilities weighted by semantic similarity — a key white-box baseline."
    352     },
    353     {
    354       "title": "LUQ: Long-text uncertainty quantification for LLMs",
    355       "authors": ["Caiqi Zhang", "Fangyu Liu", "Marco Basaldella", "Nigel Collier"],
    356       "year": 2024,
    357       "arxiv_id": "2403.20279",
    358       "relevance": "Multi-sentence UQ method using atomic uncertainty scores; SNNE asymptotically recovers LUQ-Pair and provides better atomic scores."
    359     },
    360     {
    361       "title": "A survey of uncertainty estimation in LLMs: Theory meets practice",
    362       "authors": ["Hsiu-Yuan Huang", "Yutong Yang", "Zhaoxi Zhang", "Sanwoo Lee", "Yunfang Wu"],
    363       "year": 2024,
    364       "arxiv_id": "2410.15326",
    365       "relevance": "Comprehensive survey of LLM uncertainty estimation methods, providing context for the field this paper contributes to."
    366     },
    367     {
    368       "title": "Benchmarking uncertainty quantification methods for large language models with LM-Polygraph",
    369       "authors": ["Roman Vashurin"],
    370       "year": 2025,
    371       "relevance": "The LM-Polygraph benchmark used for evaluation of summarization and translation tasks, and source of the SAR baseline's prior state-of-the-art claim."
    372     },
    373     {
    374       "title": "INSIDE: LLMs' internal states retain the power of hallucination detection",
    375       "authors": ["Chao Chen", "Kai Liu", "Ze Chen", "Yi Gu", "Yue Wu", "Mingyuan Tao", "Zhihang Fu", "Jieping Ye"],
    376       "year": 2024,
    377       "arxiv_id": "2402.03744",
    378       "relevance": "EigenScore method using covariance of output embeddings for hallucination detection, serving as a white-box baseline."
    379     },
    380     {
    381       "title": "Language models (mostly) know what they know",
    382       "authors": ["Saurav Kadavath"],
    383       "year": 2022,
    384       "arxiv_id": "2207.05221",
    385       "relevance": "pTrue method for LLM self-evaluation of uncertainty, serving as a baseline in the experiments."
    386     }
    387   ]
    388 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs