ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (31171B)


      1 {
      2   "scan_version": 2,
      3   "active_modules": ["experimental_rigor", "data_leakage"],
      4   "paper": {
      5     "title": "Multi-Level Explanations for Generative Language Models",
      6     "authors": [
      7       "Lucas Monteiro Paes",
      8       "Dennis Wei",
      9       "Hyo Jin Do",
     10       "Hendrik Strobelt",
     11       "Ronny Luss",
     12       "Amit Dhurandhar",
     13       "Manish Nagireddy",
     14       "Karthikeyan Natesan Ramamurthy",
     15       "Prasanna Sattigeri",
     16       "Werner Geyer",
     17       "Soumya Ghosh"
     18     ],
     19     "year": 2024,
     20     "venue": "Annual Meeting of the Association for Computational Linguistics",
     21     "arxiv_id": "2403.14459",
     22     "doi": "10.48550/arXiv.2403.14459"
     23   },
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The abstract states 'We open-source code for MExGen as part of the ICX360 toolkit: https://github.com/IBM/ICX360.' A working repository URL is provided."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "All evaluation datasets are publicly available: XSUM (MIT license), CNN/DailyMail (Apache-2.0), and SQuAD (CC BY-SA 4.0). The paper uses these standard benchmarks without modification."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Appendix C.6 describes the hardware environment (V100/A100 GPUs, 32 GB CPU memory) but does not provide software dependency specifications such as requirements.txt, library versions, or Dockerfile."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No step-by-step reproduction instructions are provided in the paper. The code is open-sourced but the paper itself does not include instructions for reproducing the experimental results."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Perturbation curve figures include shading showing 'one standard error above and below' (Section 4.2, Figures 4, 5). Standard errors are consistently reported across all perturbation curve visualizations."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The user study employs binomial tests (p < .05), paired t-tests, Bradley-Terry models, repeated-measures ANOVA, and Bonferroni-adjusted pairwise comparisons (Tables 6-7, Appendix E)."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "AUPC values are reported with full baselines for context (e.g., MExGen C-LIME 13.6 vs P-SHAP 9.4 on XSUM/DistilBART in Table 1). User study proportions are reported (e.g., '57% to 35%' for BERT vs Log Prob fidelity perception)."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No justification given for sample sizes: 1000 test examples for DistilBART, 500 for Flan-UL2/Llama-3, 88 participants in user study. No power analysis is discussed."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Standard errors are reported via shading in all perturbation curve figures: 'Shading shows standard error in the mean' (Figures 4, 5, 8-12)."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper compares against PartitionSHAP, CaptumLIME, and LLM self-explanations (DeepSeek-V3 and Granite-3.3). Section 4.1 describes the baseline setup in detail."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "PartitionSHAP is from the SHAP library (ongoing development), CaptumLIME from Captum (2023), and DeepSeek-V3 is a very recent LLM used for self-explanation comparison. Baselines represent current state of the art."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper systematically compares three attribution algorithms (LOO, C-LIME, L-SHAP) and six scalarizers (Log Prob, BERT, Sim, SUMM, BART, Log NLI) as different instantiations of the framework. Section 4.2 evaluates scalarizer choices and Section 4.3 compares explainer algorithms."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Multiple evaluation scalarizers are used (Log Prob, BERTScore, SUMM), along with AUPC, Spearman rank correlation, cosine similarity between scalarizer explanations, and human evaluation (fidelity perception, preference)."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 5 describes a user study with 88 ML practitioners evaluating explanation quality. Participants assessed fidelity, preference, and granularity of attribution methods."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Evaluation uses standard test/validation splits: 'first 1000 test set examples' for XSUM/CNN-DM (Section 4.1), '1000 validation set examples at random from SQuAD.' No tuning is performed on the evaluation data."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Results are broken down by dataset (XSUM, CNN/DM, SQuAD), by model (DistilBART, Flan-UL2, Llama-3, Flan-T5-Large, DeepSeek-V3, Granite-3.3), and by scalarizer. Table 1 provides full per-dataset, per-model AUPC values."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "No qualitative failure case analysis is provided showing where MExGen produces misleading or incorrect explanations. The paper acknowledges the (CNN/DM, Flan-UL2) exception (Appendix D.2) but does not analyze what makes specific examples fail."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The (CNN/DM, Flan-UL2) pair is reported as an exception where P-SHAP achieves higher AUPC (Table 1, Figure 11b). Appendix F reports that word infilling with BERT showed no 'quantifiable benefit.' The gap between MExGen and self-explanation is shown to narrow with DeepSeek-V3."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The abstract claims MExGen provides 'more faithful explanations' than alternatives including LLM self-explanations, which is supported by AUPC comparisons in Tables 1 and 2 and perturbation curves in Figures 5 and 12. The code release claim is supported by the GitHub link."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper's causal claims (e.g., 'C-LIME is more effective' than LIME) are justified through controlled comparisons: 'This is a direct comparison between LIME (represented by CaptumLIME) and our modification C-LIME, using the same number of model queries and input segmentation' (Section 4.3). Single-variable ablations are adequate."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The limitations section explicitly states: 'the results reported in Section 4 could still change in other experimental settings.' Results are tied to specific models and datasets. The paper tests across multiple datasets and models rather than claiming universal superiority."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper discusses why P-SHAP might outperform at higher token fractions: 'P-SHAP perturbs larger subsets of the input than MExGen... These larger subsets may enable P-SHAP to find larger changes in output' (Appendix D.2). The limitations section discusses risks of post-hoc explanations being steered."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper clearly distinguishes between the proxy (perturbation curves measuring scalarizer change) and the outcome (faithful explanations). The limitations section acknowledges: 'Our user study analyzes the perception of participants of how well a method explains the predictions of a model, and not necessarily the fidelity of the explanation itself.'"
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Specific model identifiers are provided: 'distilbart-xsum-12-6' with HuggingFace link, 'Llama-3-8B-Instruct,' 'Flan-T5-Large' with HuggingFace link, 'Flan-UL2' (20B parameters), 'DeepSeek-V3,' and 'Granite-3.3-8B-Instruct' (Section 4.1, Appendix C.2)."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Full prompt text is provided: system prompts for summarization and QA are quoted in Appendix C.2, and the self-explanation prompt template is shown in Figure 6 with all fill values specified."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Appendix C details all hyperparameters: greedy decoding, max_new_tokens settings, C-LIME parameters (n/d ratios, K values), L-SHAP parameters (M, K), multi-level refinement thresholds (k, ϕ), and scalarizer model choices."
    163       },
    164       "scaffolding_described": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "No agentic scaffolding is used. MExGen is a post-hoc explanation framework that queries an LLM with perturbed inputs and analyzes outputs."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Appendix C documents preprocessing: linguistic segmentation via spaCy v3.6, phrase segmentation algorithm (Appendix B.2), units marked as not of interest (punctuation, prompt templates), and dataset selection details (Appendix C.1)."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "A dedicated 'Limitations' section discusses four specific limitations: post-hoc explanations are local only, results may change in other settings, user study measures perception not fidelity, and risk of obfuscation."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Specific threats are discussed: 'results reported in Section 4 could still change in other experimental settings,' 'our user study analyzes the perception... not necessarily the fidelity,' and 'post hoc explanations come with the risk of being steered to obfuscate undesirable behavior' with specific mitigations proposed."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "The limitations explicitly state what results do NOT show: explanations 'do not fully characterize how models generate output and only provide local explanations.' The paper also bounds scope to 'context-grounded tasks like summarization and question answering.'"
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "Raw experimental results (individual perturbation curve data, per-example attribution scores, user study responses) are not released. Only aggregate results are reported in the paper."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Datasets are well-documented public benchmarks (XSUM, CNN/DM, SQuAD) with licenses stated (Appendix C.1). Selection criteria are explicit: 'first 1000 test set examples,' '1000 validation set examples at random from SQuAD because selecting the first 1000 examples yielded insufficient diversity.'"
    202       },
    203       "recruitment_methods_described": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "User study recruitment is described: 'We recruited participants from a large technology company who self-identify as machine learning practitioners using language models and collected data from 88 of them after filtering' (Section 5, Appendix E.1)."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Appendix C documents the full pipeline: dataset selection and licensing (C.1), LM inference settings (C.2), MExGen configuration (C.3), baseline configuration (C.4), and perturbation curve computation (C.5)."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "The acknowledgments section states: 'The work of Lucas Monteiro Paes was supported by the Apple Scholars in AI/ML Fellowship.' The other authors are industry research employees (IBM Research, Merck Research Labs) with affiliations listed."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations are clearly listed: Harvard University, IBM Research, and Merck Research Labs. The paper is open about the IBM connection to the ICX360 toolkit where MExGen is released."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "The Apple fellowship and IBM Research general funding do not have a direct financial stake in MExGen outperforming specific baselines. The paper evaluates a general explanation framework rather than a commercial IBM product."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests statement is found in the paper. IBM authors could hold patents related to the ICX360 toolkit, but no declaration is made."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "Training data cutoff dates are not stated for any of the models used (Flan-UL2, Llama-3, DeepSeek-V3, Granite-3.3). DistilBART was trained on the evaluation datasets by design, but this is not discussed as a contamination concern."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No discussion of potential train/test overlap. DistilBART is explicitly trained on XSUM/CNN-DM (evaluation datasets), and the API models (Llama-3, DeepSeek-V3) could have seen these public benchmarks during training. Neither issue is addressed."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "XSUM (2018), CNN/DM (2015/2017), and SQuAD (2016) are all publicly available benchmarks predating the training of all models used. No discussion of contamination risk from these benchmarks appearing in model training data."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "No mention of pre-registration for the user study (no OSF, AsPredicted, or similar link)."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "The paper states: 'Although a formal IRB process does not exist in our institution, we went through an equivalent informal process, including reviewing the study with our peers.' No formal IRB or ethics board approval is reported."
    263       },
    264       "demographics_reported": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "While the survey collected demographic information (job title, work location, English proficiency, LLM usage frequency — Appendix E.5), the paper does not report the actual demographic breakdown of the 88 participants beyond stating they are ML practitioners from 'a large technology company.'"
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": true,
    271         "answer": true,
    272         "justification": "Eligibility criteria are stated: 'Those who self-identified as machine learning practitioners using language models were eligible for the study. We filtered out 8 participants who did not pass eligibility checks or did not provide valid responses' (Appendix E.1)."
    273       },
    274       "randomization_described": {
    275         "applies": true,
    276         "answer": true,
    277         "justification": "Randomization is described: 'The presentation order of the attribution algorithms was randomized to mitigate order effects.' Input texts were 'randomly drawn from the ten examples' (Section 5)."
    278       },
    279       "blinding_described": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "The paper does not explicitly describe blinding. While the survey appears to show explanations without method labels (Figures 14-15), blinding is not explicitly discussed as a design consideration."
    283       },
    284       "attrition_reported": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "'We recruited 96 participants... We filtered out 8 participants who did not pass eligibility checks or did not provide valid responses, resulting in data from 88 participants' (Appendix E.1)."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "Computational cost is discussed in terms of number of LLM inferences (the dominant cost factor) and total computation time: 'The total computation time is estimated to be on the order of 1000 hours' (Appendix C.6). The paper also discusses how cost scales linearly with input units."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "Appendix C.6: 'Experiments were run on a computing cluster providing nodes with 32 GB of CPU memory, V100 GPUs with 32 GB of GPU memory, and occasionally A100 GPUs with 40 or 80 GB of GPU memory. One CPU and one GPU were used at a time. The total computation time is estimated to be on the order of 1000 hours.'"
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No mention of running experiments across multiple random seeds. The automated evaluation aggregates across examples but does not report seed sensitivity. For API models, they 'fixed the LLM's random seed' (Appendix C.2) for determinism but do not test sensitivity."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The number of experimental runs is not explicitly stated. Standard errors are computed across examples (1000 or 500 depending on the model), not across multiple independent runs of the same experiment."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Hyperparameters (K, M, n/d ratios, ϕ thresholds) are reported but no search budget is stated. It is unclear how these values were selected or how many configurations were tried."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": true,
    321         "justification": "Rather than selecting a single best configuration, the paper reports results for all tested instantiations (LOO, C-LIME, L-SHAP) and all scalarizers. Parameter choices are motivated (e.g., 'a ratio n/d of 5 or 10 can yield good results,' Section 3.2)."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Bonferroni correction is applied in the user study: 'Significant p-values after Bonferroni adjustment are noted with ** (p<0.05/3)' (Tables 6-7, Appendix E.3)."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The authors evaluate their own MExGen framework against baselines (P-SHAP, CaptumLIME) without acknowledging potential author-evaluation bias. They do use existing baseline implementations rather than reimplementing baselines, which partially mitigates this."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "While baselines are given matched or greater compute budgets ('P-SHAP and CaptumLIME were allocated the same or slightly more inferences,' Section 4.3), performance is not shown as a function of compute budget with systematic variation."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "The paper does not discuss whether perturbation curves (the primary evaluation metric) truly measure explanation faithfulness, or whether the selected benchmarks (XSUM, CNN/DM, SQuAD) are appropriate for evaluating explanation methods. The limitations note that the user study measures 'perception' not fidelity, but this gap is not deeply analyzed."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "No agentic scaffolding is involved. MExGen is a post-hoc explanation framework, not a scaffolded agent system."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Not discussed. The evaluation benchmarks (XSUM 2018, CNN/DM 2015/2017, SQuAD 2016) substantially predate the training of the models used (Llama-3, DeepSeek-V3), but temporal leakage implications for explanation quality are not addressed."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "Not discussed. No analysis of whether the evaluation setup (e.g., using model-generated outputs as target outputs) introduces any information leakage."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "Not discussed. No analysis of independence between training and evaluation data, particularly relevant since DistilBART was trained directly on the XSUM and CNN/DM evaluation datasets."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines are used."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "MExGen provides more faithful explanations than baseline methods (PartitionSHAP, CaptumLIME) as measured by perturbation curves and AUPC.",
    375       "evidence": "Table 1 shows MExGen instantiations achieve higher AUPC than P-SHAP across 7 of 8 dataset-model pairs. Figure 5 shows perturbation curves where MExGen rises more quickly, indicating better identification of important input units. Section 4.3.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "MExGen outperforms LLM self-explanations in faithfulness, including self-explanations from powerful LLMs like DeepSeek-V3.",
    380       "evidence": "Table 2 shows MExGen AUPC exceeds self-explanation AUPC in all cases. 'In all cases, MExGen is more faithful to the LLM's behavior (as measured by higher AUPC) than the LLM's self-explanations.' The gap narrows with the larger DeepSeek-V3 model on CNN/DM (14.1 vs 13.5). Section 4.4, Figure 12.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "The BERT scalarizer (text-only) can approximate Log Prob scalarizer (logit-based) performance, and users perceive BERT as more faithful than Log Prob.",
    385       "evidence": "Figure 3 shows BERT has the highest Spearman correlation with Log Prob among text-only scalarizers (0.58). User study: 'Significantly more participants perceived BERT to be higher in fidelity than Log Prob (57% to 35%)' with p < .05 (Section 5, Tables 4-5).",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "C-LIME is perceived as more faithful and preferred by users over L-SHAP.",
    390       "evidence": "Bradley-Terry model shows significant difference between C-LIME and L-SHAP for both perceived fidelity (p = 0.011) and preference (p = 0.007), with Bonferroni adjustment. This is notable because the two performed similarly in automated evaluation. Section 5, Tables 6-7.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Text-only scalarizers can provide effective explanations even without access to model logits, implying no loss from text-only API access in some cases.",
    395       "evidence": "Figure 5 shows MExGen C-LIME with the BERT scalarizer (text-only) can outperform P-SHAP (which uses logits) in perturbation curve fidelity. User study shows BERT preferred over Log Prob. Section 6: 'in some cases, there may be no loss in having text-only access compared to full logit access.'",
    396       "supported": "moderate"
    397     }
    398   ],
    399   "methodology_tags": ["benchmark-eval"],
    400   "key_findings": "MExGen extends perturbation-based attribution methods (LIME, SHAP) to generative LLMs for context-grounded tasks, using scalarizers to handle text outputs and multi-level linguistic segmentation for long inputs. Across summarization (XSUM, CNN/DM) and QA (SQuAD) tasks with six models, MExGen instantiations (LOO, C-LIME, L-SHAP) generally produce more faithful explanations than PartitionSHAP, CaptumLIME, and LLM self-explanations, as measured by perturbation curve AUPC; the one reported exception is the (CNN/DM, Flan-UL2) pair, where P-SHAP achieves higher AUPC. A user study with 88 ML practitioners found that the text-only BERT scalarizer was perceived as more faithful than the logit-based Log Prob scalarizer, suggesting text-only API access may suffice for explanation.",
    401   "red_flags": [
    402     {
    403       "flag": "Self-evaluation without bias acknowledgment",
    404       "detail": "IBM Research authors evaluate their own MExGen framework (released as part of IBM's ICX360 toolkit) against baselines without acknowledging potential author-evaluation bias. While they use existing implementations for baselines rather than reimplementing them, the selection of evaluation metrics and experimental design could still favor the proposed method."
    405     },
    406     {
    407       "flag": "DistilBART trained on evaluation data",
    408       "detail": "One of the primary models (DistilBART) was explicitly trained on the XSUM and CNN/DM evaluation datasets. While this is by design for a summarization model, it is not discussed how this affects explanation quality evaluation — the model's familiarity with training data could systematically affect attribution patterns compared to unseen data."
    409     },
    410     {
    411       "flag": "No seed sensitivity analysis",
    412       "detail": "Automated experiments do not report results across multiple random seeds. C-LIME and L-SHAP involve randomized perturbation sampling, so results could vary across seeds, but this variability is not quantified."
    413     }
    414   ],
    415   "cited_papers": [
    416     {
    417       "title": "Are self-explanations from large language models faithful?",
    418       "authors": ["Andreas Madsen", "Sarath Chandar", "Siva Reddy"],
    419       "year": 2024,
    420       "relevance": "Directly evaluates faithfulness of LLM self-explanations for in-context classification, finding limitations that MExGen addresses for generative tasks."
    421     },
    422     {
    423       "title": "Can large language models explain themselves? A study of LLM-generated self-explanations",
    424       "authors": ["Shiyuan Huang", "Siddarth Mamidanna", "Shreedhar Jangam", "Yilun Zhou", "Leilani H. Gilpin"],
    425       "year": 2023,
    426       "arxiv_id": "2310.11207",
    427       "relevance": "Studies LLM self-explanation capabilities, showing they may be less faithful than external attribution methods like MExGen."
    428     },
    429     {
    430       "title": "Language models don't always say what they think: Unfaithful explanations in chain-of-thought prompting",
    431       "authors": ["Miles Turpin", "Julian Michael", "Ethan Perez", "Samuel R. Bowman"],
    432       "year": 2023,
    433       "relevance": "Demonstrates unfaithfulness of chain-of-thought explanations, motivating external attribution methods for LLM explainability."
    434     },
    435     {
    436       "title": "Measuring faithfulness in chain-of-thought reasoning",
    437       "authors": ["Tamera Lanham", "Anna Chen", "Ansh Radhakrishnan"],
    438       "year": 2023,
    439       "arxiv_id": "2307.13702",
    440       "relevance": "Measures faithfulness of chain-of-thought reasoning, relevant to understanding when LLM self-explanations can be trusted."
    441     },
    442     {
    443       "title": "ContextCite: Attributing model generation to context",
    444       "authors": ["Benjamin Cohen-Wang", "Harshay Shah", "Kristian Georgiev", "Aleksander Madry"],
    445       "year": 2024,
    446       "arxiv_id": "2409.00729",
    447       "relevance": "Concurrent work extending LIME to generative models for context attribution, but operates at single predefined granularity unlike MExGen's multi-level approach."
    448     },
    449     {
    450       "title": "The Llama 3 herd of models",
    451       "authors": ["Abhimanyu Dubey"],
    452       "year": 2024,
    453       "arxiv_id": "2407.21783",
    454       "relevance": "Describes Llama-3-8B-Instruct used as one of the primary evaluation models in the paper."
    455     },
    456     {
    457       "title": "A survey on hallucination in large language models: Principles, taxonomy, challenges, and open questions",
    458       "authors": ["Lei Huang", "Weijiang Yu", "Weitao Ma"],
    459       "year": 2023,
    460       "relevance": "Surveys LLM hallucination — MExGen addresses the related problem of verifying whether LLM outputs are grounded in input context."
    461     },
    462     {
    463       "title": "Chain of thought prompting elicits reasoning in large language models",
    464       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    465       "year": 2022,
    466       "relevance": "Foundational work on chain-of-thought prompting, whose faithfulness limitations motivate external attribution methods."
    467     },
    468     {
    469       "title": "TextGenSHAP: Scalable post-hoc explanations in text generation with long documents",
    470       "authors": ["James Enouen", "Hootan Nakhost", "Sayna Ebrahimi", "Sercan O. Arik", "Yan Liu", "Tomas Pfister"],
    471       "year": 2023,
    472       "arxiv_id": "2312.01279",
    473       "relevance": "Proposes efficient SHAP estimation for text generation using speculative decoding, a baseline approach that requires logit access unlike MExGen's text-only capability."
    474     },
    475     {
    476       "title": "Semantic uncertainty: Linguistic invariances for uncertainty estimation in natural language generation",
    477       "authors": ["Lorenz Kuhn", "Yarin Gal", "Sebastian Farquhar"],
    478       "year": 2023,
    479       "relevance": "Develops uncertainty estimation for LLM generation using semantic equivalence, related to MExGen's use of semantic similarity for scalarization."
    480     }
    481   ]
    482 }

Impressum · Datenschutz