ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (34766B)


      1 {
      2   "scan_version": 3,
      3   "active_modules": ["experimental_rigor", "data_leakage"],
      4   "paper": {
      5     "title": "A Library of LLM Intrinsics for Retrieval-Augmented Generation",
      6     "authors": [
      7       "Marina Danilevsky",
      8       "Kristjan Greenewald",
      9       "Chulaka Gunasekara",
     10       "Maeda Hanafi",
     11       "Lihong He",
     12       "Yannis Katsis",
     13       "Krishnateja Killamsetty",
     14       "Yulong Li",
     15       "Yatin Nandwani",
     16       "Lucian Popa",
     17       "Dinesh Raghu",
     18       "Frederick Reiss",
     19       "Vraj Shah",
     20       "Khoi-Nguyen Tran",
     21       "Huaiyu Zhu",
     22       "Luis Lastras"
     23     ],
     24     "year": 2025,
     25     "venue": "arXiv",
     26     "arxiv_id": "2504.11704",
     27     "doi": "10.48550/arXiv.2504.11704"
     28   },
     29   "checklist": {
     30     "artifacts": {
     31       "code_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "LoRA adapters released on HuggingFace at https://huggingface.co/ibm-granite/granite-3.3-8b-rag-agent-lib. Software framework released on GitHub at https://github.com/ibm-granite/granite-io. Example notebooks are also provided."
     35       },
     36       "data_released": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Evaluation uses public benchmarks (MT-RAG, BEIR, MMLU, RAGTruth, ALCE, LongBench-Cite), but training data is proprietary. Section 2.3 states: 'The training dataset is proprietary and was obtained in combination with a third-party company who contracted the human annotators.' Other intrinsics use synthetically generated training data that is also not released."
     40       },
     41       "environment_specified": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper mentions vLLM as the inference platform and PEFT for fine-tuning, but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions. Individual LoRA training hyperparameters are given but not the full software environment."
     45       },
     46       "reproduction_instructions": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper references example notebooks and documentation at the HuggingFace and GitHub links for using the intrinsics, but does not include step-by-step instructions for reproducing the training or evaluation experiments reported in the paper."
     50       }
     51     },
     52     "statistical_methodology": {
     53       "confidence_intervals_or_error_bars": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "All results tables (Tables 2-22) report only point estimates. No confidence intervals, error bars, or ± notation appear anywhere in the paper."
     57       },
     58       "significance_tests": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The paper makes numerous comparative claims (e.g., '9 percentage points jump', '22 percentage points improvement') based solely on comparing numbers without any statistical significance tests."
     62       },
     63       "effect_sizes_reported": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper consistently reports effect sizes as percentage point differences with baseline context. For example: 'there is an overall 9 percentage points jump when using query rewrite... This jump is more pronounced on the non-standalone fragment, where query rewrite... leads to 22 percentage points improvement over the no-rewrite strategy' (Section 2.2.1)."
     67       },
     68       "sample_size_justified": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No justification is provided for the sizes of training or evaluation datasets. For example, the MT-RAG evaluation uses 842 data points but no justification is given for this number. No power analysis is mentioned."
     72       },
     73       "variance_reported": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No standard deviations, variance measures, or spread statistics are reported across any experimental results. All numbers appear to be from single runs."
     77       }
     78     },
     79     "evaluation_design": {
     80       "baselines_included": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Each intrinsic is compared against relevant baselines. QR is compared against no-rewrite, Mixtral 8x7b, and gold rewrites (Tables 2-7). CR is compared against Llama 3.3 70B, Granite Guardian, and base Granite (Tables 10-11). AD is compared against BigBird w/ MLP and LLaMA 2-7B (Tables 12-13). HD is compared against GPT-3.5/4, SelfCheckGPT, LMvLM, and finetuned Llama-2-13B (Table 17)."
     84       },
     85       "baselines_contemporary": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Baselines include contemporary models: Llama-3.3-70B-Instruct, Mixtral-8x22B-Instruct, Granite Guardian (2024). The HD evaluation uses older baselines (GPT-3.5/4-turbo, Llama-2-13B) from the RAGTruth paper, but the other evaluations use current models."
     89       },
     90       "ablation_study": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Section 10 presents a systematic ablation of composite intrinsics, evaluating four flows — None, QR only, AD only, and QR+AD — across answerability classification (Table 20), faithfulness (Table 21), and joint score (Table 22). This isolates the contribution of each component."
     94       },
     95       "multiple_metrics": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Multiple metrics are used throughout: Recall@k at multiple k values, RAGAS Faithfulness, RAD-Bench, precision/recall/F1, NDCG@10, ECE, and JAFS. Each intrinsic is evaluated on at least two metrics."
     99       },
    100       "human_evaluation": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "All evaluations are fully automated using benchmark metrics (Recall@k, RAGAS-F, RAD-Bench, F1, NDCG@10, ECE). No human evaluation of the intrinsics' outputs is performed."
    104       },
    105       "held_out_test_set": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Evaluation uses established benchmark test sets (MT-RAG, SQUADRun, BEIR, MMLU, RAGTruth, ALCE, LongBench-Cite) that are separate from training data. Training uses different corpora — e.g., QR trains on Cloud corpus while evaluating on full MT-RAG; AD trains on Government corpus while evaluating on MT-RAG."
    109       },
    110       "per_category_breakdown": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Results are broken down by multiple categories: standalone vs non-standalone subsets (Tables 2-7), answerable vs unanswerable classes (Tables 12-14), per-dataset in BEIR (Table 15), per-dataset in LongBench-Cite (Table 19), per-MMLU task (Figure 1), and per-benchmark for CR (Tables 10-11)."
    114       },
    115       "failure_cases_discussed": {
    116         "applies": true,
    117         "answer": false,
    118         "justification": "No systematic error analysis or qualitative examples of failures are provided. The paper mentions performance drops in reranking on some BEIR tasks and notes potential puzzling outcomes of composite intrinsics, but does not analyze specific failure cases or error categories."
    119       },
    120       "negative_results_reported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Several negative results are reported: reranking shows performance drops on 5/15 BEIR tasks including arguana (0.567→0.347) in Table 15. The composite flow analysis shows QR+AD degrades AD's unanswerable F1 from 69 to 51 (Table 20). The appendix notes Granite 3.3 is 'overall slightly worse than the 3.2 version' on LongBench-Cite (Table 26)."
    124       }
    125     },
    126     "claims_and_evidence": {
    127       "abstract_claims_supported": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The abstract is primarily descriptive, introducing the concept of LLM intrinsics and the library. It claims the paper 'describes the intended usage, training details, and evaluations for each intrinsic, as well as compositions of multiple intrinsics,' which is accurate. No unsupported empirical claims appear in the abstract."
    131       },
    132       "causal_claims_justified": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper makes causal claims like 'using query rewrite directly improves the retriever performance, which in turn improves the answer generation performance.' These are supported by controlled comparisons where a single component is swapped (e.g., QR vs no-rewrite with everything else fixed). Section 10 provides a systematic ablation of QR and AD with four controlled flows."
    136       },
    137       "generalization_bounded": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper explicitly states 'For illustrative purposes, these intrinsics are implemented using IBM Granite language models, with extension to other model families possible in the future' (Section 1). Section 7.1 (UQ) states the model 'is not intended to predict the certainty of responses generated by any other models besides itself.' The scope is bounded to Granite 3.3 8b."
    141       },
    142       "alternative_explanations_discussed": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "No alternative explanations for the observed improvements are discussed. For example, the paper does not consider whether performance gains could be due to the additional compute from the LoRA, differences in tokenization, or other confounds."
    146       },
    147       "proxy_outcome_distinction": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "The paper's measurements closely match its claims: Recall@k for retrieval quality, RAGAS Faithfulness for answer faithfulness, NDCG@10 for reranking, ECE for calibration quality, and F1 for classification accuracy. JAFS is carefully defined with a mathematical formula (Section 5.2.2). The paper does not make broader claims beyond what the metrics measure."
    151       }
    152     },
    153     "setup_transparency": {
    154       "model_versions_specified": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Specific HuggingFace model IDs are used throughout: 'ibm-granite/granite-3.3-8b-instruct', 'meta-llama/Llama-3.3-70B-Instruct', 'Mixtral-8x22B-Instruct', 'ibm-granite/granite-guardian-3.1-5b'. These include version numbers and sizes."
    158       },
    159       "prompts_provided": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Actual prompt text is provided for multiple intrinsics: the QR rewrite role prompt (Section 2), the passage reranking prompt (Section 6.1), the HD instruction text (Section 8.1), and the CG instruction text (Section 9.1). These are verbatim prompts, not just descriptions."
    163       },
    164       "hyperparameters_reported": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "LoRA training hyperparameters are reported for each intrinsic. For example, QR: 'rank = 32, learning rate = 3e-6, number of epochs = 25, with early stopping based on validation set, and 90/10 split' (Section 2.3). Similar detail for CR, AD, UQ, HD, and CG. Inference settings (k=20, top-5 passages) are also specified."
    168       },
    169       "scaffolding_described": {
    170         "applies": false,
    171         "answer": false,
    172         "justification": "The paper does not use agentic scaffolding. The intrinsics are individual LoRA adapters and software components composed into pipeline flows, not agent-based systems with retry logic, memory, or feedback mechanisms."
    173       },
    174       "data_preprocessing_documented": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Training data construction is documented for each intrinsic: QR uses human-created conversations from the MT-RAG Cloud corpus (Section 2.3), CR uses synthetic data from CLAPNQ with Mixtral as judge (Section 4.3), AD uses the Government corpus with human + synthetic data and Mixtral validation (Section 5.3), HD/CG use a multi-step pipeline from CoQA/MultiDoc2Dial/QuAC (Sections 8.3, 9.3)."
    178       }
    179     },
    180     "limitations_and_scope": {
    181       "limitations_section_present": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "There is no dedicated limitations or threats-to-validity section. The paper proceeds from evaluation results directly to a brief conclusion (Section 11) without discussing limitations."
    185       },
    186       "threats_to_validity_specific": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "No threats to validity are discussed anywhere in the paper. No study-specific concerns about evaluation methodology, training data quality, or generalization are raised."
    190       },
    191       "scope_boundaries_stated": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "While the paper mentions that intrinsics are 'implemented using IBM Granite language models' and UQ states it is 'not intended to predict the certainty of responses generated by any other models,' there is no systematic statement of what the results do NOT show. No dedicated scope boundaries discussion exists."
    195       }
    196     },
    197     "data_integrity": {
    198       "raw_data_available": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "Raw training data is proprietary and not released. Raw experimental outputs (model predictions, intermediate results) are not made available. Only aggregated metrics are reported in tables."
    202       },
    203       "data_collection_described": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Data collection is described for each intrinsic with sources, methods, and quality control. For example, QR training uses 'the publicly available Cloud corpus of technical documentation pages from MT-RAG' with 'high-quality, human-created conversations' (Section 2.3). Synthetic data generation and Mixtral-based validation are described for CR, AD."
    207       },
    208       "recruitment_methods_described": {
    209         "applies": false,
    210         "answer": false,
    211         "justification": "No human participants in the study. Data sources are standard benchmarks and corpora. Human annotators were used for training data creation but are not study participants."
    212       },
    213       "data_pipeline_documented": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "While the general training data generation process is described for each intrinsic, exact counts of generated examples, filtering rates, and specific transformation steps are missing. For example, Section 4.3 mentions Mixtral was used to 'validate the generated labels and filter out noisy samples' but doesn't state how many samples were generated or filtered."
    217       }
    218     },
    219     "conflicts_of_interest": {
    220       "funding_disclosed": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No funding acknowledgment section appears in the paper. The acknowledgments section (Section 12) only says 'Thanks to internal and external annotators' without disclosing funding sources."
    224       },
    225       "affiliations_disclosed": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "All 16 authors are clearly listed as 'IBM Research' on the first page. The paper evaluates IBM Granite models, making the affiliation-product relationship transparent."
    229       },
    230       "funder_independent_of_outcome": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "IBM Research employees evaluate IBM's own Granite models and release IBM's own intrinsics library. IBM has a direct commercial interest in demonstrating the effectiveness of its Granite models and RAG tools."
    234       },
    235       "financial_interests_declared": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No competing interests or financial disclosure statement appears in the paper. Given all authors are IBM employees evaluating IBM products, the absence of such a declaration is notable."
    239       }
    240     },
    241     "contamination": {
    242       "training_cutoff_stated": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No training data cutoff date is stated for the Granite 3.3 8b base model. The paper does not mention when the pre-training data was collected, making it impossible to assess temporal overlap with evaluation benchmarks."
    246       },
    247       "train_test_overlap_discussed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "No discussion of potential overlap between Granite's pre-training data and the evaluation benchmarks (MT-RAG, BEIR, MMLU, SQUADRun, etc.). This is particularly relevant since MMLU and BEIR are widely used and likely in many training corpora."
    251       },
    252       "benchmark_contamination_addressed": {
    253         "applies": true,
    254         "answer": false,
    255         "justification": "The paper uses multiple public benchmarks (MMLU, BEIR, SQuAD, HotpotQA) that have been available since well before Granite's training, but does not discuss contamination risk at all."
    256       }
    257     },
    258     "human_studies": {
    259       "pre_registered": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study. The paper evaluates model intrinsics on automated benchmarks."
    263       },
    264       "irb_or_ethics_approval": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       },
    269       "demographics_reported": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in the study."
    273       },
    274       "inclusion_exclusion_criteria": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in the study."
    278       },
    279       "randomization_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in the study."
    283       },
    284       "blinding_described": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in the study."
    288       },
    289       "attrition_reported": {
    290         "applies": false,
    291         "answer": false,
    292         "justification": "No human participants in the study."
    293       }
    294     },
    295     "cost_and_practicality": {
    296       "inference_cost_reported": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "No inference cost or latency numbers are reported. Section 6.2 notes passage reranking 'only adds a small overhead' but provides no actual measurements. No API costs, tokens consumed, or wall-clock times are given for any intrinsic."
    300       },
    301       "compute_budget_stated": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No computational budget is stated. GPU hours, hardware used, training time, and total compute cost are not mentioned for any of the six LoRA training processes or evaluations."
    305       }
    306     },
    307     "experimental_rigor": {
    308       "seed_sensitivity_reported": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No mention of multiple random seeds. All results appear to be from single training runs and single evaluations."
    312       },
    313       "number_of_runs_stated": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The number of experimental runs is never stated. It is unclear whether results are from single runs or averaged across multiple runs."
    317       },
    318       "hyperparameter_search_budget": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "While training hyperparameters (rank, learning rate, epochs) are reported for each LoRA, no search budget is described. It is unclear how these values were selected or how many configurations were tried."
    322       },
    323       "best_config_selection_justified": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Some intrinsics mention early stopping on validation sets (e.g., QR in Section 2.3), but no justification is given for the choice of hyperparameters like rank=32 or specific learning rates. The overall configuration selection process is not described."
    327       },
    328       "multiple_comparison_correction": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No statistical tests are performed at all, let alone corrections for multiple comparisons. The paper makes numerous comparisons across 8 intrinsics, multiple benchmarks, and multiple baselines."
    332       },
    333       "self_comparison_bias_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "IBM researchers evaluate their own Granite-based intrinsics against baselines that include their own reimplementations. The potential for self-evaluation bias is not acknowledged or discussed."
    337       },
    338       "compute_budget_vs_performance": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "No analysis of performance as a function of compute. The LoRA adapters add inference overhead and training cost, but these are not quantified or compared against simpler alternatives at matched compute budgets."
    342       },
    343       "benchmark_construct_validity": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "The paper uses multiple benchmarks (MT-RAG, BEIR, MMLU, RAGTruth, ALCE, LongBench-Cite) without discussing whether they adequately measure the claimed capabilities or their construct validity."
    347       },
    348       "scaffold_confound_addressed": {
    349         "applies": false,
    350         "answer": false,
    351         "justification": "No agentic scaffolding is involved. The intrinsics are individual components and pipeline compositions, not scaffold-dependent agent systems."
    352       }
    353     },
    354     "data_leakage": {
    355       "temporal_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of temporal leakage. Several evaluation benchmarks (SQuAD, HotpotQA, BEIR tasks) predate the Granite model's training and their solutions may be in the training data."
    359       },
    360       "feature_leakage_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No discussion of feature leakage. The paper does not address whether the evaluation setup provides any information that would not be available in real deployment scenarios."
    364       },
    365       "non_independence_addressed": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "MT-RAG is used for both LoRA training (specific sub-corpora) and evaluation. While different portions are used (Cloud corpus for QR training, Government corpus for AD training, full MT-RAG for evaluation), the independence of train and test is not explicitly verified."
    369       },
    370       "leakage_detection_method": {
    371         "applies": true,
    372         "answer": false,
    373         "justification": "No leakage detection or prevention methods are applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines are mentioned."
    374       }
    375     }
    376   },
    377   "claims": [
    378     {
    379       "claim": "Query Rewrite LoRA improves Recall@20 by 9 percentage points on full MT-RAG (0.67 to 0.76) and by 22 percentage points on non-standalone queries (0.44 to 0.66), matching human gold rewrites.",
    380       "evidence": "Tables 2, 3, and 4 in Section 2.2.1 compare recall across no-rewrite, Mixtral, LoRA, and gold rewrite strategies on full, non-standalone, and standalone MT-RAG subsets.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Query Rewrite LoRA improves answer quality by 8pp in RAGAS Faithfulness and 4pp in RAD-Bench on full MT-RAG.",
    385       "evidence": "Tables 5-7 in Section 2.2.2 show answer generation quality metrics across rewrite strategies. The 18pp RAGAS-F improvement on non-standalone queries is notable.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Query Expansion achieves highest retrieval recall (87% Recall@50) outperforming all single-query methods.",
    390       "evidence": "Table 8 in Section 3.2.1 shows Recall@50 and @100 for query expansion vs individual strategies. However, query expansion uses up to 100 passages vs 20 for single methods.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Context Relevance LoRA outperforms Llama 3.3 70B, Granite Guardian 3.1 5b, and base Granite on precision for relevant labels and recall for irrelevant labels across 8 benchmarks.",
    395       "evidence": "Tables 10 and 11 in Section 4.2.3 show precision and recall scores across MTRAG, CLAPNQ, Drop, FinanceBench, BioASQ, Open Aus. Legal Corpus, and HotpotQA+SquadV2.",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "Answerability Determination LoRA achieves 90% weighted F1 on MT-RAG, outperforming BigBird and LLaMA 2-7B.",
    400       "evidence": "Table 13 in Section 5.2.1 shows weighted F1 of 90% for the LoRA vs 69.6% for BigBird and 87.1% for LLaMA 2-7B on MT-RAG. Table 14 shows 17% JAFS lift over vanilla Granite.",
    401       "supported": "moderate"
    402     },
    403     {
    404       "claim": "UQ LoRA reduces average ECE from 0.20 (base model) to 0.064 across MMLU tasks with no degradation in zero-shot accuracy (89%).",
    405       "evidence": "Section 7.2 and Figure 1 report ECE values across MMLU tasks. Table 16 shows MAE for post-answer and pre-answer certainty prediction under different system prompts.",
    406       "supported": "moderate"
    407     },
    408     {
    409       "claim": "Hallucination Detection LoRA achieves 68.6% F1 on RAGTruth QA, competitive with finetuned Llama-2-13B (68.2%) and far exceeding prompted GPT-4-turbo (45.6%).",
    410       "evidence": "Table 17 in Section 8.2 compares response-level hallucination detection across baselines from the RAGTruth paper.",
    411       "supported": "moderate"
    412     },
    413     {
    414       "claim": "Citation Generation LoRA performs on par with Llama 70B on ALCE passage-level citations (F1 ~60-62) and significantly outperforms it on LongBench-Cite span-level citations (68.6 vs 42.0 avg F1).",
    415       "evidence": "Tables 18 and 19 in Sections 9.2.1 and 9.2.2 show citation evaluation results. The 8b LoRA achieves an impressive advantage on fine-grained span-level citations.",
    416       "supported": "strong"
    417     },
    418     {
    419       "claim": "Composite QR+AD flow achieves best overall JAFS of 66, a 17-point improvement over the no-intrinsic baseline (49).",
    420       "evidence": "Tables 20-22 in Section 10.1 present the four-flow ablation (None, QR, AD, QR+AD) on answerability classification, faithfulness, and joint score. However, QR+AD ties AD-only (both 66).",
    421       "supported": "moderate"
    422     }
    423   ],
    424   "methodology_tags": ["benchmark-eval"],
    425   "key_findings": "The paper introduces a library of 8 RAG intrinsics implemented as LoRA adapters for Granite 3.3 8b, covering query rewrite, query expansion, context relevance, answerability determination, passage reranking, uncertainty quantification, hallucination detection, and citation generation. Query Rewrite shows the largest impact, improving retrieval recall by up to 22pp on context-dependent queries. Citation Generation stands out as an 8b LoRA that significantly outperforms Llama 70B on fine-grained span-level citations. The composite flow analysis reveals that combining intrinsics introduces tradeoffs — adding QR to AD degrades unanswerable detection while improving overall faithfulness.",
    426   "red_flags": [
    427     {
    428       "flag": "Company evaluating its own product",
    429       "detail": "All 16 authors are IBM Research employees evaluating IBM's own Granite models and intrinsics library. No independent evaluation is included. IBM has a direct commercial interest in demonstrating Granite's capabilities."
    430     },
    431     {
    432       "flag": "No error bars or statistical tests",
    433       "detail": "All results in Tables 2-22 are point estimates from what appear to be single runs. No confidence intervals, significance tests, or variance measures are reported, making it impossible to assess whether reported differences are meaningful."
    434     },
    435     {
    436       "flag": "No limitations section",
    437       "detail": "The paper lacks any limitations discussion, threats to validity, or scope boundaries section. For a paper proposing a general framework evaluated on specific benchmarks with a specific model, this is a significant omission."
    438     },
    439     {
    440       "flag": "Proprietary training data",
    441       "detail": "Training data for several intrinsics (notably Query Rewrite) is proprietary, and synthetically generated data for others is not released. This prevents independent reproduction of the LoRA training."
    442     },
    443     {
    444       "flag": "No contamination analysis",
    445       "detail": "The Granite base model's training data cutoff is never stated, and no contamination analysis is performed despite evaluating on well-known public benchmarks (MMLU, BEIR, SQuAD) that likely overlap with pre-training data."
    446     },
    447     {
    448       "flag": "Unfair comparison in Query Expansion",
    449       "detail": "Query Expansion uses 5 queries to retrieve up to 100 passages (Table 8), while single-query baselines retrieve only 20. The comparison is not compute-normalized, inflating the apparent advantage of expansion."
    450     }
    451   ],
    452   "cited_papers": [
    453     {
    454       "title": "MTRAG: A multi-turn conversational benchmark for evaluating retrieval-augmented generation systems",
    455       "authors": ["Yannis Katsis", "Sara Rosenthal", "Kshitij Fadnis", "Chulaka Gunasekara", "Young-Suk Lee", "Lucian Popa", "Vraj Shah", "Huaiyu Zhu", "Danish Contractor", "Marina Danilevsky"],
    456       "year": 2025,
    457       "arxiv_id": "2501.03468",
    458       "relevance": "Primary RAG evaluation benchmark used throughout the paper for multi-turn conversational retrieval and generation."
    459     },
    460     {
    461       "title": "RAGTruth: A hallucination corpus for developing trustworthy retrieval-augmented language models",
    462       "authors": ["Cheng Niu", "Yuanhao Wu", "Juno Zhu", "Siliang Xu", "Kashun Shum", "Randy Zhong", "Juntong Song", "Tong Zhang"],
    463       "year": 2024,
    464       "relevance": "Hallucination detection benchmark used to evaluate the HD intrinsic, relevant to RAG trustworthiness."
    465     },
    466     {
    467       "title": "Enabling large language models to generate text with citations",
    468       "authors": ["Tianyu Gao", "Howard Yen", "Jiatong Yu", "Danqi Chen"],
    469       "year": 2023,
    470       "doi": "10.18653/v1/2023.emnlp-main.398",
    471       "relevance": "ALCE citation benchmark used to evaluate the citation generation intrinsic; foundational work on LLM-generated citations."
    472     },
    473     {
    474       "title": "LongCite: Enabling LLMs to generate fine-grained citations in long-context QA",
    475       "authors": ["Jiajie Zhang", "Yushi Bai", "Xin Lv", "Wanjun Gu", "Danqing Liu"],
    476       "year": 2024,
    477       "arxiv_id": "2409.02897",
    478       "relevance": "Fine-grained citation benchmark used for evaluating span-level citation generation capabilities of LLMs."
    479     },
    480     {
    481       "title": "Thermometer: Towards universal calibration for large language models",
    482       "authors": ["Maohao Shen", "Subhro Das", "Kristjan Greenewald", "Prasanna Sattigeri", "Gregory Wornell", "Soumya Ghosh"],
    483       "year": 2024,
    484       "arxiv_id": "2403.08819",
    485       "relevance": "Calibration method used to train the uncertainty quantification intrinsic; relevant to LLM reliability and trustworthiness."
    486     },
    487     {
    488       "title": "Granite Guardian",
    489       "authors": ["Inkit Padhi", "Manish Nagireddy", "Giandomenico Cornacchia"],
    490       "year": 2024,
    491       "arxiv_id": "2412.07724",
    492       "relevance": "IBM's LLM guardrail model used as a baseline for context relevance evaluation."
    493     },
    494     {
    495       "title": "Activated LoRA: Fine-tuned LLMs for intrinsics",
    496       "authors": ["Kristjan Greenewald", "Luis Lastras", "Thomas Parnell", "Lucian Popa", "Vraj Shah", "Giulio Zizzo"],
    497       "year": 2025,
    498       "relevance": "Sister paper introducing the activated LoRA mechanism for efficient inference of LLM intrinsics."
    499     },
    500     {
    501       "title": "SelfCheckGPT: Zero-resource black-box hallucination detection for generative large language models",
    502       "authors": ["Potsawee Manakul", "Adian Liusie", "Mark Gales"],
    503       "year": 2023,
    504       "relevance": "Black-box hallucination detection method used as a baseline; relevant to LLM output verification."
    505     },
    506     {
    507       "title": "Retrieval augmented generation (RAG) and beyond: A comprehensive survey on how to make your LLMs use external data more wisely",
    508       "authors": ["Siyun Zhao", "Yuqing Yang", "Zilong Wang"],
    509       "year": 2024,
    510       "arxiv_id": "2409.14924",
    511       "relevance": "Comprehensive RAG survey covering retrieval strategies and generation quality, directly relevant to the RAG pipeline components this paper addresses."
    512     },
    513     {
    514       "title": "Know what you don't know: Unanswerable questions for SQuAD",
    515       "authors": ["Pranav Rajpurkar", "Robin Jia", "Percy Liang"],
    516       "year": 2018,
    517       "relevance": "SQuADRUn (SQuAD 2.0) benchmark used for answerability determination evaluation; foundational work on unanswerable question detection."
    518     },
    519     {
    520       "title": "BEIR: A heterogeneous benchmark for zero-shot evaluation of information retrieval models",
    521       "authors": ["Nandan Thakur", "Nils Reimers", "Andreas Rücklé", "Abhishek Srivastava", "Iryna Gurevych"],
    522       "year": 2021,
    523       "arxiv_id": "2104.08663",
    524       "relevance": "Standard information retrieval benchmark used to evaluate the passage reranking intrinsic across 15 diverse tasks."
    525     },
    526     {
    527       "title": "CLAPnq: Cohesive long-form answers from passages in natural questions for RAG systems",
    528       "authors": ["Sara Rosenthal", "Avirup Sil", "Radu Florian", "Salim Roukos"],
    529       "year": 2025,
    530       "doi": "10.1162/tacl_a_00729",
    531       "relevance": "RAG question answering benchmark used for context relevance evaluation and as training data source."
    532     }
    533   ],
    534   "engagement_factors": {
    535     "practical_relevance": {
    536       "score": 3,
    537       "justification": "Released LoRA adapters on HuggingFace and a Python SDK on GitHub that developers can immediately integrate into RAG pipelines."
    538     },
    539     "surprise_contrarian": {
    540       "score": 0,
    541       "justification": "Confirms expected benefits of standard RAG pipeline components (query rewriting, reranking, hallucination detection) without challenging conventional wisdom."
    542     },
    543     "fear_safety": {
    544       "score": 0,
    545       "justification": "No safety or security concerns raised; the intrinsics are intended to improve RAG reliability."
    546     },
    547     "drama_conflict": {
    548       "score": 0,
    549       "justification": "No controversy or conflict; straightforward technical contribution."
    550     },
    551     "demo_ability": {
    552       "score": 3,
    553       "justification": "LoRA adapters are downloadable from HuggingFace and the granite-io framework is available on GitHub, with example notebooks provided for immediate use."
    554     },
    555     "brand_recognition": {
    556       "score": 1,
    557       "justification": "IBM Research is well-known in enterprise AI but not among the highest-profile consumer AI labs."
    558     }
    559   }
    560 }

Impressum · Datenschutz