scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30455B)
      1 {
      2   "paper": {
      3     "title": "REVERSUM: A Multi-staged Retrieval-Augmented Generation Method to Enhance Wikipedia Tail Biographies through Personal Narratives",
      4     "authors": [
      5       "Sayantan Adak",
      6       "Pauras Mangesh Meher",
      7       "Paramita Das",
      8       "Animesh Mukherjee"
      9     ],
     10     "year": 2025,
     11     "venue": "International Conference on Computational Linguistics",
     12     "arxiv_id": "2502.12137",
     13     "doi": "10.48550/arXiv.2502.12137"
     14   },
     15   "scan_version": 3,
     16   "active_modules": [
     17     "experimental_rigor",
     18     "data_leakage"
     19   ],
     20   "methodology_tags": [
     21     "benchmark-eval",
     22     "qualitative"
     23   ],
     24   "key_findings": "REVERSUM, a multi-staged RAG pipeline with relevance detection, evidence collection, verification, and summarization, significantly outperforms standard RAG for enhancing Wikipedia tail biographies using personal narratives. Human evaluators found 92% of REVERSUM-generated content integrable (vs 75% for standard RAG) and 96% informative (vs 67.5%). Ablation analysis shows evidence verification is the most critical stage. About 51% of Wikipedia sections could not be expanded due to insufficient relevant information in the personal narratives.",
     25   "checklist": {
     26     "artifacts": {
     27       "code_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The paper states 'Code and Data are available at https://github.com/sayantan11995/wikipedia_enrichment' in a footnote on the first page."
     31       },
     32       "data_released": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The GitHub repository cited in the paper (https://github.com/sayantan11995/wikipedia_enrichment) is stated to include data. Additionally, the personal narratives are sourced from Internet Archive (publicly accessible), and Wikipedia articles are public."
     36       },
     37       "environment_specified": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper mentions Llama-3-8b-instruct, NVIDIA A100 40GB, and some libraries (ChromaDB, sentence-bert, langchain RecursiveTextSplitter), but provides no requirements.txt, Dockerfile, or library version numbers sufficient to recreate the environment."
     41       },
     42       "reproduction_instructions": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No step-by-step reproduction instructions are provided in the paper. A GitHub link is given but no README with commands or 'Reproducing Results' section is included in the paper itself."
     46       }
     47     },
     48     "statistical_methodology": {
     49       "confidence_intervals_or_error_bars": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "Results in Tables 1 and 3 are reported as point estimates only (e.g., ∆CI = 61.27, ∆Quality = 15.84) with no confidence intervals or error bars."
     53       },
     54       "significance_tests": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "The paper states 'We conduct a Mann-Whitney U-test to compare the REVERSUM-based results with the best-performing baseline' and reports statistically significant improvements (p-value < 0.05) for B and C category articles."
     58       },
     59       "effect_sizes_reported": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Results are reported with baseline context — Table 1 shows all baseline numbers alongside REVERSUM, allowing magnitude comparison. The abstract reports 17pp and 28.5pp improvements in integrability and informativeness respectively."
     63       },
     64       "sample_size_justified": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The dataset contains 102 personal narratives (53 B, 49 C class) with no justification for this number and no power analysis. For manual evaluation, 100 sections were randomly selected with no justification."
     68       },
     69       "variance_reported": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. All results appear to be from single runs despite stochastic generation (temperature=0.7)."
     73       }
     74     },
     75     "evaluation_design": {
     76       "baselines_included": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Multiple baselines are included: a modified Banerjee and Mitra (2015b), key-phrase to section mapping (coherence-based and RAG-based), and standard RAG. Results are compared in Table 1."
     80       },
     81       "baselines_contemporary": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The paper acknowledges 'There is no recent work that directly addresses the specific task' and justifies using Banerjee and Mitra (2015b) with modifications. Two additional strong baselines are proposed, and Zhang et al. (2024) is discussed in related work."
     85       },
     86       "ablation_study": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Table 3 presents ablation results removing each stage of REVERSUM (relevance detection, evidence collection, evidence verification, summary generation), showing the contribution of each component."
     90       },
     91       "multiple_metrics": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Four automatic metrics are reported: ∆Calibrated Informativeness, ∆Understandability, ∆Readability, and ∆Quality. Manual evaluation covers integrability, informativeness, understandability, and readability."
     95       },
     96       "human_evaluation": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Manual evaluation by 8 annotators on 100 randomly selected samples, assessing integrability (92%), informativeness (96%), understandability (98%), and readability (99%). Inter-annotator agreement measured with Cohen's κ = 0.84."
    100       },
    101       "held_out_test_set": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "No explicit held-out test set is described. The main evaluation is on all 102 personal narratives. For the fine-tuned continuation score model (34,576 datapoints), no train/test split is explicitly described."
    105       },
    106       "per_category_breakdown": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Table 14 (Appendix G.1) provides results for each individual biography. Table 1 breaks down results by Wikipedia class (B vs C). Table 2 shows stage-wise percentages of non-expansion cases."
    110       },
    111       "failure_cases_discussed": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 7.1 discusses negative scenarios where the pipeline fails to generate content. Table 2 provides stage-wise percentages: 16% retrieval failures, 12% relevance detection, 3% evidence collection, 19% evidence verification, 1% summarization."
    115       },
    116       "negative_results_reported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The pilot study (Section 5.1) reports that standard RAG produced content that was 'just a summary of the already existing Wikipedia content' in 56% of cases. Table 15 shows some LLMs underperforming on certain metrics."
    120       }
    121     },
    122     "claims_and_evidence": {
    123       "abstract_claims_supported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Abstract claims of 17% integrability improvement (92% vs 75%) and 28.5% informativeness improvement (96% vs 67.5%) are supported by manual evaluation results in Section 6. These are percentage point differences labeled as percentages, which is imprecise but the underlying data matches."
    127       },
    128       "causal_claims_justified": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "Causal claims about REVERSUM's superiority are supported by controlled comparisons (same dataset, same base model across methods) and the ablation study (Table 3) which systematically removes each component to measure its contribution."
    132       },
    133       "generalization_bounded": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Section 9 explicitly states: 'Currently, we limit our methodology to B and C classes, as lower-category articles often lack well-defined sections.' The limitations section notes the dataset 'may not be representative of all lesser-known biographies.'"
    137       },
    138       "alternative_explanations_discussed": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The limitations section discusses subjective bias from personal narratives and dataset representativeness, but these are methodological limitations, not specific alternative explanations for why REVERSUM outperforms baselines. No consideration of confounds like prompt engineering quality or model memorization."
    142       },
    143       "proxy_outcome_distinction": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The paper explicitly addresses the gap between raw informativeness and actual content quality by proposing Calibrated Informativeness (Section 5.5, Appendix E), which accounts for novelty (fraction of new words) and appropriateness (continuation score). Table 10 demonstrates the distinction."
    147       }
    148     },
    149     "setup_transparency": {
    150       "model_versions_specified": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "The primary model 'Llama-3-8b-instruct' is well-specified, but the GPT-4 model used through DeepEval for faithfulness evaluation is referenced without a version or snapshot date."
    154       },
    155       "prompts_provided": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Full prompt texts are provided in Tables 5-9 of the appendix for the standard RAG approach, relevance detection, evidence extraction, evidence verification, and summary generation stages."
    159       },
    160       "hyperparameters_reported": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Appendix D reports: max_new_tokens=250, do_sample=True, temperature=0.7, top_p=0.9, MMR top-k=4. Fine-tuning: learning rate 2e-5, batch size 16, 10 epochs. Baseline hyperparameters α=3, β=2, γ=1."
    164       },
    165       "scaffolding_described": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "The multi-staged pipeline (relevance detection → evidence collection → verification → summarization) is described in detail in Section 5.2 with a schematic in Figure 2. Each stage's input/output and chat session management are explained."
    169       },
    170       "data_preprocessing_documented": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 3 documents the data collection process: selecting B/C biographies from Wikipedia, searching Internet Archive API, manual verification by a post-graduate student, and filtering irrelevant links. Text splitting with RecursiveTextSplitter is described with chunk sizes and window parameters."
    174       }
    175     },
    176     "limitations_and_scope": {
    177       "limitations_section_present": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 11 is a dedicated 'Limitations' section discussing subjective bias from personal narratives, manual verification subjectivity, and dataset representativeness."
    181       },
    182       "threats_to_validity_specific": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The limitations section discusses study-specific threats: 'the reliance on personal narratives such as autobiographies/biographies may introduce a subjective bias, as these sources often reflect personal perspectives and interpretations which could be in conflict with Wikipedia's neutral point of view policy.'"
    186       },
    187       "scope_boundaries_stated": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 9 states: 'Currently, we limit our methodology to B and C classes, as lower-category articles often lack well-defined sections.' The limitations also note the dataset may not be representative of all lesser-known biographies."
    191       }
    192     },
    193     "data_integrity": {
    194       "raw_data_available": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Code and data are stated to be available at the GitHub URL. Source personal narratives are from Internet Archive (publicly accessible), and Wikipedia articles are public."
    198       },
    199       "data_collection_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Section 3 describes the systematic process: compiling B/C Wikipedia biography lists, searching Internet Archive API for matching narratives, manual verification to filter noise, resulting in 102 personal narratives. Table 4 provides detailed statistics for each narrative."
    203       },
    204       "recruitment_methods_described": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "Annotator recruitment is vaguely described: '8 individuals from diverse backgrounds' for main evaluation, '9 Wikipedia users including an expert in Wikipedia research all of whom voluntarily participated' for pilot. No recruitment channels, selection criteria, or potential selection bias discussed."
    208       },
    209       "data_pipeline_documented": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "While Section 3 describes data collection steps and Table 2 shows REVERSUM stage-wise filtering, the initial data collection pipeline lacks counts: no report of how many Internet Archive search results were retrieved before manual filtering reduced them to 102 narratives."
    213       }
    214     },
    215     "conflicts_of_interest": {
    216       "funding_disclosed": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No funding source or acknowledgments section mentioning grants is present in the paper."
    220       },
    221       "affiliations_disclosed": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "Author affiliations are clearly stated: all four authors are from IIT Kharagpur. They are not evaluating a commercial product."
    225       },
    226       "funder_independent_of_outcome": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No funding source is disclosed, so independence cannot be determined."
    230       },
    231       "financial_interests_declared": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No competing interests or financial disclosures statement is present in the paper."
    235       }
    236     },
    237     "contamination": {
    238       "training_cutoff_stated": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "The paper evaluates a RAG pipeline system, not a pre-trained model's inherent knowledge on a benchmark. The model is used as a generation component, not tested for its pre-trained capabilities."
    242       },
    243       "train_test_overlap_discussed": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "The paper tests a system/pipeline, not a model's benchmark performance. Contamination in the traditional sense does not apply."
    247       },
    248       "benchmark_contamination_addressed": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No standard benchmark is used for model capability evaluation. The evaluation measures the quality of system-generated content from retrieved evidence."
    252       }
    253     },
    254     "human_studies": {
    255       "pre_registered": {
    256         "applies": true,
    257         "answer": false,
    258         "justification": "No pre-registration is mentioned for the human evaluation study."
    259       },
    260       "irb_or_ethics_approval": {
    261         "applies": true,
    262         "answer": false,
    263         "justification": "Section 12 (Ethical considerations) mentions voluntary participation and informed consent but does not mention IRB or ethics board approval."
    264       },
    265       "demographics_reported": {
    266         "applies": true,
    267         "answer": false,
    268         "justification": "Evaluators are described only as '8 individuals from diverse backgrounds' and '9 Wikipedia users including an expert.' No demographics such as age, gender, education level, or geographic distribution are reported."
    269       },
    270       "inclusion_exclusion_criteria": {
    271         "applies": true,
    272         "answer": false,
    273         "justification": "No explicit inclusion/exclusion criteria for annotators beyond vague descriptors like 'diverse backgrounds' and 'Wikipedia users.'"
    274       },
    275       "randomization_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "Not an experimental study with participants assigned to treatment/control conditions. All evaluators assess the same generated outputs."
    279       },
    280       "blinding_described": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No mention of whether evaluators were blinded to which system (REVERSUM vs baseline) produced the output being evaluated."
    284       },
    285       "attrition_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No report of whether all annotators completed all assigned evaluations or if any dropped out."
    289       }
    290     },
    291     "cost_and_practicality": {
    292       "inference_cost_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No inference cost, latency, or per-article processing time is reported despite the multi-staged pipeline requiring multiple LLM calls per section."
    296       },
    297       "compute_budget_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "NVIDIA A100 40GB is mentioned for fine-tuning, but total GPU hours for training and inference across all 102 narratives are not stated."
    301       }
    302     },
    303     "experimental_rigor": {
    304       "seed_sensitivity_reported": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "No mention of multiple random seeds or seed sensitivity analysis despite using stochastic generation (do_sample=True, temperature=0.7)."
    308       },
    309       "number_of_runs_stated": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "The number of experimental runs is never explicitly stated. Given stochastic generation parameters, results could vary across runs."
    313       },
    314       "hyperparameter_search_budget": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "The paper mentions varying chunk length ∈ {600, 800, 1000, 1200} and k ∈ {2, 3, 4, 5}, and a grid search for the threshold value, but the total number of configurations tried and the compute spent on search are not reported."
    318       },
    319       "best_config_selection_justified": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "A grid search in 0.1 increments is mentioned for the threshold, but there is no description of the validation procedure or selection criterion for choosing among chunk sizes and k values."
    323       },
    324       "multiple_comparison_correction": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "Mann-Whitney U-tests are run across four metrics for both B and C categories (8 tests total) but no correction for multiple comparisons is applied."
    328       },
    329       "self_comparison_bias_addressed": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "The authors implement all baselines (including a modified version of Banerjee and Mitra 2015b) and evaluate their own system with no acknowledgment of self-evaluation bias."
    333       },
    334       "compute_budget_vs_performance": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "REVERSUM uses a 4-stage pipeline with multiple LLM calls per section compared to standard RAG's single call, but the compute cost difference is never discussed."
    338       },
    339       "benchmark_construct_validity": {
    340         "applies": true,
    341         "answer": true,
    342         "justification": "The paper explicitly questions whether standard informativeness captures actual quality and proposes Calibrated Informativeness to address the gap between raw content length and meaningful content addition (Section 5.5, Appendix E)."
    343       },
    344       "scaffold_confound_addressed": {
    345         "applies": true,
    346         "answer": true,
    347         "justification": "The scaffold/pipeline IS the variable being tested. All methods use the same underlying LLM (Llama-3-8b-instruct), isolating the pipeline design as the independent variable. Table 15 additionally compares across LLMs with the same REVERSUM scaffold."
    348       }
    349     },
    350     "data_leakage": {
    351       "temporal_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether Llama-3's training data includes the Wikipedia articles being enhanced, which could allow the model to generate content from pre-training knowledge rather than retrieved chunks."
    355       },
    356       "feature_leakage_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No discussion of whether providing existing Wikipedia section content as input allows the model to draw on pre-training knowledge about these topics."
    360       },
    361       "non_independence_addressed": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No discussion of whether the FA category articles used for fine-tuning the continuation score model share structural similarities with the B/C category test articles."
    365       },
    366       "leakage_detection_method": {
    367         "applies": true,
    368         "answer": false,
    369         "justification": "No concrete leakage detection or prevention method is used. The evidence verification step checks for hallucination but does not address whether the model draws on pre-training knowledge vs. retrieved chunks."
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "REVERSUM outperforms the best performing baseline by 17 percentage points in integrability and 28.5 percentage points in informativeness based on crowd evaluation.",
    376       "evidence": "Manual evaluation by 8 annotators: 92% integrable (vs 75% for standard RAG), 96% informative (vs 67.5%). Cohen's κ = 0.84 for integrability (Section 6).",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "REVERSUM substantially outperforms standard RAG and other baselines in automatic quality metrics for both B and C class Wikipedia articles.",
    381       "evidence": "Table 1 shows REVERSUM achieves highest ∆CI (61.27 for B, 59.26 for C) and ∆Quality (15.84 for B, 13.00 for C). Mann-Whitney U-test shows statistical significance (p < 0.05) for CI and quality in both categories.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "Evidence verification is the most critical stage in REVERSUM — removing it causes the largest quality drop.",
    386       "evidence": "Table 3 ablation: removing evidence verification drops ∆Quality from 14.41 to 12.22 and ∆CI from 60.27 to 47.25, the largest drop among all ablation conditions (Section 8).",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Standard RAG produces content that is mostly a summary of existing Wikipedia content in 56% of cases.",
    391       "evidence": "Pilot study with 9 Wikipedia users evaluating 100 sections: 'in 56% cases the participants mentioned that the generated contents are just a summary of the already existing Wikipedia content' (Section 5.1).",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "REVERSUM achieves a GPT-4-based faithfulness score of 0.95 with respect to source material.",
    396       "evidence": "DeepEval-based faithfulness score of 0.95 reported, with all 50 test cases passing at a 0.75 threshold (Section 6, Appendix H.1). However, the GPT-4 version is unspecified and the sample comprises only 50 cases.",
    397       "supported": "weak"
    398     }
    399   ],
    400   "red_flags": [
    401     {
    402       "flag": "Small sample size without justification",
    403       "detail": "Only 102 personal narratives (53 B, 49 C class) with no justification for this number. Human evaluation uses only 100 randomly selected sections with 8 annotators, making statistical power questionable for the strong claims made."
    404     },
    405     {
    406       "flag": "No variance or multiple runs",
    407       "detail": "Despite using stochastic generation (temperature=0.7, do_sample=True), all results appear to be from single runs with no variance, standard deviation, or seed sensitivity reported. Results could differ substantially across runs."
    408     },
    409     {
    410       "flag": "Self-evaluation bias",
    411       "detail": "Authors implement all baselines (including a modified version of prior work) and evaluate their own system. No independent evaluation or acknowledgment of author-evaluation bias, which Lucic et al. (2018) showed systematically favors the proposing team."
    412     },
    413     {
    414       "flag": "Potentially misleading percentage claims",
    415       "detail": "Abstract states '17% improvement in integrability' and '28.5% improvement in informativeness.' These are percentage point differences (92% vs 75%, 96% vs 67.5%), not relative percentage improvements, which could mislead readers about the magnitude."
    416     },
    417     {
    418       "flag": "No blinding in human evaluation",
    419       "detail": "Human evaluators may have known which system (REVERSUM vs standard RAG) produced the outputs, potentially biasing assessments in favor of the proposed method."
    420     },
    421     {
    422       "flag": "Minimal annotator compensation and vague recruitment",
    423       "detail": "Annotators received only $4 Amazon gift vouchers each. Recruitment is described only as '8 individuals from diverse backgrounds' with no details on recruitment channels, potential bias, or qualifications beyond being 'diverse.'"
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    429       "authors": [
    430         "Patrick Lewis",
    431         "Ethan Perez",
    432         "Aleksandra Piktus"
    433       ],
    434       "year": 2020,
    435       "relevance": "Foundational RAG paper establishing the retrieval-augmented generation paradigm used as a core component and baseline in this work."
    436     },
    437     {
    438       "title": "Retrieval-based full-length Wikipedia generation for emergent events",
    439       "authors": [
    440         "Jiebin Zhang",
    441         "Eugene J. Yu",
    442         "Qinyu Chen"
    443       ],
    444       "year": 2024,
    445       "arxiv_id": "2402.18264",
    446       "relevance": "Most recent work on RAG-based Wikipedia article generation, representing state of the art in automatic Wikipedia content creation."
    447     },
    448     {
    449       "title": "Enabling large language models to generate text with citations",
    450       "authors": [
    451         "Tianyu Gao",
    452         "Howard Yen",
    453         "Jiatong Yu",
    454         "Danqi Chen"
    455       ],
    456       "year": 2023,
    457       "relevance": "Addresses LLM-generated text with citation grounding, directly relevant to generating verifiable content from source material."
    458     },
    459     {
    460       "title": "Atlas: Few-shot learning with retrieval augmented language models",
    461       "authors": [
    462         "Gautier Izacard",
    463         "Patrick Lewis"
    464       ],
    465       "year": 2023,
    466       "relevance": "Retrieval-augmented language model for few-shot learning, relevant to the RAG methodology used in REVERSUM."
    467     },
    468     {
    469       "title": "Demonstrate-search-predict: Composing retrieval and language models for knowledge-intensive NLP",
    470       "authors": [
    471         "Omar Khattab",
    472         "Keshav Santhanam"
    473       ],
    474       "year": 2022,
    475       "arxiv_id": "2212.14024",
    476       "relevance": "Framework for composing retrieval and language models, relevant to the multi-stage RAG approach in this paper."
    477     },
    478     {
    479       "title": "RAGAs: Automated evaluation of retrieval augmented generation",
    480       "authors": [
    481         "Shahul Es",
    482         "Jithin James"
    483       ],
    484       "year": 2024,
    485       "relevance": "RAG evaluation framework relevant to the DeepEval-based faithfulness measurement of generated content used in this paper."
    486     },
    487     {
    488       "title": "Language models are few-shot learners",
    489       "authors": [
    490         "Tom Brown",
    491         "Benjamin Mann"
    492       ],
    493       "year": 2020,
    494       "relevance": "GPT-3 paper demonstrating LLM few-shot capabilities that underpin the generation approaches evaluated here."
    495     },
    496     {
    497       "title": "Llama: Open and efficient foundation language models",
    498       "authors": [
    499         "Hugo Touvron",
    500         "Thibaut Lavril"
    501       ],
    502       "year": 2023,
    503       "arxiv_id": "2302.13971",
    504       "relevance": "The Llama model family forms the basis of the LLM (Llama-3-8b-instruct) used throughout REVERSUM's pipeline."
    505     },
    506     {
    507       "title": "FRUIT: Faithfully reflecting updated information in text",
    508       "authors": [
    509         "Robert Iv",
    510         "Alexandre Passos",
    511         "Sameer Singh",
    512         "Ming-Wei Chang"
    513       ],
    514       "year": 2022,
    515       "relevance": "Directly addresses generating grounded text from evidence to update existing text, the core task this paper tackles."
    516     },
    517     {
    518       "title": "Attributed question answering: Evaluation and modeling for attributed large language models",
    519       "authors": [
    520         "Bernd Bohnet",
    521         "Vinh Q. Tran"
    522       ],
    523       "year": 2023,
    524       "arxiv_id": "2212.08037",
    525       "relevance": "Studies attribution and grounding in LLM outputs, relevant to REVERSUM's evidence verification approach."
    526     },
    527     {
    528       "title": "Assisting in writing Wikipedia-like articles from scratch with large language models",
    529       "authors": [
    530         "Yijia Shao",
    531         "Yucheng Jiang"
    532       ],
    533       "year": 2024,
    534       "relevance": "Uses LLMs for Wikipedia article generation from scratch, an alternative approach to the section enhancement task studied here."
    535     }
    536   ],
    537   "engagement_factors": {
    538     "practical_relevance": {
    539       "score": 1,
    540       "justification": "Niche application to Wikipedia content enhancement; most practitioners cannot directly apply this to their work."
    541     },
    542     "surprise_contrarian": {
    543       "score": 0,
    544       "justification": "Multi-stage RAG outperforming single-stage RAG is expected and confirms conventional wisdom about verification steps."
    545     },
    546     "fear_safety": {
    547       "score": 0,
    548       "justification": "No safety or security concerns raised by this work."
    549     },
    550     "drama_conflict": {
    551       "score": 0,
    552       "justification": "No controversy or conflict with prior work or industry claims."
    553     },
    554     "demo_ability": {
    555       "score": 2,
    556       "justification": "Code and data released on GitHub; a motivated user could run the pipeline."
    557     },
    558     "brand_recognition": {
    559       "score": 0,
    560       "justification": "IIT Kharagpur is respected academically but not a widely recognized AI lab in mainstream tech discourse."
    561     }
    562   }
    563 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs