scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26304B)
      1 {
      2   "paper": {
      3     "title": "Retrieval-Augmented Generation for Electrocardiogram-Language Models",
      4     "authors": [
      5       "Xiaoyu Song",
      6       "William Han",
      7       "Tony Chen",
      8       "Chaojing Duan",
      9       "Michael A. Rosenberg",
     10       "Emerson Liu",
     11       "Ding Zhao"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv",
     15     "arxiv_id": "2510.00261",
     16     "doi": "10.48550/arXiv.2510.00261"
     17   },
     18   "scan_version": 3,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval"],
     21   "key_findings": "Adding a RAG pipeline to Electrocardiogram-Language Models (ELMs) consistently improves NLG performance across three public ECG datasets and multiple ELM architectures. Ablation studies show that using RAG during both training and inference yields the best results, top-1 retrieval marginally outperforms larger k values, RAG content placement (system prompt vs. user query) has negligible effect, and accurate retrieval content is necessary for gains (noise injection nullifies the benefit).",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper provides a GitHub URL: 'Our code is available at: https://github.com/willxxy/ECG-Bench' in the abstract."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "All datasets used are publicly available: MIMIC-IV-ECG, PTB-XL, ECG-QA, and ECG-Chat Instruct. The paper cites their public sources."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper mentions 'NVIDIA RTX A6000 48 GB GPUs' and 'HuggingFace Transformers API' but provides no requirements.txt, Dockerfile, or detailed library version listing sufficient to recreate the environment."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions are provided in the paper. While code is released, the paper itself does not include commands or a 'Reproducing Results' section."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "All tables report ± notation across 3 random seeds (e.g., '22.85 ± 0.18' in Table 1)."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper makes comparative claims (e.g., 'RAG consistently improves performance') but reports no statistical significance tests (no p-values, t-tests, or bootstrap tests). Differences are assessed only by comparing point estimates."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Tables report both baseline and RAG-augmented numbers with sufficient context to assess magnitude (e.g., BLEU-4 from 22.85% to 38.10% on ECG-Chat Instruct, accuracy from 27.21% to 57.77% on MIMIC-IV ECG in Table 1)."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The paper uses 400,000 training and 20,000 test instances but provides no justification for these sizes and no power analysis."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Standard deviations across 3 random seeds are reported in all results tables (e.g., '38.10 ± 0.05' in Table 1)."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Table 1 compares ECG-Byte with and without RAG. Figure 2 extends to three additional ELM architectures (Merl, SigLIP Signal, SigLIP Image) with and without RAG."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "ECG-Byte [12] (2024), ECG-Chat [4] (2024), and the SigLIP/Merl variants from [14] (2025) are all recent and represent the state of the art for ELMs."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Comprehensive ablation studies in Tables 2-5: RAG during training/inference combinations, varying top-k, RAG placement in prompt, and noise injection into retrieved content."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Five metrics are reported: BLEU-4, ROUGE-L, METEOR, BERTScore F1, and Accuracy (Table 1)."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No human evaluation of generated text quality is included. All evaluation relies on automated NLG metrics. For a system generating clinical ECG interpretations, human expert evaluation would be valuable."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 3.1 states: 'We inference on a separate test set of size 20,000 instances.'"
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Table 1 provides per-dataset breakdowns across three datasets (ECG-Chat Instruct, ECG-QA PTB-XL, ECG-QA MIMIC-IV ECG) rather than a single aggregate."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "No error analysis, qualitative failure examples, or discussion of where the approach breaks down. The paper only reports aggregate metrics."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Table 2 shows training-only RAG hurts performance vs. baseline (BLEU-4: 20.08 vs. 22.85). Table 3 shows increasing k doesn't help. Table 5 shows noisy RAG is worse than clean RAG on accuracy (8.19% vs. 18.27%)."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The abstract claims 'ELMs with RAG consistently improves performance over non-RAG baselines,' which is supported by Table 1 and Figure 2. Claims about ablation study insights are supported by Tables 2-5."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Causal claims about RAG improving performance are supported by controlled ablation studies (Tables 2-5) with single-variable manipulation: RAG on/off, varying k, placement, and noise injection."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Claims are bounded to ELMs and tested settings. The title specifies 'Electrocardiogram-Language Models,' and the paper tests on three specific datasets and four architectures. Discussion states 'showing consistent improvements in NLG across multiple datasets and architectures' without overclaiming beyond ECG tasks."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "No alternative explanations are discussed for why RAG helps. The paper does not consider confounds such as whether the gains come from additional context length, memorization of training examples, or other factors."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper measures NLG metrics (BLEU-4, ROUGE-L, METEOR, BERTScore, Accuracy) and frames them as evidence that RAG improves ELM performance. However, it does not discuss the gap between these automated metrics and actual clinical utility or diagnostic accuracy, which is what would matter in practice for ECG interpretation."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Specific model checkpoints are identified: 'Llama-3.2-1B-Instruct checkpoint' (Section 2.3), 'siglip-base-patch16-224 checkpoint from HuggingFace' (Section 3.1), 'Res-Net101 backbone' (Section 3.1)."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "The prompt structure is described (system prompt + retrieved diagnostic reports + ECG tokens + query) in Figure 1 and Section 2.2, but the actual prompt text (system prompt content, formatting) is not provided."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 3.1 reports: Adam optimizer, learning rate 1e-4, weight decay 1e-2, batch size 2, 1 epoch, 400k samples, LoRA rank 16, input length 1024, BPE 3500 merges, FAISS IndexIVFFlat with L2 distance."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No agentic scaffolding is used. The system is a standard RAG-augmented fine-tuned LLM pipeline without agent loops, tool use, or retry logic."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 2.1 documents preprocessing in detail: standardization to PTB-XL lead configuration, powerline noise removal (notch filter Q=30 at 50/60 Hz), Butterworth bandpass filter (0.5-100 Hz), baseline drift correction (0.05 Hz high-pass), wavelet denoising (Daubechies-6, level 4, MAD soft thresholding), resampling to 250 Hz, 5-second windowing."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "There is no dedicated limitations section. Section 5 ('Discussion and Conclusions') summarizes findings and 'insights for future work' but does not discuss limitations."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No threats to validity are discussed anywhere in the paper."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do NOT show or what settings are excluded."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "All datasets used are publicly available: MIMIC-IV-ECG [9], PTB-XL [10], ECG-QA [11], and ECG-Chat Instruct. The raw ECG signals and associated data can be independently accessed."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 2.1 describes the data sources: MIMIC-IV-ECG and PTB-XL datasets adapted for NLG, ECG-Chat Instruct dataset with conversational data, and ECG-QA dataset with ChatGPT-generated Q&A pairs."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants. All data comes from standard public ECG datasets."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "The paper describes preprocessing steps but does not document the full pipeline with counts at each stage. It states '400,000 randomly sampled ECGs' for training and '20,000 instances' for test without explaining total available data, sampling criteria, or filtering losses."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding source is disclosed. The acknowledgments section only mentions a collaboration with the Mario Lemieux Center for Heart Rhythm Care at Allegheny General Hospital."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are clearly listed: Carnegie Mellon University, Columbia University, Allegheny Health Network, University of Colorado."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No funding is disclosed, so independence of funder from outcome cannot be assessed."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement is provided."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "The paper uses Llama-3.2-1B-Instruct as the base model but does not state its training data cutoff date."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No discussion of whether the pre-trained Llama model's training data could include any of the ECG datasets or their textual descriptions."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "The ECG-QA dataset includes ChatGPT-generated Q&A pairs from publicly available ECG datasets. No discussion of whether these or similar content could appear in Llama's training data."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this study."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No inference cost, latency, or per-example timing is reported despite the method adding a FAISS retrieval step to each inference."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "The paper mentions 'NVIDIA RTX A6000 48 GB GPUs' but does not state total GPU hours, training time, or overall computational budget."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": true,
    304         "justification": "All results are averaged over 3 random seeds with standard deviations reported (e.g., Tables 1-5)."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "Section 4.2 and Table 1 header state: 'Mean baseline comparisons over 3 random seeds' and 'All results are averaged over 3 random seeds.'"
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No hyperparameter search is described. The paper states 'using default hyperparameters' without documenting whether any search was performed or why defaults were appropriate."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The paper uses default hyperparameters without justification. LoRA rank=16, learning rate 1e-4, and other choices are not justified or compared against alternatives."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "Many comparisons are made across Tables 1-5 but no statistical tests are performed, let alone corrections for multiple comparisons."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "Authors evaluate their own RAG pipeline against baselines they implemented. No acknowledgment of potential bias from self-evaluation."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "RAG adds retrieval overhead (FAISS search, additional context in prompts) but no analysis of performance vs. compute cost is provided."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "The paper uses NLG metrics (BLEU-4, ROUGE-L, METEOR, BERTScore, Accuracy) without discussing whether these metrics actually measure meaningful ECG interpretation quality or clinical utility."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": false,
    343         "answer": false,
    344         "justification": "No scaffolding is involved. RAG is the intervention being tested, and all comparisons use the same base architecture with controlled RAG variations."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether ECG datasets or their textual descriptions existed before Llama-3.2's training data cutoff."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether the RAG retrieval mechanism could leak test-time information (e.g., if the RAG database contains training data that overlaps with test queries)."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of whether training and test ECG instances share structural similarities (e.g., from the same patients, same recording sessions, or same hospital systems)."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No concrete leakage detection or prevention method is described."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "ELMs with RAG consistently improve NLG performance over non-RAG baselines across three datasets.",
    373       "evidence": "Table 1 shows improvements on ECG-Chat Instruct (BLEU-4: 22.85→38.10), ECG-QA PTB-XL (21.07→21.46), and ECG-QA MIMIC-IV (15.52→49.07). Figure 2 shows gains across four ELM architectures.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "Using RAG during both training and inference yields the best performance.",
    378       "evidence": "Table 2 shows RAG at both stages (BLEU-4: 38.10, Acc: 18.27) outperforms training-only (20.08, 7.17), inference-only (32.33, 14.96), and no RAG (22.85, 9.65). Only tested on ECG-Chat Instruct.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Retrieving fewer items (top-1) marginally outperforms larger k values.",
    383       "evidence": "Table 3: k=1 (38.10 BLEU-4, 18.27 Acc) vs k=5 (37.99, 18.07) vs k=10 (36.91, 17.20). Differences are small. Only tested on ECG-Chat Instruct.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "RAG content placement (system prompt vs. user query) produces similar outcomes.",
    388       "evidence": "Table 4: system prompt (38.08 BLEU-4, 18.11 Acc) vs user query (38.03, 18.17). Differences within noise. Only tested on ECG-Chat Instruct.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Accurate retrieval content is necessary for performance gains; noise injection nullifies benefits.",
    393       "evidence": "Table 5: RAG with noise (23.48 BLEU-4, 8.19 Acc) performs similarly to no RAG (22.85, 9.65), while proper RAG yields 38.10 BLEU-4 and 18.27 Acc.",
    394       "supported": "strong"
    395     }
    396   ],
    397   "red_flags": [
    398     {
    399       "flag": "No significance tests",
    400       "detail": "All comparative claims ('RAG consistently improves') are based on comparing point estimates with standard deviations but no formal statistical tests. Some improvements on ECG-QA PTB-XL are marginal (BLEU-4: 21.07→21.46) and could be within noise."
    401     },
    402     {
    403       "flag": "No limitations section",
    404       "detail": "The paper has no limitations section or any discussion of where the approach might fail, what its scope boundaries are, or threats to validity."
    405     },
    406     {
    407       "flag": "Ablations on single dataset only",
    408       "detail": "All ablation studies (Tables 2-5) use only the ECG-Chat Instruct dataset. The generalizability of the ablation findings to the other two datasets is unknown, especially given the highly variable gains across datasets in Table 1."
    409     },
    410     {
    411       "flag": "No clinical validation or human evaluation",
    412       "detail": "For a system generating clinical ECG interpretations, the evaluation relies entirely on automated NLG metrics. No clinician reviewed the generated reports for clinical accuracy or safety."
    413     },
    414     {
    415       "flag": "Construct validity of metrics unexamined",
    416       "detail": "BLEU-4 and ROUGE-L measure n-gram overlap with reference text, not clinical correctness. A generated report could score high on these metrics while being clinically misleading, or score low while being clinically valid but worded differently."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "ECG-Chat: A Large ECG-Language Model for Cardiac Disease Diagnosis",
    422       "authors": ["Yubao Zhao", "Tian Zhang", "Xu Wang", "Puyu Han", "Tong Chen", "Linlin Huang", "Youzhu Jin", "Jiaju Kang"],
    423       "year": 2024,
    424       "relevance": "First work to extend RAG to free-form NLG for ELMs; directly compared baseline in this paper."
    425     },
    426     {
    427       "title": "RankRAG: Unifying Context Ranking with Retrieval-Augmented Generation in LLMs",
    428       "authors": ["Yue Yu", "Wei Ping", "Zihan Liu", "Boxin Wang", "Jiaxuan You", "Chao Zhang", "Mohammad Shoeybi", "Bryan Catanzaro"],
    429       "year": 2024,
    430       "relevance": "Key reference on RAG design for LLMs, cited for insights on how increasing k does not necessarily improve performance."
    431     },
    432     {
    433       "title": "ECG-Byte: A Tokenizer for End-to-End Generative Electrocardiogram Language Modeling",
    434       "authors": ["William Han", "Chaojing Duan", "Michael A. Rosenberg", "Emerson Liu", "Ding Zhao"],
    435       "year": 2024,
    436       "relevance": "Primary ELM architecture used in all experiments; BPE-based ECG tokenization approach."
    437     },
    438     {
    439       "title": "Q-Heart: ECG Question Answering via Knowledge-Informed Multimodal LLMs",
    440       "authors": ["Hung Manh Pham", "Jialu Tang", "Aaqib Saeed", "Dong Ma"],
    441       "year": 2025,
    442       "relevance": "Related RAG-augmented ECG QA system using RAG during instruction tuning."
    443     },
    444     {
    445       "title": "Zero-Shot ECG Diagnosis with Large Language Models and Retrieval-Augmented Generation",
    446       "authors": ["Han Yu", "Peikun Guo", "Akane Sano"],
    447       "year": 2023,
    448       "relevance": "Prior work applying RAG with LLMs for ECG diagnosis using handcrafted features."
    449     },
    450     {
    451       "title": "ECG-QA: A Comprehensive Question Answering Dataset Combined with Electrocardiogram",
    452       "authors": ["Jungwoo Oh", "Gyubok Lee", "Seongsu Bae", "Joon-myoung Kwon", "Edward Choi"],
    453       "year": 2023,
    454       "relevance": "Public ECG question-answering benchmark dataset used in evaluation."
    455     },
    456     {
    457       "title": "Signal, Image, or Symbolic: Exploring the Best Input Representation for Electrocardiogram-Language Models through a Unified Framework",
    458       "authors": ["William Han", "Chaojing Duan", "Zhepeng Cen", "Yihang Yao", "Xiaoyu Song"],
    459       "year": 2025,
    460       "relevance": "Unified ELM framework providing the baseline architectures (Merl, SigLIP variants) compared in Figure 2."
    461     },
    462     {
    463       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    464       "authors": ["Edward J. Hu", "Yelong Shen", "Phillip Wallis", "Zeyuan Allen-Zhu", "Yuanzhi Li", "Shean Wang", "Lu Wang", "Weizhu Chen"],
    465       "year": 2021,
    466       "relevance": "Parameter-efficient fine-tuning method used for all ELM training in this paper."
    467     },
    468     {
    469       "title": "Visual Instruction Tuning",
    470       "authors": ["Haotian Liu", "Chunyuan Li", "Qingyang Wu", "Yong Jae Lee"],
    471       "year": 2023,
    472       "relevance": "LLaVA approach adopted for encoder-based ELM architectures in the paper."
    473     },
    474     {
    475       "title": "Retrieval-Augmented Generation for Large Language Models: A Survey",
    476       "authors": ["Yunfan Gao", "Yun Xiong", "Xinyu Gao", "Kangxiang Jia", "Jinliu Pan", "Yuxi Bi", "Yi Dai", "Jiawei Sun", "Meng Wang", "Haofen Wang"],
    477       "year": 2024,
    478       "relevance": "Comprehensive RAG survey cited for RAG design during pretraining/finetuning practices."
    479     }
    480   ],
    481   "engagement_factors": {
    482     "practical_relevance": {
    483       "score": 2,
    484       "justification": "Open-source RAG pipeline for ECG interpretation with code released; applicable to medical AI practitioners but limited to the ECG domain."
    485     },
    486     "surprise_contrarian": {
    487       "score": 0,
    488       "justification": "RAG improving NLG performance is expected and well-established in the broader LLM literature."
    489     },
    490     "fear_safety": {
    491       "score": 0,
    492       "justification": "No safety or risk concerns are raised; the paper focuses on performance improvement."
    493     },
    494     "drama_conflict": {
    495       "score": 0,
    496       "justification": "No controversy or conflict; straightforward method contribution."
    497     },
    498     "demo_ability": {
    499       "score": 2,
    500       "justification": "Code released on GitHub (ECG-Bench), though requires ECG datasets and GPU hardware to run."
    501     },
    502     "brand_recognition": {
    503       "score": 1,
    504       "justification": "Carnegie Mellon University is well-known but the specific lab and authors are not widely recognized in the broader AI community."
    505     }
    506   }
    507 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs