scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (34135B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Enhancing LLM Factual Accuracy with RAG to Counter Hallucinations: A Case Study on Domain-Specific Queries in Private Knowledge-Bases",
      6     "authors": [
      7       "Jiarui Li",
      8       "Ye Yuan",
      9       "Zehua Zhang"
     10     ],
     11     "year": 2024,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2403.10446",
     14     "doi": "10.48550/arXiv.2403.10446"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The abstract claims the system is effective at generating more accurate answers (supported by Table 1 improvements) and reveals limitations of fine-tuning with small-scale datasets (supported by the negative fine-tuning results in Table 1 and Section 5.2 discussion).",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Causal claims like 'finetuning the embedding model yielded improvements' are supported by the ablation design in Table 1, which uses controlled single-variable manipulation across configurations.",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The title 'Enhancing LLM Factual Accuracy with RAG' implies general applicability, but the study tests only CMU-specific queries with LLaMA-2-7B on a synthetically generated dataset. The abstract frames the work more narrowly as 'domain-specific,' but the title overgeneralizes.",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No discussion of alternative explanations for the results. For example, the fine-tuning degradation could be due to catastrophic forgetting, prompt format mismatch, or dataset quality — the paper speculates about causes but does not systematically consider alternatives.",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper measures Recall, F1, Cosine Similarity, and BLEU and frames these as demonstrating 'factual accuracy,' but does not discuss the gap between these proxy metrics and actual factual accuracy. BLEU and cosine similarity measure lexical/semantic similarity to references, not factual correctness.",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No dedicated limitations or threats-to-validity section exists. Limitations are mentioned in passing within Sections 5.2 and 6, but there is no substantive dedicated discussion.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Specific threats are discussed inline: the dataset is 'possibly small in size and relatively biased' (Section 5.2), fine-tuning 'may reduce the model's performance in language generation' (Section 5.2), and the model's 7B parameter size limits its capacity (Section 5.3).",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No explicit scope boundaries are stated. The paper does not articulate what the results do NOT show or what populations/settings are excluded from the claims.",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding information is mentioned anywhere in the paper. No acknowledgments section with grants or sponsors.",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors are identified as being from the Information Networking Institute, Carnegie Mellon University. Since they evaluate a CMU-specific system and are CMU students, the affiliation is relevant and disclosed.",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No funding is disclosed; this appears to be an unfunded student project at Carnegie Mellon University.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial interests statement is present in the paper.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "Core terms like 'factual accuracy', 'hallucination', and 'domain-specific' are used throughout but never formally defined. RAG is cited (Gao et al. 2023) but not defined in the paper.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 1 explicitly lists three contributions: specialized dataset creation, RAG pipeline development, and experiment-based evaluation with ablations.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "Introduction mentions relevant areas (ICL, RAG, hallucination) with citations but lacks a dedicated related work section or discussion of how this work differs from or builds on existing RAG systems.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": false,
    122           "justification": "The abstract states 'Our code and models are available on Github' but no repository URL is provided anywhere in the paper. Without a verifiable link, this cannot be confirmed.",
    123           "source": "opus"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "The dataset was curated from CMU websites and annotated with WizardLM, but no download link or dataset archive is provided. The paper does not release the 34,781 QA pairs or the crawled documents.",
    129           "source": "opus"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "The paper specifies model names (Llama-2-7b-chat-hf, mxbai-embed-large-v1, bge-reranker-large) and some training parameters, but provides no requirements.txt, Dockerfile, or detailed environment setup listing library versions and dependencies.",
    135           "source": "opus"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No step-by-step reproduction instructions are provided. The paper describes the system at a high level but lacks specific commands or a README-style guide for replicating the experiments.",
    141           "source": "opus"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "Table 1 reports standard deviations in parentheses for Recall and F1 across 4 independent runs, e.g., '0.361 (0.069)' for baseline Recall.",
    149           "source": "opus"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No statistical significance tests are reported. The paper compares configurations by raw numbers in Table 1 without any p-values, t-tests, or other statistical tests to support claims that one configuration outperforms another.",
    155           "source": "opus"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "The paper presents raw metric values in Table 1 but does not report effect sizes or percentage improvements. The reader must compute the magnitude of improvement from the raw numbers.",
    161           "source": "opus"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The paper uses 128 QA pairs per evaluation run (randomly sampled from 6,957 test pairs) across 4 runs, but provides no justification for these choices and no power analysis.",
    167           "source": "opus"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "Table 1 reports standard deviation across 4 independent runs: 'Both score and standard deviation are derived from 4 independent runs.'",
    173           "source": "opus"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Table 1 includes a 'Baseline (w/o RAG)' configuration and progressively adds components (RAG pipeline, embedding fine-tuning, core model fine-tuning).",
    181           "source": "opus"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": false,
    186           "justification": "The only baseline is the authors' own system without RAG (raw LLaMA-2). No comparison against other RAG systems, established QA systems, or external baselines is provided.",
    187           "source": "opus"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Table 1 presents a clear ablation study showing performance under 5 configurations: baseline, raw RAG, +embedding fine-tuning, +core model fine-tuning, and both fine-tuned.",
    193           "source": "opus"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Four metrics are used: Recall, F1 Score, Cosine Similarity, and BLEU Score (Table 1, Figure 4).",
    199           "source": "opus"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": false,
    204           "justification": "No systematic human evaluation of system outputs is conducted. The case study (Section 5.3) shows three qualitative examples but lacks structured human ratings. The Cohen's Kappa evaluation is for dataset annotation quality, not system output quality.",
    205           "source": "opus"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "Section 3.3 states '27,824 pairs are used as training data and 6,957 pairs are used as testing data after random split.' Evaluation samples from the 'human evaluated test set.'",
    211           "source": "opus"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": false,
    216           "justification": "Only aggregate metrics are reported across all test examples. No breakdown by question type, difficulty, topic, or data source (html vs pdf vs papers).",
    217           "source": "opus"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Section 5.3 case study discusses failure modes: the model produces filler tokens ('context:', 'answer:', '<INSTR>'), generates repetitive text, and restates retrieved context rather than paraphrasing. Section 5.2 also notes the fine-tuning degradation.",
    223           "source": "opus"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Table 1 shows that fine-tuning the core model hurts F1 (0.289→0.211), Cosine (0.577→0.502), and BLEU (0.102→0.056). The paper explicitly discusses this: 'The cumulative effect of finetuning both models produced a drop in F1 score.'",
    229           "source": "opus"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Specific HuggingFace model IDs are provided: 'meta-llama/Llama-2-7b-chat-hf', 'mixedbread-ai/mxbai-embed-large-v1', 'BAAI/bge-reranker-large', and 'sentence-transformers/all-MiniLM-L6-v2'.",
    237           "source": "opus"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Full prompt text is provided in Appendix B: B.1 (dataset generation prompt), B.2 (core model generation prompt), and B.3 (core model fine-tuning prompt).",
    243           "source": "opus"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "Training hyperparameters are well-documented (LoRA rank 16, INT4, 5 epochs, 1000 max steps, batch size 8, learning rate 2e-4, embedding 10 epochs). However, inference-time parameters (temperature, top-p, max tokens) are not reported, and these significantly affect output quality.",
    249           "source": "opus"
    250         },
    251         "scaffolding_described": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "No agentic scaffolding is used. The system is a standard RAG pipeline (retriever → reranker → generator) without retry logic, tool use, feedback loops, or agent-like behavior.",
    255           "source": "opus"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Section 3 describes preprocessing in detail: web crawling with Selenium/BeautifulSoup, BFS depth-2 link exploration, HTML/JavaScript removal, keyword-based filtering (Appendix C), removal of files <200 characters and 'Page_not_found' titles, chunking into 1000-word segments.",
    261           "source": "opus"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "Neither the crawled web data nor the generated QA pairs are made available for independent verification. No download links or data archives are provided.",
    269           "source": "opus"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Section 3 describes data collection in detail: web crawling from CMU websites using Selenium/BeautifulSoup, BFS link exploration, PDF download from CMU sites, research papers fetched via Semantic Scholar API using LTI faculty names, filtered to 2023 open-sourced papers.",
    275           "source": "opus"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants in the study. The two annotators for Cohen's Kappa evaluation appear to be the authors. Data comes from automated web crawling and LLM-generated annotations.",
    281           "source": "opus"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "Section 3 documents the full pipeline: crawling → HTML/PDF extraction → text extraction → keyword filtering → size filtering → chunking (1000 words) → WizardLM annotation (10 QA pairs per chunk) → producing 34,781 QA pairs → 80/20 train/test split.",
    287           "source": "opus"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "The paper uses LLaMA-2 but never states its training data cutoff date. CMU web pages could be in LLaMA-2's pretraining data, making this information critical.",
    295           "source": "opus"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "No discussion of whether LLaMA-2's pretraining data might include CMU web pages that were used to generate the test QA pairs. Also no discussion of overlap between the automatically generated train and test QA pairs from the same source documents.",
    301           "source": "opus"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "CMU web pages were publicly available before LLaMA-2's training cutoff and could be in its pretraining data. The paper does not address this contamination risk, which would inflate the baseline performance.",
    307           "source": "opus"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants in the study. The evaluations are entirely automated.",
    315           "source": "opus"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants in the study.",
    321           "source": "opus"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants in the study.",
    327           "source": "opus"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants in the study.",
    333           "source": "opus"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants in the study.",
    339           "source": "opus"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants in the study.",
    345           "source": "opus"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants in the study.",
    351           "source": "opus"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "No inference cost, latency, or per-query time is reported. The paper does not quantify how long the RAG pipeline takes to answer a query or the computational cost per inference.",
    359           "source": "opus"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "While training details are given (1000 max steps, batch size 8, 5 epochs), no GPU hours, total training time, hardware specifications, or total compute budget are reported.",
    365           "source": "opus"
    366         }
    367       },
    368       "experimental_rigor": {
    369         "seed_sensitivity_reported": {
    370           "applies": true,
    371           "answer": false,
    372           "justification": "The 4 independent runs use different random samples of 128 QA pairs, not different random seeds for model training or initialization. No seed sensitivity analysis is performed.",
    373           "source": "opus"
    374         },
    375         "number_of_runs_stated": {
    376           "applies": true,
    377           "answer": true,
    378           "justification": "Table 1 caption states: 'Both score and standard deviation are derived from 4 independent runs. Each run randomly samples 128 QA pairs from our human evaluated test set.'",
    379           "source": "opus"
    380         },
    381         "hyperparameter_search_budget": {
    382           "applies": true,
    383           "answer": false,
    384           "justification": "No hyperparameter search is described. Fixed hyperparameters are used (LoRA rank 16, learning rate 2e-4, etc.) with no mention of how they were selected or how many configurations were tried.",
    385           "source": "opus"
    386         },
    387         "best_config_selection_justified": {
    388           "applies": true,
    389           "answer": false,
    390           "justification": "The ablation study shows all configurations but does not justify which configuration is recommended or how it would be selected. Different configurations win on different metrics (e.g., +Embedding wins on F1/Cosine/BLEU but +Both wins on Recall).",
    391           "source": "opus"
    392         },
    393         "multiple_comparison_correction": {
    394           "applies": false,
    395           "answer": false,
    396           "justification": "No statistical tests are performed, so multiple comparison correction is structurally inapplicable.",
    397           "source": "opus"
    398         },
    399         "self_comparison_bias_addressed": {
    400           "applies": true,
    401           "answer": false,
    402           "justification": "The authors evaluate their own system against their own baseline and ablations. No acknowledgment of author-evaluation bias or discussion of independent evaluation.",
    403           "source": "opus"
    404         },
    405         "compute_budget_vs_performance": {
    406           "applies": true,
    407           "answer": false,
    408           "justification": "The RAG pipeline and fine-tuned models use more compute than the baseline, but performance is not discussed as a function of compute budget. The core model fine-tuning uses additional compute (LoRA, 1000 steps) but actually hurts several metrics.",
    409           "source": "opus"
    410         },
    411         "benchmark_construct_validity": {
    412           "applies": true,
    413           "answer": false,
    414           "justification": "The paper evaluates on a custom QA dataset using Recall, F1, Cosine Similarity, and BLEU, claiming to measure 'factual accuracy.' No discussion of whether these metrics actually capture factual accuracy, or whether the WizardLM-generated QA pairs are valid proxies for real user queries.",
    415           "source": "opus"
    416         },
    417         "scaffold_confound_addressed": {
    418           "applies": false,
    419           "answer": false,
    420           "justification": "The RAG pipeline IS the system being evaluated — it is the intervention, not a confound. The ablation study appropriately isolates components within the same pipeline.",
    421           "source": "opus"
    422         }
    423       },
    424       "data_leakage": {
    425         "temporal_leakage_addressed": {
    426           "applies": true,
    427           "answer": false,
    428           "justification": "No discussion of temporal leakage. CMU web pages and research papers from 2023 were crawled to create the dataset, but the paper does not consider whether LLaMA-2's pretraining data (collected before July 2023) might already include this content.",
    429           "source": "opus"
    430         },
    431         "feature_leakage_addressed": {
    432           "applies": true,
    433           "answer": false,
    434           "justification": "No discussion of feature leakage. The QA pairs are generated from the same document chunks that serve as the retrieval corpus, creating a potential circularity: the 'correct' answer was generated from the same context that RAG retrieves.",
    435           "source": "opus"
    436         },
    437         "non_independence_addressed": {
    438           "applies": true,
    439           "answer": false,
    440           "justification": "No discussion of train/test independence. Training and test QA pairs are randomly split from the same pool of 34,781 WizardLM-generated pairs from the same document corpus. QA pairs from the same document chunk could appear in both splits.",
    441           "source": "opus"
    442         },
    443         "leakage_detection_method": {
    444           "applies": true,
    445           "answer": false,
    446           "justification": "No leakage detection or prevention method is used. No decontamination, membership inference, temporal splits, or independence verification is performed.",
    447           "source": "opus"
    448         }
    449       }
    450     }
    451   },
    452   "claims": [
    453     {
    454       "claim": "RAG augmentation improves QA accuracy on CMU/LTI domain-specific queries",
    455       "evidence": "Table 1: baseline w/o RAG (Recall 0.361) vs raw RAG pipeline (0.409), ablations show further improvements",
    456       "supported": "moderate"
    457     },
    458     {
    459       "claim": "Fine-tuning the embedding model improves retrieval quality",
    460       "evidence": "Table 1: raw RAG F1 0.289 vs +embedding F1 0.304; Recall 0.409→0.437",
    461       "supported": "moderate"
    462     },
    463     {
    464       "claim": "Fine-tuning the core LLaMA-2 model on small CMU dataset hurts generation quality",
    465       "evidence": "Table 1: +embedding+core F1 (0.219) worse than +embedding alone (0.304); Section 5.2 discusses overfitting/dataset bias",
    466       "supported": "strong"
    467     },
    468     {
    469       "claim": "The system successfully answers CMU-specific questions with external context",
    470       "evidence": "Section 5.3 case studies show correct answers (e.g., Fall 2024 semester start date, SAMA memory efficiency)",
    471       "supported": "moderate"
    472     },
    473     {
    474       "claim": "Small-scale, domain-specific datasets are insufficient for improving LLM generation without risk of degradation",
    475       "evidence": "Table 1 and Section 5.2 show fine-tuning on 27K QA pairs reduced F1; Sections 5.2-5.3 attribute this to dataset size, bias, and prompt drift",
    476       "supported": "moderate"
    477     },
    478     {
    479       "claim": "Cohen's Kappa = 0.67 indicates substantial inter-annotator agreement on dataset annotation",
    480       "evidence": "Section 3.4 calculates κ=0.67 (83.33% agreement) between two annotators on subset",
    481       "supported": "strong"
    482     }
    483   ],
    484   "methodology_tags": [
    485     "benchmark-eval",
    486     "case-study",
    487     "system-evaluation"
    488   ],
    489   "key_findings": "A RAG system augmenting LLaMA-2 with domain-specific CMU/LTI context improved QA accuracy (Recall +25%, F1 +12% over baseline). Fine-tuning the embedding model further improved retrieval quality, but fine-tuning the core generation model on the small 27K-pair dataset hurt overall F1 score, suggesting that prompt format drift and overfitting overwhelm dataset-specific improvements. The system demonstrates practical potential for knowledge-intensive domain-specific QA but reveals limitations of fine-tuning on small, biased datasets.",
    490   "red_flags": [
    491     {
    492       "flag": "Insufficient test sample size",
    493       "detail": "Only 128 QA pairs sampled per evaluation run despite 6,957 test pairs available. No justification or power analysis provided."
    494     },
    495     {
    496       "flag": "No statistical significance testing",
    497       "detail": "Comparative claims made (e.g., embedding improves over baseline) but no p-values or significance tests reported. Overlapping error bars in Table 1 make some claims uncertain."
    498     },
    499     {
    500       "flag": "No external baselines",
    501       "detail": "Only compared against in-house configurations, not other RAG systems, other retrieval augmentation approaches, or literature baseline methods."
    502     },
    503     {
    504       "flag": "Evaluation metric-outcome mismatch",
    505       "detail": "Claims to measure 'factual accuracy' but evaluates Recall, F1, Cosine Similarity, BLEU—metrics that measure retrieval+generation quality, not factual correctness (would need human judgment)."
    506     },
    507     {
    508       "flag": "Fine-tuning performance degradation unresolved",
    509       "detail": "Core model fine-tuning reduced F1 score but no attempt made to diagnose root cause or implement fixes; presented only as a limitation."
    510     },
    511     {
    512       "flag": "Prompt format drift acknowledged but not addressed",
    513       "detail": "Sections 5.2-5.3 acknowledge that the fine-tuned model outputs tokens like 'context:', 'answer:', '<INSTR>' due to a prompt template mismatch with pretraining, but the evaluation metrics were not adapted."
    514     },
    515     {
    516       "flag": "Dataset annotation quality borderline",
    517       "detail": "Cohen's Kappa 0.67 is 'substantial' but at lower end of acceptable. Only two annotators; evaluation limited to subset before using WizardLM as primary annotator."
    518     },
    519     {
    520       "flag": "Private data prevents reproduction",
    521       "detail": "CMU institutional data cannot be shared; results cannot be independently verified or reproduced on the same dataset."
    522     },
    523     {
    524       "flag": "Code/model availability unverified",
    525       "detail": "Abstract promises Github release but no URL provided; cannot verify code or fine-tuned models actually available."
    526     },
    527     {
    528       "flag": "Limited generalization evidence",
    529       "detail": "All evaluation on single CMU/LTI domain. No evidence approach generalizes to other domains or knowledge bases."
    530     }
    531   ],
    532   "cited_papers": [
    533     {
    534       "title": "Retrieval-augmented generation for large language models: A survey",
    535       "authors": "Gao et al.",
    536       "year": 2023,
    537       "relevance": "Foundational RAG survey; directly cited and core to paper's approach"
    538     },
    539     {
    540       "title": "A survey on hallucination in large language models: principles, taxonomy, challenges, and open questions",
    541       "authors": "Huang et al.",
    542       "year": 2023,
    543       "relevance": "Directly relevant to hallucination problem paper aims to mitigate"
    544     },
    545     {
    546       "title": "On the dangers of stochastic parrots: Can language models be too big?",
    547       "authors": "Bender et al.",
    548       "year": 2021,
    549       "relevance": "Foundational critique of LLM limitations and hallucination risks"
    550     },
    551     {
    552       "title": "Language models are few-shot learners",
    553       "authors": "Brown et al.",
    554       "year": 2020,
    555       "relevance": "GPT-3 foundational LLM work; establishes baseline capabilities"
    556     },
    557     {
    558       "title": "LLaMA 2: Open foundation and fine-tuned chat models",
    559       "authors": "Touvron et al.",
    560       "year": 2023,
    561       "relevance": "Core model used in paper; essential implementation reference"
    562     },
    563     {
    564       "title": "HaluEval-Wild: Evaluating hallucinations of language models in the wild",
    565       "authors": "Zhu et al.",
    566       "year": 2024,
    567       "relevance": "Recent benchmark for evaluating hallucinations in real-world scenarios"
    568     },
    569     {
    570       "title": "Survey on in-context learning",
    571       "authors": "Dong et al.",
    572       "year": 2022,
    573       "relevance": "Survey of in-context learning (ICL), a related technique the paper compares against RAG as an alternative approach"
    574     }
    575   ],
    576   "engagement_factors": {
    577     "practical_relevance": {
    578       "score": 2,
    579       "justification": "RAG systems are genuinely practical for domain-specific QA, but implementation is CMU-specific, data is private, and code availability unverified, limiting applicability to others."
    580     },
    581     "surprise_contrarian": {
    582       "score": 0,
    583       "justification": "RAG improving factual accuracy is well-established in literature; the negative result (fine-tuning hurting performance) is expected given dataset size and prompt drift."
    584     },
    585     "fear_safety": {
    586       "score": 1,
    587       "justification": "Paper addresses hallucination mitigation (a safety concern) through RAG augmentation, but does not raise or discuss AI risk concerns beyond hallucination."
    588     },
    589     "drama_conflict": {
    590       "score": 0,
    591       "justification": "Straightforward technical systems paper with no controversial claims, conflicts, or narrative tension."
    592     },
    593     "demo_ability": {
    594       "score": 1,
    595       "justification": "Approach is technically sound and reproducible in principle, but requires access to private CMU data and unverified code release, creating practical barriers."
    596     },
    597     "brand_recognition": {
    598       "score": 2,
    599       "justification": "Carnegie Mellon University is prestigious but paper appears to be student work rather than major research initiative; not from top-tier AI labs (OpenAI, DeepMind, Meta Research)."
    600     }
    601   },
    602   "hn_data": {
    603     "threads": [
    604       {
    605         "hn_id": "43451552",
    606         "title": "Blockchain with Proof of Quantum Work",
    607         "points": 5,
    608         "comments": 1,
    609         "url": "https://news.ycombinator.com/item?id=43451552",
    610         "created_at": "2025-03-23T08:24:58Z"
    611       },
    612       {
    613         "hn_id": "39301136",
    614         "title": "Ten Hard Problems in Artificial Intelligence We Must Get Right",
    615         "points": 4,
    616         "comments": 1,
    617         "url": "https://news.ycombinator.com/item?id=39301136",
    618         "created_at": "2024-02-08T12:28:48Z"
    619       },
    620       {
    621         "hn_id": "39173354",
    622         "title": "Black-Box Access Is Insufficient for Rigorous AI Audits",
    623         "points": 2,
    624         "comments": 1,
    625         "url": "https://news.ycombinator.com/item?id=39173354",
    626         "created_at": "2024-01-29T06:28:23Z"
    627       },
    628       {
    629         "hn_id": "43424742",
    630         "title": "Blockchain with Proof of Quantum Work",
    631         "points": 2,
    632         "comments": 0,
    633         "url": "https://news.ycombinator.com/item?id=43424742",
    634         "created_at": "2025-03-20T15:35:28Z"
    635       },
    636       {
    637         "hn_id": "40260848",
    638         "title": "Large Language Models for Data Annotation: A Survey",
    639         "points": 2,
    640         "comments": 0,
    641         "url": "https://news.ycombinator.com/item?id=40260848",
    642         "created_at": "2024-05-04T22:35:48Z"
    643       },
    644       {
    645         "hn_id": "41504752",
    646         "title": "Leveraging Large Language Models for Solving Rare MIP Challenges",
    647         "points": 2,
    648         "comments": 0,
    649         "url": "https://news.ycombinator.com/item?id=41504752",
    650         "created_at": "2024-09-10T19:45:16Z"
    651       },
    652       {
    653         "hn_id": "41499290",
    654         "title": "State and Action Factorization in Power Grids",
    655         "points": 2,
    656         "comments": 0,
    657         "url": "https://news.ycombinator.com/item?id=41499290",
    658         "created_at": "2024-09-10T10:25:47Z"
    659       },
    660       {
    661         "hn_id": "40690995",
    662         "title": "Rough Set Improved Therapy-Based Metaverse Assisting System",
    663         "points": 2,
    664         "comments": 0,
    665         "url": "https://news.ycombinator.com/item?id=40690995",
    666         "created_at": "2024-06-15T17:04:37Z"
    667       },
    668       {
    669         "hn_id": "39173902",
    670         "title": "AI Auditing: The Broken Bus on the Road to AI Accountability",
    671         "points": 1,
    672         "comments": 1,
    673         "url": "https://news.ycombinator.com/item?id=39173902",
    674         "created_at": "2024-01-29T08:04:17Z"
    675       },
    676       {
    677         "hn_id": "40046815",
    678         "title": "Exact analytical algorithm for solvent accessible surface area",
    679         "points": 1,
    680         "comments": 0,
    681         "url": "https://news.ycombinator.com/item?id=40046815",
    682         "created_at": "2024-04-15T23:30:21Z"
    683       }
    684     ],
    685     "top_points": 5,
    686     "total_points": 23,
    687     "total_comments": 4
    688   }
    689 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs