scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27462B)
      1 {
      2   "paper": {
      3     "title": "Lynx: An Open Source Hallucination Evaluation Model",
      4     "authors": [
      5       "Selvan Sunitha Ravi",
      6       "Bartosz Mielczarek",
      7       "Anand Kannappan",
      8       "Douwe Kiela",
      9       "Rebecca Qian"
     10     ],
     11     "year": 2024,
     12     "venue": "arXiv preprint",
     13     "arxiv_id": "2407.08488",
     14     "doi": ""
     15   },
     16   "checklist": {
     17     "artifacts": {
     18       "code_released": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The paper provides a GitHub repository URL: https://github.com/patronus-ai/Lynx-hallucination-detection (footnote 3). Models are released on HuggingFace (footnote 1) and the dataset HaluBench is also on HuggingFace (footnote 2)."
     22       },
     23       "data_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "HaluBench is released on HuggingFace (https://huggingface.co/datasets/PatronusAI/HaluBench). The paper also states: 'We are also releasing the training data, code and model generations on Github.' Source datasets (DROP, CovidQA, PubMedQA, HaluEval, RAGTruth, FinanceBench) are all publicly available."
     27       },
     28       "environment_specified": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "Appendix B describes the training setup (FSDP, flash attention, cosine scheduler, vLLM for inference, 8 H100s for 70B evaluation) but does not provide a requirements.txt, Dockerfile, or specific library versions sufficient to recreate the environment."
     32       },
     33       "reproduction_instructions": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "While code is released on GitHub, the paper itself does not contain step-by-step reproduction instructions or reference a README with commands to replicate the main experiments. The training setup in Appendix B provides hyperparameters but not a runnable reproduction guide."
     37       }
     38     },
     39     "statistical_methodology": {
     40       "confidence_intervals_or_error_bars": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "All results in Tables 3, 4, and 5 are reported as point estimates (e.g., '87.4% accuracy') with no confidence intervals, error bars, or uncertainty measures."
     44       },
     45       "significance_tests": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "The paper makes multiple comparative claims (e.g., 'LYNX outperforms GPT-4o') based solely on comparing accuracy percentages without any statistical significance tests."
     49       },
     50       "effect_sizes_reported": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "The paper reports effect sizes in context: 'LYNX (70B) outperformed GPT-4o by almost one percent accuracy on average,' 'LYNX (70B) is 8.3% more accurate than GPT-4o at identifying inaccurate responses in medical answers in PubMedQA,' and 'average increase of 27.6% across all tasks' over GPT-3.5-Turbo. Baselines and absolute numbers are provided in Table 3."
     54       },
     55       "sample_size_justified": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "No justification is given for the sample sizes used. HaluBench consists of 15k samples (1k from each of several datasets), the training set is 2400 samples, and human annotation covers 200 samples, but no rationale is provided for why these specific sizes were chosen."
     59       },
     60       "variance_reported": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No variance, standard deviation, or spread across runs is reported. Results appear to be from single evaluation runs. The paper does not mention multiple seeds or repeated experiments."
     64       }
     65     },
     66     "evaluation_design": {
     67       "baselines_included": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "Table 3 includes comparisons against multiple baselines: GPT-4o, GPT-4-Turbo, GPT-3.5-Turbo, Claude-3-Sonnet, Claude-3-Haiku, RAGAS Faithfulness, Mistral-Instruct-7B, Llama-3-Instruct-8B, and Llama-3-Instruct-70B."
     71       },
     72       "baselines_contemporary": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The baselines include GPT-4o, Claude-3-Sonnet, and Llama-3-Instruct models, which were all contemporary at the time of the paper's publication (July 2024). RAGAS is also a recent RAG evaluation framework."
     76       },
     77       "ablation_study": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "The paper includes an ablation comparing LYNX (70B) to its base model Llama-3-Instruct-70B (Table 3), showing the effect of fine-tuning. Table 4 ablates the effect of adding RAGTruth training data. Table 5 ablates the Llama-2-Chat-13B model before and after fine-tuning."
     81       },
     82       "multiple_metrics": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "The paper reports only accuracy as the evaluation metric across all tasks and models. No precision, recall, F1, or other metrics are reported despite the task being binary classification where per-class performance would be informative."
     86       },
     87       "human_evaluation": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Human annotation was performed on a subset of HaluBench: 'Expert annotators manually checked the original and perturbed answers as well as reasoning provided for each example' (Section 3.2). Agreement with human annotators is reported in Table 2 (0.90-0.96 across domains, n=50 per domain)."
     91       },
     92       "held_out_test_set": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "The training set (2400 samples from RAGTruth, DROP, CovidQA, PubMedQA train splits) and the evaluation set (HaluBench) are explicitly separated. HaluBench includes tasks from HaluEval and FinanceBench that were not used in training. For overlapping datasets (DROP, CovidQA, PubMedQA), the paper states training samples come from the 'train split' while evaluation uses separately constructed samples."
     96       },
     97       "per_category_breakdown": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Table 3 provides per-dataset breakdowns across all 6 component datasets of HaluBench (HaluEval, RAGTruth, FinanceBench, DROP, CovidQA, PubMedQA) as well as an overall score."
    101       },
    102       "failure_cases_discussed": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 6 (Limitations and Future Work) discusses several categories of failures: failures outside of LLM generation (retrieval failures), multilingual coverage gaps, limitations to QA tasks only, and lack of factuality assessment. Appendix B.3 notes that LYNX (70B) performed worse than Llama-3-Instruct-70B on the RAGTruth test split specifically."
    106       },
    107       "negative_results_reported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Appendix B.3 reports that LYNX (70B) underperformed Llama-3-Instruct-70B on RAGTruth (80.2% vs 83.8%), and that adding RAGTruth training data improved RAGTruth performance but caused 'a slight decrease in performance on the other splits.' Table 5 reports Llama-2-Chat-13B's near-zero baseline performance."
    111       }
    112     },
    113     "claims_and_evidence": {
    114       "abstract_claims_supported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The abstract claims LYNX 'outperforms GPT-4o, Claude-3-Sonnet and closed and open-source LLM-as-a-judge models on HaluBench.' Table 3 shows LYNX (70B) at 87.4% overall vs GPT-4o at 86.5% and Claude-3-Sonnet at 78.8%, supporting this claim. The benchmark size claim (15k samples) is supported by the dataset description."
    118       },
    119       "causal_claims_justified": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The paper's main causal claim is that fine-tuning on hallucination detection data improves performance. This is supported by controlled comparisons: LYNX (70B) vs. Llama-3-Instruct-70B (same base model, with/without fine-tuning) and LYNX (8B) vs. Llama-3-Instruct-8B. The ablation in Table 4 tests the effect of adding RAGTruth data. These are single-variable manipulations."
    123       },
    124       "generalization_bounded": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "The abstract and title claim LYNX is a 'SOTA hallucination detection LLM' broadly, but evaluation is limited to HaluBench, which the authors themselves constructed. The paper claims LYNX works for 'real-world hallucination scenarios' but only tests on QA-format tasks in English. Section 6 acknowledges the multilingual limitation but the title and abstract are not bounded to the tested setting."
    128       },
    129       "alternative_explanations_discussed": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper does not discuss alternative explanations for LYNX's superior performance. For instance, LYNX is trained on data from the same distribution as HaluBench (same source datasets, same perturbation approach), which could explain the performance advantage. The possibility that the benchmark favors LYNX by construction is not addressed."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "LYNX is based on 'Llama-3-70B-Instruct' and 'Llama-3-8B-Instruct' which are specific enough. However, baseline models are specified only as 'GPT-4o', 'GPT-4-Turbo', 'GPT-3.5-Turbo', 'Claude-3-Sonnet', 'Claude-3-Haiku' without API version/snapshot dates. GPT-4o behavior changes across versions, and no snapshot date is provided."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "Appendix A provides the full prompts used for data generation (perturbation prompt, reasoning generation prompts for both perturbed and original samples) and the evaluation prompt used for both instruction fine-tuning and model evaluation. These are actual prompt text, not just descriptions."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Appendix B reports: learning rate 5.0e-7, batch size 256, 3 epochs, cosine scheduler with 100 warmup steps, lionw optimizer with beta1=0.9 and beta2=0.95, gradient clipping threshold=1.0, FSDP FULL_SHARD, activation checkpointing, greedy decoding, max_new_tokens=600. GPT-4o perturbation generation used temperature=0."
    150       },
    151       "scaffolding_described": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No agentic scaffolding is used. LYNX is a straightforward fine-tuned model that takes an input and produces a JSON output with reasoning and score."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 3.2 describes the data construction pipeline in detail: source datasets identified, sampling procedure (500 examples from each dataset, 500 additional perturbed examples), perturbation generation method using GPT-4o, and the formal notation for the perturbation process. Table 2 reports HaluBench composition."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 6 'Limitations and Future Work' provides a substantive discussion across five sub-areas: failures outside LLM generation, multilingual coverage, summarization tasks, truthfulness/world knowledge, and NLI applications."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 6 identifies specific threats: LYNX cannot handle retrieval failures in RAG pipelines, English-only evaluation limits real-world applicability, QA-only focus misses summarization hallucinations, and conflicting information in source documents presents a challenge. These are specific to THIS system and evaluation."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 3 explicitly scopes to intrinsic hallucination evaluation only: 'we consider factuality assessments out of scope for this work.' Section 6 states LYNX does not handle retrieval failures, non-English inputs, or summarization tasks. The definition in Section 3.1 explicitly excludes relevance assessment and correctness assessment."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "HaluBench is released on HuggingFace (https://huggingface.co/datasets/PatronusAI/HaluBench). Training data and model generations are released on GitHub. The underlying source datasets (DROP, CovidQA, etc.) are publicly available."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 3.2 describes in detail how HaluBench was constructed: which source datasets were used, how many samples were taken from each, how hallucinated examples were generated using GPT-4o perturbation, and how human annotation was performed. The perturbation method is formally defined with equation (1)."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "The paper mentions 'Expert annotators' for human annotation (Section 3.2) but does not describe who these annotators were, how they were recruited, their qualifications, or whether they are employees of Patronus AI. This is relevant given the potential for bias."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 3.2 documents the pipeline: source datasets identified, random sampling of 500 per dataset, perturbation generation via GPT-4o, construction of balanced positive/negative sets, human annotation of 200 samples for quality verification. Training data construction (Section 3.3) documents 600 examples per subtask with 300 perturbed."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "No explicit funding disclosure or grants section is present. Section 7 (Acknowledgements) thanks partners Nvidia and Nomic AI 'for their support' but does not specify what form this support took (financial, compute, etc.)."
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Author affiliations are clearly listed: Patronus AI (4 of 5 authors), Contextual AI, and Stanford University. Four of the authors (Ravi, Mielczarek, Kannappan, Qian) work at Patronus AI, the company that developed and is releasing LYNX as a product."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "The majority of authors are from Patronus AI, which develops AI evaluation tools commercially. LYNX is released under the Patronus AI brand on HuggingFace. The company has a direct commercial interest in demonstrating that their hallucination detection model outperforms alternatives. Nvidia and Nomic AI are acknowledged as 'partners' without further specification."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests or financial interests statement is present in the paper. Four of five authors work at Patronus AI, a company that commercially benefits from LYNX's reputation, but this conflict is not explicitly declared."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "The paper does not state the training data cutoff for the base Llama-3 models or for GPT-4o, GPT-4-Turbo, GPT-3.5-Turbo, Claude-3-Sonnet, or Claude-3-Haiku that are used as baselines. This is relevant because some evaluation datasets (HaluEval, DROP) have been publicly available for years."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No discussion of potential train/test overlap. The base Llama-3 models may have seen HaluEval, DROP, CovidQA, PubMedQA, or FinanceBench data during pre-training since these are publicly available datasets. Similarly, GPT-4o and Claude-3 models may have seen these benchmarks. This is not addressed."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "HaluBench is constructed from publicly available datasets (HaluEval published 2023, DROP published 2019, etc.) that were likely in the training data of models being evaluated. The paper does not discuss whether the models being benchmarked may have seen these datasets during pre-training, creating a contamination risk especially for the non-perturbed examples."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No human subjects study was conducted. The human annotation was for data quality verification, not a human subjects experiment."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human subjects study was conducted. The human annotation task does not constitute a human subjects study."
    250       },
    251       "demographics_reported": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human subjects study was conducted."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human subjects study was conducted."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human subjects study was conducted."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human subjects study was conducted."
    270       },
    271       "attrition_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human subjects study was conducted."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": false,
    281         "justification": "Despite the paper emphasizing LYNX (8B) as 'a fraction of the size and cost of closed source LLMs,' no actual inference costs, latency numbers, or tokens consumed are reported. The paper mentions using 8 H100s for 70B inference but does not quantify cost per example."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "The paper states training used '32 Nvidia H100 GPUs' but does not report total training time, GPU hours, or cost. Evaluation used '8 H100s' but no wall-clock time or total compute budget is given."
    287       }
    288     }
    289   },
    290   "claims": [
    291     {
    292       "claim": "LYNX (70B) outperforms GPT-4o and all other closed and open-source models on HaluBench with 87.4% overall accuracy.",
    293       "evidence": "Table 3 shows LYNX (70B) at 87.4% overall vs GPT-4o at 86.5%, GPT-4-Turbo at 85.0%, Claude-3-Sonnet at 78.8%, and Llama-3-Instruct-70B at 80.1%.",
    294       "supported": "moderate"
    295     },
    296     {
    297       "claim": "LYNX (70B) is 8.3% more accurate than GPT-4o at identifying inaccurate responses in medical answers in PubMedQA.",
    298       "evidence": "Table 3 shows LYNX (70B) at 90.4% vs GPT-4o at 82.1% on PubMedQA, a difference of 8.3 percentage points.",
    299       "supported": "moderate"
    300     },
    301     {
    302       "claim": "LYNX is the first open-source hallucination detection model that outperforms GPT-4o and closed-source LLMs-as-Judge.",
    303       "evidence": "Table 3 shows LYNX (70B) at 87.4% vs GPT-4o at 86.5% on HaluBench. The 'first' claim is not verifiable from the evidence provided.",
    304       "supported": "weak"
    305     },
    306     {
    307       "claim": "Fine-tuning Llama-3-Instruct-70B improves hallucination detection accuracy by 7.8% on average.",
    308       "evidence": "Table 3 shows Llama-3-Instruct-70B at 80.1% and LYNX (70B) at 87.4%, but the difference is 7.3 percentage points, not 7.8% as stated. Section 4.1 states '7.8% increase in average accuracy.'",
    309       "supported": "moderate"
    310     },
    311     {
    312       "claim": "HaluBench shows high agreement with human annotations (0.94 across 200 samples).",
    313       "evidence": "Table 2 reports per-dataset agreement: DROP 0.92, FinanceBench 0.90, CovidQA 0.96, PubMedQA 0.96, n=50 per dataset (200 total).",
    314       "supported": "strong"
    315     }
    316   ],
    317   "methodology_tags": [
    318     "benchmark-eval"
    319   ],
    320   "key_findings": "LYNX, a fine-tuned Llama-3-70B-Instruct model, achieves 87.4% accuracy on HaluBench, a new 15k-sample hallucination detection benchmark, narrowly outperforming GPT-4o (86.5%). The largest gains are in domain-specific tasks like PubMedQA (+8.3% over GPT-4o) and FinanceBench. HaluBench is constructed using semantic perturbation of answers from existing QA datasets and shows 0.94 agreement with human annotators across 200 verified samples.",
    321   "red_flags": [
    322     {
    323       "flag": "Evaluating on own benchmark",
    324       "detail": "LYNX is evaluated exclusively on HaluBench, which was constructed by the same team. The training data and benchmark share the same source datasets (DROP, CovidQA, PubMedQA) and the same perturbation methodology (GPT-4o), giving LYNX a potential distribution advantage over baselines that did not have access to similarly distributed training data."
    325     },
    326     {
    327       "flag": "Company evaluating its own product",
    328       "detail": "Four of five authors are from Patronus AI, which commercially develops AI evaluation tools. LYNX is released under the Patronus AI brand. The paper does not include a conflicts-of-interest statement despite the clear commercial interest in demonstrating superior performance."
    329     },
    330     {
    331       "flag": "No uncertainty quantification",
    332       "detail": "All results are single-run point estimates with no confidence intervals, error bars, or significance tests. The performance gap between LYNX (70B) at 87.4% and GPT-4o at 86.5% is less than 1 percentage point, which may not be statistically significant."
    333     },
    334     {
    335       "flag": "Single metric (accuracy) on balanced binary classification",
    336       "detail": "Only accuracy is reported despite binary classification. Precision, recall, F1, or per-class accuracy would reveal whether LYNX is better at detecting hallucinations, confirming faithfulness, or both. The balanced dataset design masks potential class-specific weaknesses."
    337     },
    338     {
    339       "flag": "Contamination risk unaddressed",
    340       "detail": "HaluBench uses publicly available datasets (DROP from 2019, HaluEval from 2023) that could be in the training data of GPT-4o and other baseline models. The non-perturbed examples from these datasets are especially vulnerable to contamination. This is not discussed."
    341     },
    342     {
    343       "flag": "Numerical discrepancy in claims",
    344       "detail": "Section 4.1 claims fine-tuning produces a '7.8% increase in average accuracy' but the difference between Llama-3-Instruct-70B (80.1%) and LYNX (70B) (87.4%) is 7.3 percentage points, not 7.8%."
    345     }
    346   ],
    347   "cited_papers": [
    348     {
    349       "title": "Ragas: Automated evaluation of retrieval augmented generation",
    350       "authors": ["Shahul Es", "Jithin James", "Luis Espinosa-Anke", "Steven Schockaert"],
    351       "year": 2023,
    352       "relevance": "Proposes an LLM-based evaluation framework for RAG systems, directly compared as a baseline in this paper."
    353     },
    354     {
    355       "title": "HaluEval: A large-scale hallucination evaluation benchmark for large language models",
    356       "authors": ["Junyi Li", "Xiaoxue Cheng", "Wayne Xin Zhao", "Jian-Yun Nie", "Ji-Rong Wen"],
    357       "year": 2023,
    358       "arxiv_id": "2305.11747",
    359       "relevance": "Major hallucination evaluation benchmark used as a component of HaluBench and as a baseline dataset."
    360     },
    361     {
    362       "title": "RAGTruth: A hallucination corpus for developing trustworthy retrieval-augmented language models",
    363       "authors": ["Yuanhao Wu", "Juno Zhu", "Siliang Xu", "Kashun Shum", "Cheng Niu", "Randy Zhong", "Juntong Song", "Tong Zhang"],
    364       "year": 2023,
    365       "arxiv_id": "2401.00396",
    366       "relevance": "Hallucination corpus with word-level annotations used both as training data and evaluation component in HaluBench."
    367     },
    368     {
    369       "title": "ARES: An automated evaluation framework for retrieval-augmented generation systems",
    370       "authors": ["Jon Saad-Falcon", "Omar Khattab", "Christopher Potts", "Matei Zaharia"],
    371       "year": 2024,
    372       "relevance": "Automated RAG evaluation framework using few-shot LLM judges, a key related method discussed in the paper."
    373     },
    374     {
    375       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    376       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    377       "year": 2024,
    378       "relevance": "Foundational work on using LLMs as evaluation judges, the paradigm LYNX operates within."
    379     },
    380     {
    381       "title": "Prometheus: Inducing fine-grained evaluation capability in language models",
    382       "authors": ["Seungone Kim", "Jamin Shin", "Yejin Cho", "Joel Jang"],
    383       "year": 2023,
    384       "relevance": "Fine-tuned LLM evaluation model, methodologically similar to LYNX's approach of training specialized judge models."
    385     },
    386     {
    387       "title": "JudgeLM: Fine-tuned large language models are scalable judges",
    388       "authors": ["Lianghui Zhu", "Xinggang Wang", "Xinlong Wang"],
    389       "year": 2023,
    390       "arxiv_id": "2310.17631",
    391       "relevance": "Another fine-tuned LLM judge model demonstrating the approach of specializing models for evaluation tasks."
    392     },
    393     {
    394       "title": "SelfCheckGPT: Zero-resource black-box hallucination detection for generative large language models",
    395       "authors": ["Potsawee Manakul", "Adian Liusie", "Mark JF Gales"],
    396       "year": 2023,
    397       "arxiv_id": "2303.08896",
    398       "relevance": "Zero-resource hallucination detection method relevant to the survey's coverage of LLM evaluation approaches."
    399     },
    400     {
    401       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    402       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    403       "year": 2022,
    404       "relevance": "Foundational CoT prompting technique used in LYNX's training to generate reasoning chains."
    405     },
    406     {
    407       "title": "CRAG -- Comprehensive RAG Benchmark",
    408       "authors": ["Xiao Yang", "Kai Sun", "Hao Xin"],
    409       "year": 2024,
    410       "relevance": "Industry-focused RAG benchmark with domain-specific tasks, contrasted with HaluBench's approach."
    411     },
    412     {
    413       "title": "Long-form factuality in large language models",
    414       "authors": ["Jerry Wei", "Chengrun Yang", "Xinying Song"],
    415       "year": 2024,
    416       "relevance": "Evaluates factuality in long-form LLM outputs using search augmentation, related to hallucination detection."
    417     },
    418     {
    419       "title": "TruthfulQA: Measuring how models mimic human falsehoods",
    420       "authors": ["Stephanie Lin", "Jacob Hilton", "Owain Evans"],
    421       "year": 2022,
    422       "relevance": "Benchmark measuring LLM truthfulness on common misconceptions, related to hallucination evaluation."
    423     }
    424   ]
    425 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs