scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24486B)
      1 {
      2   "paper": {
      3     "title": "ARES: An Automated Evaluation Framework for Retrieval-Augmented Generation Systems",
      4     "authors": [
      5       "Jon Saad-Falcon",
      6       "Omar Khattab",
      7       "Christopher Potts",
      8       "Matei Zaharia"
      9     ],
     10     "year": 2023,
     11     "venue": "arXiv",
     12     "arxiv_id": "2311.09476"
     13   },
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper states 'We make our code and datasets publicly available on Github' in both the abstract and conclusion (Sections 1 and 6). This constitutes a release commitment with a platform specified."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses publicly available datasets (KILT, SuperGLUE, AIS) and states that the authors' own datasets are publicly available on GitHub. The evaluation uses standard public benchmarks (NQ, HotpotQA, FEVER, WoW, MultiRC, ReCoRD)."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "While the paper mentions specific models (DeBERTa-v3-Large, FLAN-T5 XXL) and that 'GPUs with about 32GB of memory' were required (Section 7), there is no requirements.txt, Dockerfile, or detailed environment specification listing library versions."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper describes the ARES pipeline at a high level (Section 3) and provides prompts in the appendix, but there are no step-by-step reproduction instructions, README with commands, or scripts to replicate experiments."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "ARES uses prediction-powered inference (PPI) to produce 95% confidence intervals for RAG system scores. Section 3.3 describes this methodology, and Figures 2-3 show confidence intervals. Table 6 reports PPI ranges."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper compares ARES vs. RAGAS and GPT-3.5 judges using Kendall's tau differences but does not perform statistical significance tests on these differences. Claims like 'ARES is 59.3 percentage points higher' are raw comparisons without significance tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports absolute differences in Kendall's tau (e.g., '0.065 higher for context relevance and 0.132 higher for answer relevance than RAGAS') and accuracy point differences (e.g., '59.3 and 14.4 percentage points') with baseline context throughout Section 5."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Table 3 provides an analysis of PPI labeled count vs. ARES efficacy, systematically testing annotation sizes from 25 to 400 datapoints and finding '150 is the minimum number required.' This justifies the chosen sample sizes."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Results are reported as single-run Kendall's tau values in Tables 1, 3, 4, 5, and 6. There is no mention of variance across multiple runs, standard deviations, or confidence intervals for the ranking correlations themselves."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper compares ARES against RAGAS (version 0.0.18), a few-shot GPT-3.5 judge, and a sampled-annotations baseline across all experiments (Tables 1, 5)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "RAGAS (2023) and GPT-3.5 were contemporary baselines at the time of publication. The paper also compares against sampled human annotations, which represents the established gold standard approach."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper includes ablations: ARES LLM Judge without PPI vs. with PPI (Table 1), varying PPI labeled counts from 25-400 (Table 3), GPT-4 labels vs. human labels (Table 4), and cross-domain transfer experiments (Table 6)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper uses Kendall's tau for ranking correlation and accuracy for prediction correctness, evaluated across three dimensions: context relevance, answer faithfulness, and answer relevance."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The ARES framework explicitly uses human-annotated validation sets (150-300 datapoints) as ground truth for PPI. The evaluation compares ARES predictions against these human judgments, and accuracy against human labels is a core metric."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper uses validation subsets from KILT and SuperGLUE datasets to create mock RAG systems with known ground truth, and separately tests on real RAG systems (Section 5.3). The human preference validation set is distinct from the training data used for judge fine-tuning."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down per dataset (NQ, HotpotQA, WoW, FEVER, MultiRC, ReCoRD) and per evaluation dimension (context relevance, answer relevance, answer faithfulness) in Tables 1, 5, and 6."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 5.4 discusses limitations of cross-domain applications, showing failures in cross-lingual transfer (Kendall's tau 0.33 for XGLUE), text-to-code transfer (0.28 for CodeSearchNet), and extraction tasks (0.38 for T-REx)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that GPT-4 labels underperform human labels (Table 4), that cross-lingual/cross-modality transfer fails significantly (Section 5.4), and that below 100-150 annotation points ARES cannot meaningfully distinguish RAG systems (Table 3)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims ARES 'accurately evaluates RAG systems while using only a few hundred human annotations' and 'judges remain effective across domain shifts.' Both are supported by Tables 1, 5 (accuracy and Kendall's tau) and Table 6 (cross-domain results)."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper makes causal claims about PPI improving ranking accuracy, supported by controlled ablations comparing ARES with and without PPI (Tables 1, 5). The ablation design (single-variable manipulation) is adequate for these claims."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 5.4 explicitly bounds generalization, stating ARES fails for cross-lingual, text-to-code, and extraction tasks. Section 7 (Limitations) notes all datasets are in English and specialized domains may require expert annotators."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not substantively discuss alternative explanations for why ARES outperforms baselines. For example, the mock RAG system evaluation design (where ground truth is artificial) could inflate results compared to real-world scenarios, but this is not discussed."
    131       }
    132     },
    133     "setup_transparency": {
    134       "model_versions_specified": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Section 4.1 specifies 'gpt-3.5-turbo-16k, version 10/23', 'DeBERTa-v3-Large', 'FLAN-T5 XXL', 'text-embedding-ada-002', and 'version 0.0.18 of RAGAS'. GPT-4 is mentioned without version but is used only for an exploratory comparison."
    138       },
    139       "prompts_provided": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The appendix (Sections A.2-A.6) provides full prompt templates for context relevance, answer faithfulness, answer relevance scoring, and synthetic query/answer generation, including the actual text used with placeholder notation for few-shot examples."
    143       },
    144       "hyperparameters_reported": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Section A.1 reports fine-tuning configuration: cross-entropy loss, Adam optimizer, 5e-6 learning rate, 32 batch size, 0.1 dropout, linear warmup and decay schedule, and early stopping after 3 epochs with no improvement."
    148       },
    149       "scaffolding_described": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The ARES pipeline is described in detail in Section 3: three stages (synthetic data generation, judge fine-tuning, PPI-based ranking), with each stage's inputs, outputs, and methodology clearly specified. Figure 1 provides a visual overview."
    153       },
    154       "data_preprocessing_documented": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section 3.1 documents synthetic data generation and filtering (queries that cannot retrieve their source passage are filtered out). Section 4.2 describes how mock RAG systems were constructed by sampling positive/negative examples from KILT and SuperGLUE validation sets."
    158       }
    159     },
    160     "limitations_and_scope": {
    161       "limitations_section_present": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 7 is a dedicated 'Limitations' section discussing annotation requirements, hardware costs, and English-only evaluation."
    165       },
    166       "threats_to_validity_specific": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7 discusses specific threats: annotator expertise requirements for specialized domains, GPU memory requirements (32GB), and that all datasets are English-only. Section 5.4 discusses specific cross-domain failure modes."
    170       },
    171       "scope_boundaries_stated": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 5.4 explicitly states what ARES does NOT handle: cross-lingual transfer, text-to-code, and entity extraction tasks. Section 7 states results apply only to English datasets. These are specific boundaries on the claims."
    175       }
    176     },
    177     "data_integrity": {
    178       "raw_data_available": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The paper uses publicly available benchmark datasets (KILT, SuperGLUE, AIS) and states that code and datasets are available on GitHub, enabling independent verification of the results."
    182       },
    183       "data_collection_described": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 4.2 describes in detail how datasets were selected, how mock RAG systems were constructed (with success rates from 70% to 90% in 2.5% increments), and how positive/negative examples were sampled."
    187       },
    188       "recruitment_methods_described": {
    189         "applies": false,
    190         "answer": false,
    191         "justification": "No human participants were recruited for a study, so this item does not apply. The paper mentions the human annotations used for PPI validation but does not describe how the annotators were recruited; it primarily uses standard benchmark datasets."
    192       },
    193       "data_pipeline_documented": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 3 documents the full pipeline: passage set input -> synthetic query/answer generation with filtering -> judge fine-tuning with contrastive learning -> PPI-based scoring with confidence intervals. Each transformation step is described."
    197       }
    198     },
    199     "conflicts_of_interest": {
    200       "funding_disclosed": {
    201         "applies": true,
    202         "answer": false,
    203         "justification": "No funding source or acknowledgments section listing grants or sponsors was found in the paper."
    204       },
    205       "affiliations_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Author affiliations are clearly listed: Stanford University (Saad-Falcon, Khattab, Potts) and Databricks/UC Berkeley (Zaharia). The footnote notes the project started during a research internship at Databricks."
    209       },
    210       "funder_independent_of_outcome": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "With no funding disclosure, independence cannot be assessed. Notably, Matei Zaharia is affiliated with Databricks, which has a commercial interest in RAG systems and evaluation tooling. This potential conflict is not discussed."
    214       },
    215       "financial_interests_declared": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No competing interests statement or financial interests declaration was found. Matei Zaharia co-founded Databricks, which has commercial interests in the RAG evaluation space, but this is not disclosed in a formal conflicts statement."
    219       }
    220     },
    221     "contamination": {
    222       "training_cutoff_stated": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "The paper uses GPT-3.5 and GPT-4 for evaluation and labeling but does not state the training data cutoff dates for these models. The KILT and SuperGLUE benchmarks are public and predate these models' training."
    226       },
    227       "train_test_overlap_discussed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper does not discuss whether GPT-3.5 or GPT-4 may have seen the KILT/SuperGLUE evaluation data during training. Since these are public benchmarks, contamination risk exists but is not addressed."
    231       },
    232       "benchmark_contamination_addressed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "KILT (2021) and SuperGLUE (2019) predate GPT-3.5 and GPT-4 training. The paper does not discuss whether these benchmarks may have been in the training data, which could affect the GPT-3.5 judge baseline comparison."
    236       }
    237     },
    238     "human_studies": {
    239       "pre_registered": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants study was conducted. The human annotations used for PPI are a validation set, not a human subjects study."
    243       },
    244       "irb_or_ethics_approval": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human subjects study was conducted."
    248       },
    249       "demographics_reported": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants study was conducted."
    253       },
    254       "inclusion_exclusion_criteria": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants study was conducted."
    258       },
    259       "randomization_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants study was conducted."
    263       },
    264       "blinding_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants study was conducted."
    268       },
    269       "attrition_reported": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants study was conducted."
    273       }
    274     },
    275     "cost_and_practicality": {
    276       "inference_cost_reported": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "Section 7 mentions GPUs with 32GB memory are needed and fine-tuning takes 'several hours,' but no specific inference costs, API costs, tokens consumed, or per-example costs are reported for running ARES on the benchmarks."
    280       },
    281       "compute_budget_stated": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "The paper mentions 'GPUs with about 32GB of memory' and 'several hours for fine-tuning and generation' (Section 7) but does not quantify total GPU hours, API spend, or hardware specifications used for the experiments."
    285       }
    286     }
    287   },
    288   "claims": [
    289     {
    290       "claim": "ARES beats RAGAS by 59.3 and 14.4 percentage points on average for context relevance and answer relevance evaluation accuracy, respectively, across KILT and SuperGLUE datasets.",
    291       "evidence": "Table 1 shows per-dataset accuracy comparisons. RAGAS context relevance accuracy ranges from 15.0% to 36.4% while ARES ranges from 67.8% to 92.3%. Answer relevance accuracy for RAGAS ranges from 69.2% to 77.8% while ARES ranges from 78.5% to 97.2%.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "ARES achieves Kendall's tau 0.065 higher for context relevance and 0.132 higher for answer relevance than RAGAS on average for ranking pseudo RAG systems.",
    296       "evidence": "Table 1 provides Kendall's tau values across 6 datasets for both ARES and RAGAS. The averages match the stated differences. ARES consistently achieves tau >= 0.78 while RAGAS drops as low as 0.44.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "ARES requires 78% fewer annotations than the sampled annotations baseline while achieving higher accuracy.",
    301       "evidence": "Section 5.1 states ARES uses 300 annotations total vs. sampled annotations using 150 per mock system (1,350 total). Table 1 shows ARES achieves higher Kendall's tau on average.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "ARES predicts within 2.5 percentage points of ground truth for answer hallucination rates on the AIS benchmark.",
    306       "evidence": "Table 2 shows ARES split prediction of 0.478 vs. correct 0.458 for WoW (2.0 point difference) and 0.835 vs. 0.859 for CNN/DM (2.4 point difference).",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "ARES judges remain effective across domain shifts for KILT and SuperGLUE tasks.",
    311       "evidence": "Table 6 shows cross-domain Kendall's tau values ranging from 0.78 to 1.0 across all tested domain shifts within KILT/SuperGLUE. However, Section 5.4 shows failures for cross-lingual (0.33), text-to-code (0.28), and extraction (0.38) tasks.",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "PPI consistently improves ranking prediction accuracy of the fine-tuned LLM judge.",
    316       "evidence": "Table 1 shows ARES (with PPI) achieves higher Kendall's tau than ARES LLM Judge (without PPI) in 10 out of 12 dataset-metric combinations, with equal performance in the remaining 2.",
    317       "supported": "strong"
    318     }
    319   ],
    320   "methodology_tags": [
    321     "benchmark-eval"
    322   ],
    323   "key_findings": "ARES is an automated evaluation framework for RAG systems that fine-tunes lightweight LM judges (DeBERTa-v3-Large) on synthetic data and uses prediction-powered inference (PPI) to provide confidence intervals for RAG system scoring. Across eight knowledge-intensive tasks in KILT, SuperGLUE, and AIS, ARES outperforms RAGAS by 59.3 and 14.4 percentage points on context relevance and answer relevance evaluation accuracy, respectively, while requiring only ~150-300 human annotations. The fine-tuned judges generalize across related domains within the same benchmark family but fail significantly for cross-lingual, text-to-code, and entity extraction transfers.",
    324   "red_flags": [
    325     {
    326       "flag": "Artificial evaluation setup may inflate results",
    327       "detail": "The primary evaluation (Tables 1, 3, 4) uses mock RAG systems created by artificially mixing positive and negative examples at known ratios (70%-90% success rates). This controlled setup may not reflect the difficulty of evaluating real-world RAG systems where errors are more subtle. The real RAG system evaluation (Table 5) uses fewer systems and datasets."
    328     },
    329     {
    330       "flag": "Potential conflict of interest not disclosed",
    331       "detail": "Matei Zaharia is co-founder and CTO of Databricks, which has commercial interests in RAG evaluation and LLM infrastructure. The paper's footnote notes the project started during a research internship at Databricks, but no formal conflicts of interest statement is provided."
    332     },
    333     {
    334       "flag": "No variance or reproducibility analysis",
    335       "detail": "All results are reported as single-run numbers. There is no analysis of variance across different random seeds for synthetic data generation, judge fine-tuning, or PPI confidence interval estimation. The stability of the results is unknown."
    336     },
    337     {
    338       "flag": "Benchmark contamination risk unaddressed",
    339       "detail": "GPT-3.5 and GPT-4 are used as baselines and for label generation, but the paper does not discuss whether these models may have seen the KILT and SuperGLUE evaluation data during training, which could bias the comparison."
    340     }
    341   ],
    342   "cited_papers": [
    343     {
    344       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    345       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    346       "year": 2023,
    347       "arxiv_id": "2306.05685",
    348       "relevance": "Foundational work on using LLMs as evaluators, directly relevant to automated evaluation methodology."
    349     },
    350     {
    351       "title": "Prediction-Powered Inference",
    352       "authors": ["Anastasios N. Angelopoulos", "Stephen Bates", "Clara Fannjiang", "Michael I. Jordan", "Tijana Zrnic"],
    353       "year": 2023,
    354       "relevance": "Core statistical method used by ARES for providing confidence intervals; relevant to evaluation methodology rigor."
    355     },
    356     {
    357       "title": "Benchmarking Large Language Models in Retrieval-Augmented Generation",
    358       "authors": ["Jiawei Chen", "Hongyu Lin", "Xianpei Han", "Le Sun"],
    359       "year": 2023,
    360       "arxiv_id": "2309.01431",
    361       "relevance": "Benchmark evaluation of LLMs in RAG settings, directly relevant to understanding LLM capability in retrieval-augmented tasks."
    362     },
    363     {
    364       "title": "FActScore: Fine-grained Atomic Evaluation of Factual Precision in Long Form Text Generation",
    365       "authors": ["Sewon Min", "Kalpesh Krishna", "Xinxi Lyu"],
    366       "year": 2023,
    367       "relevance": "Alternative approach to evaluating LLM factuality, relevant to understanding evaluation methodology in the LLM space."
    368     },
    369     {
    370       "title": "Language Models are Few-Shot Learners",
    371       "authors": ["Tom B. Brown", "Benjamin Mann", "Nick Ryder"],
    372       "year": 2020,
    373       "relevance": "Foundational GPT-3 paper establishing few-shot learning capabilities used as baseline approach in ARES evaluation."
    374     },
    375     {
    376       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    377       "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"],
    378       "year": 2020,
    379       "relevance": "Original RAG paper defining the retrieval-augmented generation paradigm that ARES evaluates."
    380     },
    381     {
    382       "title": "G-Eval: NLG Evaluation Using GPT-4 with Better Human Alignment",
    383       "authors": ["Yang Liu", "Dan Iter", "Yichong Xu"],
    384       "year": 2023,
    385       "arxiv_id": "2303.16634",
    386       "relevance": "Related LLM-based evaluation approach for natural language generation, relevant to automated evaluation methodology."
    387     },
    388     {
    389       "title": "Calibrating LLM-based Evaluator",
    390       "authors": ["Yuxuan Liu", "Tianchi Yang", "Shaohan Huang"],
    391       "year": 2023,
    392       "arxiv_id": "2309.13308",
    393       "relevance": "Addresses calibration of LLM-based evaluators, directly relevant to evaluation reliability and methodology."
    394     },
    395     {
    396       "title": "ColBERTv2: Effective and Efficient Retrieval via Lightweight Late Interaction",
    397       "authors": ["Keshav Santhanam", "Omar Khattab", "Jon Saad-Falcon"],
    398       "year": 2022,
    399       "relevance": "Retrieval system used as one of the RAG configurations evaluated in ARES experiments."
    400     },
    401     {
    402       "title": "Augmented Language Models: A Survey",
    403       "authors": ["Grégoire Mialon", "Roberto Dessì", "Maria Lomeli"],
    404       "year": 2023,
    405       "relevance": "Survey of augmented language models including RAG approaches, relevant to understanding the broader landscape of LLM-based systems."
    406     }
    407   ]
    408 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs