scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25696B)
      1 {
      2   "paper": {
      3     "title": "SHROOM-INDElab at SemEval-2024 Task 6: Zero- and Few-Shot LLM-Based Classification for Hallucination Detection",
      4     "authors": [
      5       "Bradley P. Allen",
      6       "Fina Polat",
      7       "Paul Groth"
      8     ],
      9     "year": 2024,
     10     "venue": "International Workshop on Semantic Evaluation",
     11     "arxiv_id": "2404.03732",
     12     "doi": "10.48550/arXiv.2404.03732"
     13   },
     14   "scan_version": 2,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "The SHROOM-INDElab system using GPT-4 prompt engineering for hallucination detection ranked 4th (model-agnostic) and 6th (model-aware) in SemEval-2024 Task 6, achieving 82.9% and 80.2% accuracy on test sets. A counterintuitive finding was that zero-shot classification outperformed few-shot classification with automatically generated examples. The concept definition of hallucination was the most important prompt component in ablation analysis, while task and role definitions contributed minimally.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "GitHub link provided in abstract footnote: https://www.github.com/bradleypallen/shroom/"
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Uses the publicly available SHROOM shared task datasets provided by SemEval-2024 Task 6, including unlabeled training, labeled validation, and labeled test datasets."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No requirements.txt, Dockerfile, conda environment file, or detailed environment setup section is provided. Only mentions LangChain Python library and OpenAI API usage."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions are provided. The paper describes the approach but does not include README-style commands or a 'Reproducing Results' section."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results in Table 3 are point estimates (e.g., '0.829' accuracy) with no confidence intervals or error bars. Figures 3-6 show point estimates without uncertainty bands."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Claims of improvement over baseline (e.g., '+0.132' accuracy) are made by comparing raw numbers. No statistical significance tests (t-tests, bootstrap tests, etc.) are reported."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Table 3 reports improvements with full baseline context, e.g., 'SHROOM-INDElab 0.829 (+0.132)' alongside baseline '0.697 (+0.000)', allowing the reader to interpret both absolute and relative effect sizes."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No justification for the number of samples per query (5 or 20), no power analysis, and no justification for K=5 examples in Stage 1 or the 64 sampled data points."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The hyperparameter and ablation studies mention 'three different passes' but report no standard deviation, variance, or spread measures across those passes. Only single values are shown."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Table 3 compares against the task baseline system and top-performing competitors (GroupCheckGPT rank 1, HaRMoNEE rank 1)."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Baselines are contemporary SemEval-2024 shared task competitors and the official task baseline, all from the same competition."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Section 4.3 presents an ablation study removing prompt components sequentially: examples, task definition, role definition, and concept definition, measuring their contribution to classifier performance."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Two metrics are used throughout: accuracy and Spearman's correlation coefficient (ρ). Table 4 also reports Cohen's κ."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "Evaluation is automated comparison against gold-standard human labels. Section 4.4 analyzes agreement with human labellers via Fleiss' κ and Cohen's κ, but no humans independently evaluated the system's outputs."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Clear separation between validation and test datasets. Hyperparameter study uses validation set; final results reported on held-out test set (Table 3)."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down by track (model-agnostic vs. model-aware), by dataset (validation vs. test), and Table 4 provides breakdown by human consensus level (low/high/unanimous)."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The ablation study reveals that few-shot examples hurt accuracy. Section 5 discusses: '[The fact that] the exclusion of selected examples led to better accuracy suggests the need for further investigation with respect to how the way in which examples are selected and included... impacts accuracy.'"
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Key negative result: zero-shot outperforms few-shot (ablation study, Section 4.3). Also, Figure 4 shows increasing examples per label decreases Spearman's ρ. These are honest reports of things that didn't work as expected."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims (4th/6th ranking, consistency with human labellers, zero-shot better than few-shot) are all supported by results in Sections 4.1, 4.4, and 4.3 respectively."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
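    124         "justification": "The ablation study (Section 4.3) makes causal claims about component contributions by removing prompt components one at a time in a fixed sequence (examples, then task definition, role definition, and concept definition). Because the removals are cumulative rather than independent, each measured effect is conditional on the components already removed (see the 'Sequential ablation confounding' red flag); the sequential removal design is nonetheless judged adequate for this purpose."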
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Claims are bounded to the SHROOM/SemEval-2024 Task 6 setting. The paper does not overclaim beyond the specific task, datasets, and models tested."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The discussion acknowledges that few-shot examples hurt accuracy and says it 'suggests the need for further investigation' but does not explore alternative explanations for why this occurs or for other results."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper measures accuracy and Spearman's ρ on hallucination detection and frames results as hallucination detection performance on SHROOM. No proxy gap exists — claims match measurement granularity."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Section 4 specifies 'gpt-3.5-turbo' and 'gpt-4-0125-preview' (includes date-based version). Embedding model specified as 'text-embedding-ada-002'. Exact dates of runs are given (January 25, 28 and February 17-18, 2024)."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Figure 2 provides the full prompt text with all components (task definition, role definition, concept definition, examples). Table 2 provides all task and role definitions for each task type. The actual prompt text is shown, not just described."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section 4 reports: temperature (1.2 for submission, varied 0.0-2.0 in study), K=5 examples in Stage 1, 1 example per label, 20 samples for majority voting, λ=0.2 for diversity-consistency trade-off."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. The system is a prompt-based LLM classifier with temperature sampling and majority voting — not an agentic workflow."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 3.3 describes example selection: 64 data points sampled per task type, zero-shot classification, partitioning into positive/negative pools, and the selection algorithm (Algorithm 1) with entropy and diversity criteria."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "Section 5 is titled 'Discussion and Conclusion' and is only 2 paragraphs. There is no dedicated limitations section or substantive discussion of limitations."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No specific threats to validity are discussed. The brief discussion section only mentions need for further investigation into example selection, without identifying specific validity threats."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No explicit statements about what the results do NOT show. No discussion of what populations, settings, or claims are excluded from the findings."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The SHROOM shared task datasets are publicly available through SemEval-2024. Code is available on GitHub. The competition data allows independent verification."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 2 describes the SHROOM dataset structure (task, input, target, generated text, labels). Notes ~200 crowd-sourced human labellers each labeling ~20 data points."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants recruited by the authors. The SHROOM dataset with crowd-sourced labels was provided by the shared task organizers. This is a standard benchmark evaluation."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The two-stage workflow (Figure 1) and example selection process (Section 3.3, Algorithm 1) document the full pipeline from unlabeled data through zero-shot classification to few-shot classification."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Acknowledgements section states: 'This work is partially supported by the European Union's Horizon Europe research and innovation programme within the ENEXA project (grant Agreement no. 101070305).'"
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "All authors are listed as University of Amsterdam, Intelligent Data Engineering Lab. No product being evaluated is affiliated with the authors."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "EU Horizon Europe research funding has no financial stake in the hallucination detection results. The ENEXA project is a general research initiative."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement is included in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No mention of the training data cutoff dates for GPT-3.5-turbo or GPT-4-0125-preview. The models' training data could include SHROOM-related content."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No discussion of whether SHROOM task data or related hallucination examples could appear in GPT-4's training data."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No discussion of contamination risk. SemEval tasks and associated data could potentially appear in LLM training corpora."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants recruited by the authors. This is a benchmark evaluation using shared task datasets."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants recruited by the authors."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants recruited by the authors."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants recruited by the authors."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants recruited by the authors."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants recruited by the authors."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants recruited by the authors."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Section 4 states: 'Approximately $500 USD in OpenAI API charges were incurred during the above runs.'"
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Total API spend of approximately $500 USD is stated, covering Stage 1, Stage 2 submission, and hyperparameter/ablation study runs."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No random seed sensitivity analysis is reported. Temperature sampling introduces stochasticity but its effect on result stability is not analyzed across seeds."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": true,
    305         "justification": "Hyperparameter and ablation studies state 'three different passes over the model-agnostic validation dataset.' Submission uses '20 samples for majority voting.'"
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "A hyperparameter study is conducted varying temperature, examples per label, and samples per query, but the total search budget (total configurations tried, total compute for search) is not quantified."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The hyperparameter study was conducted on gpt-3.5-turbo, but the final submission used gpt-4-0125-preview. The transfer of hyperparameter choices across models is not justified. The submission's temperature of 1.2 is not clearly derived from the gpt-3.5-turbo study results."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "Only a small number of comparisons are made (2 models, baseline vs. system). No mass testing situation requiring correction."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "No acknowledgment of evaluating their own system or potential author bias in prompt design and tuning."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": false,
    329         "answer": false,
    330         "justification": "Comparisons are against shared task submissions where compute differences are part of the system design; compute budget comparison across competitors is not expected."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "No discussion of whether the SHROOM benchmark actually measures hallucination detection capability as intended, or whether the binary framing captures the nuances of hallucination."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No scaffolding involved. The system is a direct prompt-based classifier without agentic components."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether GPT-4's training data could include SHROOM-related content or similar hallucination detection tasks."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the evaluation setup (providing input, target, and generated text together) gives the model information not available in a real deployment scenario."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of independence between training examples (used for Stage 1 zero-shot → Stage 2 few-shot) and evaluation data."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No concrete leakage detection or prevention methods are used or discussed."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "The SHROOM-INDElab system ranked 4th in the model-agnostic track and 6th in the model-aware track of SemEval-2024 Task 6.",
    369       "evidence": "Table 3 shows test set results: 82.9% accuracy (model-agnostic) and 80.2% accuracy (model-aware), compared to 1st-place systems at 84.7% and 81.3%.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "GPT-4-0125-preview significantly outperforms GPT-3.5-turbo for hallucination detection on the SHROOM validation datasets.",
    374       "evidence": "Table 3: GPT-4 achieves 0.814/0.772 accuracy (model-agnostic/aware) vs. GPT-3.5-turbo at 0.773/0.764 on validation sets.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Zero-shot classification provides better accuracy than few-shot classification with automatically generated examples.",
    379       "evidence": "Ablation study (Section 4.3, Figure 6) shows removing examples improves accuracy, though Spearman's ρ slightly decreases. Tested only on gpt-3.5-turbo with one validation dataset.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "The concept definition of hallucination is the most significant prompt component for classifier performance.",
    384       "evidence": "Ablation study (Section 4.3, Figure 6) shows removing the concept definition causes the largest drop in both accuracy and Spearman's ρ compared to removing other components.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "The classifier's labeling decisions are consistent with those of crowd-sourced human labellers.",
    389       "evidence": "Section 4.4: Adding classifier labels to human annotations increases Fleiss' κ from 0.373 to 0.405. Table 4 shows Cohen's κ of 0.623 overall and 0.856 for unanimous human labels.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "System agreement with human labeling increases as the certainty of human labeling increases.",
    394       "evidence": "Table 4 shows accuracy rising from 0.621 (low consensus 2/3 split) to 0.854 (high 4/5 split) to 0.929 (unanimous), with corresponding increases in Cohen's κ.",
    395       "supported": "strong"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "Sequential ablation confounding",
    401       "detail": "The ablation study (Section 4.3) removes components sequentially rather than independently (examples → task definition → role definition → concept definition). The measured effect of each component is confounded with interaction effects from previously removed components."
    402     },
    403     {
    404       "flag": "No statistical tests on main results",
    405       "detail": "All performance comparisons in Table 3 are raw number comparisons with no significance tests, confidence intervals, or variance measures. Improvements like '+0.132' accuracy could be within noise."
    406     },
    407     {
    408       "flag": "Hyperparameter tuning on different model than submission",
    409       "detail": "The hyperparameter study was conducted using gpt-3.5-turbo but the final submission used gpt-4-0125-preview. The transferability of optimal hyperparameters across models is not validated."
    410     },
    411     {
    412       "flag": "No variance reported despite multiple runs",
    413       "detail": "The paper mentions 'three different passes' for hyperparameter and ablation studies but never reports standard deviation or any spread measure, preventing assessment of result stability."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "SelfCheckGPT: Zero-Resource Black-Box Hallucination Detection for Generative Large Language Models",
    419       "authors": ["Potsawee Manakul", "Adian Liusie", "Mark JF Gales"],
    420       "year": 2023,
    421       "arxiv_id": "2303.08896",
    422       "relevance": "Zero-resource hallucination detection method for LLMs, directly related to LLM evaluation and safety."
    423     },
    424     {
    425       "title": "ChainPoll: A High Efficacy Method for LLM Hallucination Detection",
    426       "authors": ["Robert Friel", "Atindriyo Sanyal"],
    427       "year": 2023,
    428       "arxiv_id": "2310.18344",
    429       "relevance": "LLM-based hallucination detection method using prompt engineering, comparable approach to the surveyed system."
    430     },
    431     {
    432       "title": "A Survey on Hallucination in Large Language Models: Principles, Taxonomy, Challenges, and Open Questions",
    433       "authors": ["Lei Huang", "Weijiang Yu", "Weitao Ma"],
    434       "year": 2023,
    435       "arxiv_id": "2311.05232",
    436       "relevance": "Comprehensive survey on LLM hallucination covering taxonomy and evaluation methods."
    437     },
    438     {
    439       "title": "Survey of Hallucination in Natural Language Generation",
    440       "authors": ["Ziwei Ji", "Nayeon Lee", "Rita Frieske"],
    441       "year": 2023,
    442       "relevance": "Foundational survey on hallucination in NLG systems, relevant to LLM safety and evaluation."
    443     },
    444     {
    445       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    446       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    447       "year": 2022,
    448       "relevance": "Foundational work on chain-of-thought prompting, a key technique for LLM capability evaluation."
    449     },
    450     {
    451       "title": "Large Language Models are Zero-Shot Reasoners",
    452       "authors": ["Takeshi Kojima", "Shixiang Shane Gu", "Machel Reid"],
    453       "year": 2022,
    454       "relevance": "Demonstrates zero-shot reasoning capability of LLMs, foundational for prompt-based evaluation approaches."
    455     },
    456     {
    457       "title": "Better Zero-Shot Reasoning with Role-Play Prompting",
    458       "authors": ["Aobo Kong", "Shiwan Zhao", "Hao Chen"],
    459       "year": 2023,
    460       "arxiv_id": "2308.07702",
    461       "relevance": "Role-play prompting technique for improving LLM reasoning, relevant to prompt engineering methodology."
    462     },
    463     {
    464       "title": "Pre-Train, Prompt, and Predict: A Systematic Survey of Prompting Methods in Natural Language Processing",
    465       "authors": ["Pengfei Liu", "Weizhe Yuan", "Jinlan Fu"],
    466       "year": 2023,
    467       "relevance": "Systematic survey of prompting methods covering the landscape of LLM interaction techniques."
    468     },
    469     {
    470       "title": "Universal Self-Adaptive Prompting",
    471       "authors": ["Xingchen Wan", "Ruoxi Sun", "Hootan Nakhost"],
    472       "year": 2023,
    473       "arxiv_id": "2305.14926",
    474       "relevance": "Self-adaptive prompting method for automated example selection in few-shot LLM usage."
    475     },
    476     {
    477       "title": "Leveraging Large Language Models for NLG Evaluation: A Survey",
    478       "authors": ["Zhen Li", "Xiaohan Xu", "Tao Shen"],
    479       "year": 2024,
    480       "arxiv_id": "2401.07103",
    481       "relevance": "Survey on using LLMs as evaluators for NLG, relevant to LLM-as-judge methodology."
    482     }
    483   ]
    484 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs