scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29748B)
      1 {
      2   "paper": {
      3     "title": "FacTool: Factuality Detection in Generative AI — A Tool Augmented Framework for Multi-Task and Multi-Domain Scenarios",
      4     "authors": [
      5       "I-Chun Chern",
      6       "Steffi Chern",
      7       "Shiqi Chen",
      8       "Weizhe Yuan",
      9       "Kehua Feng",
     10       "Chunting Zhou",
     11       "Junxian He",
     12       "Graham Neubig",
     13       "Pengfei Liu"
     14     ],
     15     "year": 2023,
     16     "venue": "arXiv",
     17     "arxiv_id": "2307.13528",
     18     "doi": "10.48550/arXiv.2307.13528"
     19   },
     20   "scan_version": 3,
     21   "active_modules": ["experimental_rigor", "data_leakage"],
     22   "methodology_tags": ["benchmark-eval"],
     23   "key_findings": "FacTool, a tool-augmented factuality detection framework, outperforms LLM self-check baselines across knowledge-based QA, code generation, math problem solving, and scientific literature review. FacTool powered by GPT-4 achieves the highest performance across all scenarios, with especially large gains in scientific literature review (95.24 vs 36.71 claim-level F1 against the best self-check baseline). Self-check methods are prone to false positives and less sensitive at detecting factual errors. Among evaluated chatbots (GPT-4, ChatGPT, Claude-v1, Bard, Vicuna-13B), GPT-4 has the best factual accuracy.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The abstract states: 'We release the code of FACTOOL associated with ChatGPT plugin interface at https://github.com/GAIR-NLP/factool.' A concrete URL is provided."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "While evaluation uses standard public benchmarks (RoSE, HumanEval, GSM-Hard), the paper also constructs custom datasets — FactPrompts (50 KB-QA prompts+responses) and 100 scientific literature review prompts+responses — along with ground-truth annotations for all tasks. These custom datasets and annotations are not explicitly stated as released."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No requirements.txt, Dockerfile, conda environment, or dependency specifications are mentioned in the paper. Only the models used (gpt-3.5-turbo, gpt-4) are named."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No step-by-step reproduction instructions are provided in the paper. Code is released but the paper itself contains no 'Reproducing Results' section or equivalent."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "All results in Tables 4, 5, and 6 are reported as point estimates (e.g., '89.09 claim-level F1') with no confidence intervals, error bars, or uncertainty measures."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper claims 'FACTOOL powered by GPT-4 outperforms all other baselines across all scenarios' based solely on comparing point estimates in Tab. 5 without any statistical significance tests."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Performance is reported with baseline context throughout (e.g., '71.79 v.s. 57.14 response-level F1 on KB-based QA', '95.24 v.s. 36.71 claim-level F1' on scientific review), allowing the reader to assess the magnitude of differences."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "Dataset sizes range from 50 to 164 samples per task (Tab. 3) with no justification for why these sizes are adequate, no power analysis, and no acknowledgment that small sample sizes may limit conclusions."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "All results are single-run point estimates with no standard deviation, variance, or any spread measure reported across runs."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Two baselines are included: Self-Check with zero-shot CoT and Self-Check with 3-shot CoT, both using ChatGPT and GPT-4 (Section 6)."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Self-Check baselines reference Madaan et al. (2023) and Chen et al. (2023), which are contemporary works from the same year using the same models."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "The framework has five components (claim extraction, query generation, tool querying, evidence collection, agreement verification) but no ablation study is conducted to measure the contribution of individual components."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are reported using accuracy, recall, precision, and F1-score at both claim-level and response-level (Tab. 5)."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "Human annotation is used to create ground-truth labels (Section 5.3: 'the authors collectively annotate the extracted claims'), but this is dataset construction, not human evaluation of the system's outputs. The schema specifies 'manual classification of the benchmark or dataset itself does not count.'"
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No explicit dev/test split is described. The system's prompts were likely developed and refined using the same data that evaluation is reported on. No statement about separating data used for prompt engineering from test data."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Results are broken down by task (KB-QA, Code, Math, Scientific) in Tab. 5, by chatbot in Tab. 6, and per-scenario in Figures 4 and 5."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 6.2.3 'Failure Analysis' provides detailed discussion of failure cases across all four tasks, including reasoning errors, conflicting evidence, limited test case variety, round-off errors, and title/author matching issues, with specific examples."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper reports that 'Self-check models powered by ChatGPT outperform FACTOOL powered by ChatGPT on KB-QA' (Section 6.2.1), which is a negative result for their own method. They also analyze why (reasoning errors in ChatGPT's agreement verification)."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The abstract claims 'Experiments on four different tasks show the efficacy of the proposed method,' which is supported by Tab. 5 showing superior performance across all four tasks when powered by GPT-4."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The core claim is that tool augmentation improves factuality detection. This is tested by comparing FacTool (with tools) against Self-Check (same LLM, without tools), which holds the underlying LLM fixed. The comparison does not fully isolate tool use, because FacTool also adds a multi-stage pipeline and additional LLM calls, but the study design is adequate for this comparative claim."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper claims to be a 'task and domain agnostic framework' (abstract and Section 1) but only evaluates on four specific tasks in English. The title and framing significantly overclaim relative to the tested scope."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "No alternative explanations for the results are discussed. For example, the paper does not consider whether improvements come from the specific prompting strategy rather than tool use, or whether the baselines are suboptimally configured."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper clearly defines factuality for each task (Section 3.2, Tab. 2) and measures it directly against those definitions. Claims match the granularity of measurements — no proxy gap exists."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 6 specifies 'ChatGPT (gpt-3.5-turbo-0301) and GPT-4 (gpt-4-0314)' with exact API model identifiers."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Full prompt text is provided in Appendix A (Figures 6, 7, 8) for claim extraction, query generation, and agreement verification across all four tasks."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "No temperature, top-p, max tokens, or other sampling parameters are reported for the LLM API calls, despite these significantly affecting output quality."
    163       },
    164       "scaffolding_described": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "The five-stage pipeline (claim extraction → query generation → tool querying → evidence collection → agreement verification) is described in detail in Section 4, with each component elaborated in dedicated subsections and illustrated in Figures 1-3."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 5 documents dataset construction procedures for each task: sampling 100 summaries from RoSE, 50 responses from FactPrompts, processing HumanEval prompts per Chen et al. (2022), sampling 100 positive-target GSM-Hard prompts, and creating 100 scientific prompts via self-instruct."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 6.2.3 'Failure Analysis' is a dedicated subsection providing substantive discussion of where FacTool fails across all four tasks, including specific failure modes with examples."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The failure analysis (Section 6.2.3) discusses system-level failure cases (reasoning errors, conflicting evidence, round-off errors) but does not discuss threats to the study's validity, such as evaluation methodology limitations, small sample sizes, or potential biases in author annotation."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "The paper claims to be 'task and domain agnostic' without explicitly stating what it does NOT show — e.g., that it was only tested on four specific tasks, in English, using only OpenAI models."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "Raw annotations, intermediate outputs (extracted claims, generated queries, collected evidence per example), and annotated ground truth labels are not made available for independent verification."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section 5 describes data collection procedures for each task including source datasets, sampling criteria (e.g., 'positive target solution value' for GSM-Hard), and response generation methods."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": false,
    205         "answer": false,
    206         "justification": "No human participants in the study. Data comes from standard benchmarks (RoSE, HumanEval, GSM-Hard) and model-generated responses. Author annotation does not constitute human subject participation."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "While dataset construction is described at a high level (Section 5), there are gaps: how exactly FactPrompts were selected from Quora/TruthfulQA is unclear, the scientific prompt generation process lacks detail, and the annotation procedure ('the authors collectively annotate') provides no inter-annotator agreement or annotation protocol."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "The Acknowledgements section thanks three people for discussions but lists no funding sources, grants, or corporate sponsors despite authors being affiliated with multiple universities and Meta AI."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations are clearly listed: Shanghai Jiao Tong University, CMU, City University of Hong Kong, NYU, Meta AI, HKUST, and Shanghai AI Laboratory."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "Cannot be determined since funding is not disclosed. One author (Chunting Zhou) is from Meta AI while the paper evaluates OpenAI products, but without funding disclosure, independence cannot be verified."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests statement or financial interest declarations are present in the paper."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The paper does not state the training data cutoff dates for ChatGPT (gpt-3.5-turbo-0301) or GPT-4 (gpt-4-0314), which is necessary to assess whether benchmark data could be in the training set."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No discussion of whether HumanEval, GSM-Hard, RoSE, or TruthfulQA data appeared in the training data of ChatGPT or GPT-4."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": false,
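    250         "justification": "HumanEval (2021), GSM8K (2021; the dataset from which GSM-Hard is derived), and TruthfulQA (2022) were all published before the evaluated model snapshots (gpt-3.5-turbo-0301, gpt-4-0314) were released and could be in their training data; the paper states no training cutoff that would rule this out. This contamination risk is not discussed."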
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study. Data is from benchmarks and model-generated responses; authors serve as annotators but not as study subjects."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants; IRB review is not applicable."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in the study."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in the study."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in the study."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in the study."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "The framework makes extensive API calls to ChatGPT and GPT-4 for claim extraction, query generation, and agreement verification, plus external tool queries (Google Search, Google Scholar, code execution), but no inference cost, latency, or token consumption is reported."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "No total API spend, computational budget, or wall-clock time is reported despite the framework requiring multiple LLM calls and tool queries per example."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "All results appear to be from single runs. No analysis of sensitivity to random seeds or LLM sampling stochasticity is provided."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The number of experimental runs is never stated. Results are presented without indicating how many runs produced them."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Design choices such as the number of generated search queries (2), test cases (3), and potential solutions (3) appear tuned but no search budget or selection process is described."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The selection of specific configurations (e.g., 2 queries per claim, 3 test cases, 3 potential solutions) is not justified. No comparison of different configurations is shown."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The paper makes many comparisons across 4 tasks × 6 methods × 2 levels (claim/response) without any correction for multiple comparisons, though no formal statistical tests are conducted either."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The authors designed and implemented both FacTool and the self-check baselines, then compared them, without acknowledging the potential bias of evaluating their own system. Baseline implementations may be suboptimal."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "FacTool uses significantly more compute than self-check baselines (multiple LLM calls plus external tool queries vs. a single LLM call), but this compute difference is never discussed or controlled for."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "The paper does not discuss whether its benchmarks adequately measure factuality as defined. For example, whether code execution on synthetic test cases truly captures code factuality, or whether author/title matching captures scientific citation accuracy."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "FacTool uses a complex multi-stage pipeline with external tools while self-check baselines use a simple prompting approach. The confound between the tool/scaffold and the underlying LLM reasoning is not addressed — performance gains could come from the scaffolding rather than tool use per se."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of temporal leakage. Benchmarks like HumanEval (2021) and GSM-Hard (derived from GSM8K, 2021) existed before GPT-4's training, meaning the model may have seen solutions."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether the evaluation setup leaks information. For example, ChatGPT generates both the responses being evaluated and serves as the claim extractor and verifier."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No discussion of independence between training and test data. The same model that generated responses is used in FacTool's pipeline for verification."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "No leakage detection or prevention methods (canary strings, membership inference, temporal splits, decontamination) are applied."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "FacTool powered by GPT-4 outperforms all other baselines across all four scenarios at both claim and response levels.",
    375       "evidence": "Tab. 5 shows FacTool+GPT-4 achieves highest F1 in all tasks: 89.09/71.79 (KB-QA), 92.11/92.11 (Code), 98.97/80.36 (Math), 95.24/94.74 (Scientific) claim/response-level F1.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "FacTool significantly outperforms self-check baselines in scientific literature review, with 95.24 vs 36.71 claim-level F1.",
    380       "evidence": "Tab. 5 shows FacTool+GPT-4 at 95.24 vs Self-Check(3)+GPT-4 at 36.71 claim-level F1, and 94.74 vs 21.54 response-level F1 for scientific literature review.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Self-check models are prone to false positives and less sensitive in detecting factual errors compared to FacTool.",
    385       "evidence": "Tab. 5 shows consistently lower precision for self-check vs FacTool: 63.16 vs 87.50 (KB-QA), 76.43 vs 89.74 (Code), 58.33 vs 69.23 (Math), 12.73 vs 100.00 (Scientific) response-level precision.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "GPT-4 has the best factual accuracy among evaluated chatbots (GPT-4, ChatGPT, Claude-v1, Bard, Vicuna-13B).",
    390       "evidence": "Tab. 6 and Exp-III (Section 6.3) show GPT-4 with 75.60% weighted claim-level accuracy and 43.33% response-level accuracy, highest among all chatbots, evaluated on only 60 prompts.",
    391       "supported": "weak"
    392     },
    393     {
    394       "claim": "ChatGPT's claim extraction closely matches human-annotated atomic content units, with ROUGE-1 F1 of 0.7836.",
    395       "evidence": "Tab. 4 reports ChatGPT achieves 0.7836 ROUGE-1 F1, 0.6610 ROUGE-2 F1, 0.7655 ROUGE-L F1, and 0.7174 BERTScore F1 compared to golden ACUs on RoSE (Exp-I, Section 6.1).",
    396       "supported": "moderate"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "No statistical significance testing",
    402       "detail": "All 'outperforms' claims are based on comparing point estimates without any significance tests. With sample sizes of 50-164, observed differences could be due to chance."
    403     },
    404     {
    405       "flag": "No uncertainty quantification",
    406       "detail": "Single-run results with no error bars, confidence intervals, or variance across runs. LLM outputs are stochastic, so results could vary substantially across runs."
    407     },
    408     {
    409       "flag": "Authors annotated their own benchmark",
    410       "detail": "Section 5.3 states 'the authors collectively annotate the extracted claims' for KB-QA and scientific literature review. No inter-annotator agreement is reported, and author bias toward their own system's outputs is not addressed."
    411     },
    412     {
    413       "flag": "Overclaiming generality",
    414       "detail": "The paper claims to be 'task and domain agnostic' in the abstract and Section 1 but only tests four specific tasks in English using OpenAI models. The framework requires task-specific claim definitions, query strategies, and tools for each new domain."
    415     },
    416     {
    417       "flag": "Unfair compute comparison",
    418       "detail": "FacTool uses multiple LLM calls plus external tool queries (Google Search, Google Scholar, code execution) per example, while self-check baselines use a single LLM call. The significant compute advantage is never discussed."
    419     },
    420     {
    421       "flag": "Small evaluation datasets",
    422       "detail": "Evaluation uses 50-164 samples per task (Tab. 3), with heavily imbalanced class distributions in some cases (e.g., 10:90 positive:negative in scientific literature review). No justification for these sizes."
    423     },
    424     {
    425       "flag": "No contamination analysis",
    426       "detail": "Benchmarks like HumanEval and GSM8K (the source of GSM-Hard) were public before the evaluated GPT-4 snapshot was released and could be in its training data, but this contamination risk is never discussed."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "Evaluating large language models trained on code",
    432       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    433       "year": 2021,
    434       "relevance": "Introduces HumanEval benchmark for code generation evaluation, used as one of the four evaluation tasks in this paper."
    435     },
    436     {
    437       "title": "RARR: Researching and revising what language models say, using language models",
    438       "authors": ["Luyu Gao", "Zhuyun Dai", "Panupong Pasupat"],
    439       "year": 2022,
    440       "relevance": "Prior work on using LLMs to verify and revise their own outputs through retrieval, directly relevant to factuality detection in LLM outputs."
    441     },
    442     {
    443       "title": "Toolformer: Language models can teach themselves to use tools",
    444       "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessì"],
    445       "year": 2023,
    446       "relevance": "Key work on tool use in LLMs, foundational motivation for FacTool's tool-augmented approach to factuality detection."
    447     },
    448     {
    449       "title": "Self-refine: Iterative refinement with self-feedback",
    450       "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"],
    451       "year": 2023,
    452       "relevance": "Self-check baselines in FacTool are based on this work's approach of using LLMs to identify and correct their own errors."
    453     },
    454     {
    455       "title": "Teaching large language models to self-debug",
    456       "authors": ["Xinyun Chen", "Maxwell Lin", "Nathanael Schärli"],
    457       "year": 2023,
    458       "relevance": "Related work on LLM self-debugging for code generation, one of the baseline approaches for self-checking."
    459     },
    460     {
    461       "title": "Survey of hallucination in natural language generation",
    462       "authors": ["Ziwei Ji", "Nayeon Lee", "Rita Frieske"],
    463       "year": 2023,
    464       "relevance": "Comprehensive survey of LLM hallucination problems that motivates factuality detection frameworks like FacTool."
    465     },
    466     {
    467       "title": "PAL: Program-aided language models",
    468       "authors": ["Luyu Gao", "Aman Madaan", "Shuyan Zhou"],
    469       "year": 2022,
    470       "arxiv_id": "2211.10435",
    471       "relevance": "Demonstrates using code execution to augment LLM reasoning, relevant to FacTool's tool-augmented verification approach for math problems."
    472     },
    473     {
    474       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    475       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    476       "year": 2023,
    477       "relevance": "Chain-of-thought prompting is used in FacTool's self-check baselines and agreement verification step."
    478     },
    479     {
    480       "title": "Training verifiers to solve math word problems",
    481       "authors": ["Karl Cobbe", "Vineet Kosaraju", "Mohammad Bavarian"],
    482       "year": 2021,
    483       "arxiv_id": "2110.14168",
    484       "relevance": "Introduces GSM8K dataset from which GSM-Hard (used in FacTool evaluation) is derived."
    485     },
    486     {
    487       "title": "GPT-4 technical report",
    488       "authors": ["OpenAI"],
    489       "year": 2023,
    490       "relevance": "GPT-4 is one of the two primary LLMs used to power FacTool and is evaluated as a chatbot in the factuality assessment."
    491     },
    492     {
    493       "title": "CodeT: Code generation with generated tests",
    494       "authors": ["Bei Chen", "Fengji Zhang", "Anh Nguyen"],
    495       "year": 2022,
    496       "arxiv_id": "2207.10397",
    497       "relevance": "FacTool's code verification approach of generating test cases and potential solutions builds on this work's methodology."
    498     },
    499     {
    500       "title": "LIMA: Less is more for alignment",
    501       "authors": ["Chunting Zhou", "Pengfei Liu", "Puxin Xu"],
    502       "year": 2023,
    503       "relevance": "Source of KB-QA prompts used in FacTool's chatbot evaluation experiment (Exp-III)."
    504     }
    505   ],
    506   "engagement_factors": {
    507     "practical_relevance": {
    508       "score": 3,
    509       "justification": "Directly usable tool for fact-checking LLM outputs across multiple domains, released as open-source code with a ChatGPT plugin interface."
    510     },
    511     "surprise_contrarian": {
    512       "score": 1,
    513       "justification": "Confirms the expected finding that external tools help verify LLM-generated facts rather than challenging conventional wisdom."
    514     },
    515     "fear_safety": {
    516       "score": 1,
    517       "justification": "Addresses LLM hallucination concerns but does not demonstrate novel attacks or raise new safety issues."
    518     },
    519     "drama_conflict": {
    520       "score": 0,
    521       "justification": "No controversy or conflict angle; straightforward framework proposal and evaluation."
    522     },
    523     "demo_ability": {
    524       "score": 3,
    525       "justification": "Code released on GitHub with ChatGPT plugin interface, immediately tryable by developers."
    526     },
    527     "brand_recognition": {
    528       "score": 2,
    529       "justification": "Authors from CMU, NYU, and Meta AI; evaluates ChatGPT and GPT-4 which are high-profile, but paper is not from OpenAI/Anthropic/Google directly."
    530     }
    531   }
    532 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs