ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (31778B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "FacTool: Factuality Detection in Generative AI - A Tool Augmented Framework for Multi-Task and Multi-Domain Scenarios",
      6     "authors": [
      7       "I-Chun Chern",
      8       "Steffi Chern",
      9       "Shiqi Chen",
     10       "Weizhe Yuan",
     11       "Kehua Feng",
     12       "Chunting Zhou",
     13       "Junxian He",
     14       "Graham Neubig",
     15       "Pengfei Liu"
     16     ],
     17     "year": 2023,
     18     "venue": "arXiv.org",
     19     "arxiv_id": "2307.13528",
     20     "doi": "10.48550/arXiv.2307.13528"
     21   },
     22   "checklist": {
     23     "claims_and_evidence": {
     24       "abstract_claims_supported": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The abstract claims 'Experiments on four different tasks show the efficacy of the proposed method,' which is supported by Tab. 5 showing superior performance across all four tasks when powered by GPT-4.",
     28         "source": "opus"
     29       },
     30       "causal_claims_justified": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The core claim is that tool augmentation improves factuality detection. This is tested by comparing FacTool (with tools) against Self-Check (same LLM, without tools), which is a controlled comparison isolating the tool-use variable. The study design is adequate for this comparative claim.",
     34         "source": "opus"
     35       },
     36       "generalization_bounded": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper claims to be a 'task and domain agnostic framework' (abstract and Section 1) but only evaluates on four specific tasks in English. The title and framing significantly overclaim relative to the tested scope.",
     40         "source": "opus"
     41       },
     42       "alternative_explanations_discussed": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No alternative explanations for the results are discussed. For example, the paper does not consider whether improvements come from the specific prompting strategy rather than tool use, or whether the baselines are suboptimally configured.",
     46         "source": "opus"
     47       },
     48       "proxy_outcome_distinction": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper clearly defines factuality for each task (Section 3.2, Tab. 2) and measures it directly against those definitions. Claims match the granularity of measurements — no proxy gap exists.",
     52         "source": "opus"
     53       }
     54     },
     55     "limitations_and_scope": {
     56       "limitations_section_present": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Section 6.2.3 'Failure Analysis' is a dedicated subsection providing substantive discussion of where FacTool fails across all four tasks, including specific failure modes with examples.",
     60         "source": "opus"
     61       },
     62       "threats_to_validity_specific": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The failure analysis (Section 6.2.3) discusses system-level failure cases (reasoning errors, conflicting evidence, round-off errors) but does not discuss threats to the study's validity, such as evaluation methodology limitations, small sample sizes, or potential biases in author annotation.",
     66         "source": "opus"
     67       },
     68       "scope_boundaries_stated": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "The paper claims to be 'task and domain agnostic' without explicitly stating what it does NOT show — e.g., that it was only tested on four specific tasks, in English, using only OpenAI models.",
     72         "source": "opus"
     73       }
     74     },
     75     "conflicts_of_interest": {
     76       "funding_disclosed": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "The Acknowledgements section thanks three people for discussions but lists no funding sources, grants, or corporate sponsors despite authors being affiliated with multiple universities and Meta AI.",
     80         "source": "opus"
     81       },
     82       "affiliations_disclosed": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Author affiliations are clearly listed: Shanghai Jiao Tong University, CMU, City University of Hong Kong, NYU, Meta AI, HKUST, and Shanghai AI Laboratory.",
     86         "source": "opus"
     87       },
     88       "funder_independent_of_outcome": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "Cannot be determined since funding is not disclosed. One author (Chunting Zhou) is from Meta AI while the paper evaluates OpenAI products, but without funding disclosure, independence cannot be verified.",
     92         "source": "opus"
     93       },
     94       "financial_interests_declared": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No competing interests statement or financial interest declarations are present in the paper.",
     98         "source": "opus"
     99       }
    100     },
    101     "scope_and_framing": {
    102       "key_terms_defined": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 3.1 formally defines 'factuality' and Section 3.2 provides precise definitions of prompt, response, claim, and evidence for each of the four task domains.",
    106         "source": "haiku"
    107       },
    108       "intended_contribution_clear": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The contributions are explicitly enumerated in the Introduction: extending factuality detection, connecting tool use with factuality, and evaluating modern chatbots.",
    112         "source": "haiku"
    113       },
    114       "engagement_with_prior_work": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Section 2 and Table 1 systematically compare FACTOOL against prior approaches (FEVER, FactCC, QAGS, WICE, RARR) on specific dimensions (claim source, evidence provision, domain covered).",
    118         "source": "haiku"
    119       }
    120     }
    121   },
    122   "type_checklist": {
    123     "empirical": {
    124       "artifacts": {
    125         "code_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "The abstract states: 'We release the code of FACTOOL associated with ChatGPT plugin interface at https://github.com/GAIR-NLP/factool.' A concrete URL is provided.",
    129           "source": "opus"
    130         },
    131         "data_released": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "While evaluation uses standard public benchmarks (RoSE, HumanEval, GSM-Hard), the paper also constructs custom datasets — FactPrompts (50 KB-QA prompts+responses) and 100 scientific literature review prompts+responses — along with ground-truth annotations for all tasks. These custom datasets and annotations are not explicitly stated as released.",
    135           "source": "opus"
    136         },
    137         "environment_specified": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No requirements.txt, Dockerfile, conda environment, or dependency specifications are mentioned in the paper. Only the models used (gpt-3.5-turbo, gpt-4) are named.",
    141           "source": "opus"
    142         },
    143         "reproduction_instructions": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "No step-by-step reproduction instructions are provided in the paper. Code is released but the paper itself contains no 'Reproducing Results' section or equivalent.",
    147           "source": "opus"
    148         }
    149       },
    150       "statistical_methodology": {
    151         "confidence_intervals_or_error_bars": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "All results in Tables 4, 5, and 6 are reported as point estimates (e.g., '89.09 claim-level F1') with no confidence intervals, error bars, or uncertainty measures.",
    155           "source": "opus"
    156         },
    157         "significance_tests": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "The paper claims 'FACTOOL powered by GPT-4 outperforms all other baselines across all scenarios' based solely on comparing point estimates in Tab. 5 without any statistical significance tests.",
    161           "source": "opus"
    162         },
    163         "effect_sizes_reported": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "Performance is reported with baseline context throughout (e.g., '71.79 v.s. 57.14 response-level F1 on KB-based QA', '95.24 v.s. 36.71 claim-level F1' on scientific review), allowing the reader to assess the magnitude of differences.",
    167           "source": "opus"
    168         },
    169         "sample_size_justified": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "Dataset sizes range from 50 to 164 samples per task (Tab. 3) with no justification for why these sizes are adequate, no power analysis, and no acknowledgment that small sample sizes may limit conclusions.",
    173           "source": "opus"
    174         },
    175         "variance_reported": {
    176           "applies": true,
    177           "answer": false,
    178           "justification": "All results are single-run point estimates with no standard deviation, variance, or any spread measure reported across runs.",
    179           "source": "opus"
    180         }
    181       },
    182       "evaluation_design": {
    183         "baselines_included": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Two baselines are included: Self-Check with zero-shot CoT and Self-Check with 3-shot CoT, both using ChatGPT and GPT-4 (Section 6).",
    187           "source": "opus"
    188         },
    189         "baselines_contemporary": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Self-Check baselines reference Madaan et al. (2023) and Chen et al. (2023), which are contemporary works from the same year using the same models.",
    193           "source": "opus"
    194         },
    195         "ablation_study": {
    196           "applies": true,
    197           "answer": false,
    198           "justification": "The framework has five components (claim extraction, query generation, tool querying, evidence collection, agreement verification) but no ablation study is conducted to measure the contribution of individual components.",
    199           "source": "opus"
    200         },
    201         "multiple_metrics": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "Results are reported using accuracy, recall, precision, and F1-score at both claim-level and response-level (Tab. 5).",
    205           "source": "opus"
    206         },
    207         "human_evaluation": {
    208           "applies": true,
    209           "answer": false,
    210           "justification": "Human annotation is used to create ground-truth labels (Section 5.3: 'the authors collectively annotate the extracted claims'), but this is dataset construction, not human evaluation of the system's outputs. The schema specifies 'manual classification of the benchmark or dataset itself does not count.'",
    211           "source": "opus"
    212         },
    213         "held_out_test_set": {
    214           "applies": true,
    215           "answer": false,
    216           "justification": "No explicit dev/test split is described. The system's prompts were likely developed and refined using the same data that evaluation is reported on. The paper makes no statement about separating the data used for prompt engineering from the test data.",
    217           "source": "opus"
    218         },
    219         "per_category_breakdown": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Results are broken down by task (KB-QA, Code, Math, Scientific) in Tab. 5, by chatbot in Tab. 6, and per-scenario in Figures 4 and 5.",
    223           "source": "opus"
    224         },
    225         "failure_cases_discussed": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Section 6.2.3 'Failure Analysis' provides detailed discussion of failure cases across all four tasks, including reasoning errors, conflicting evidence, limited test case variety, round-off errors, and title/author matching issues, with specific examples.",
    229           "source": "opus"
    230         },
    231         "negative_results_reported": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "The paper reports that 'Self-check models powered by ChatGPT outperform FACTOOL powered by ChatGPT on KB-QA' (Section 6.2.1), which is a negative result for their own method. They also analyze why (reasoning errors in ChatGPT's agreement verification).",
    235           "source": "opus"
    236         }
    237       },
    238       "setup_transparency": {
    239         "model_versions_specified": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Section 6 specifies 'ChatGPT (gpt-3.5-turbo-0301) and GPT-4 (gpt-4-0314)' with exact API model identifiers.",
    243           "source": "opus"
    244         },
    245         "prompts_provided": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Full prompt text is provided in Appendix A (Figures 6, 7, 8) for claim extraction, query generation, and agreement verification across all four tasks.",
    249           "source": "opus"
    250         },
    251         "hyperparameters_reported": {
    252           "applies": true,
    253           "answer": false,
    254           "justification": "No temperature, top-p, max tokens, or other sampling parameters are reported for the LLM API calls, despite these significantly affecting output quality.",
    255           "source": "opus"
    256         },
    257         "scaffolding_described": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "The five-stage pipeline (claim extraction → query generation → tool querying → evidence collection → agreement verification) is described in detail in Section 4, with each component elaborated in dedicated subsections and illustrated in Figures 1-3.",
    261           "source": "opus"
    262         },
    263         "data_preprocessing_documented": {
    264           "applies": true,
    265           "answer": true,
    266           "justification": "Section 5 documents dataset construction procedures for each task: sampling 100 summaries from RoSE, 50 responses from FactPrompts, processing HumanEval prompts per Chen et al. (2022), sampling 100 positive-target GSM-Hard prompts, and creating 100 scientific prompts via self-instruct.",
    267           "source": "opus"
    268         }
    269       },
    270       "data_integrity": {
    271         "raw_data_available": {
    272           "applies": true,
    273           "answer": false,
    274           "justification": "Raw annotations, intermediate outputs (extracted claims, generated queries, collected evidence per example), and annotated ground truth labels are not made available for independent verification.",
    275           "source": "opus"
    276         },
    277         "data_collection_described": {
    278           "applies": true,
    279           "answer": true,
    280           "justification": "Section 5 describes data collection procedures for each task including source datasets, sampling criteria (e.g., 'positive target solution value' for GSM-Hard), and response generation methods.",
    281           "source": "opus"
    282         },
    283         "recruitment_methods_described": {
    284           "applies": false,
    285           "answer": false,
    286           "justification": "No human participants in the study. Data comes from standard benchmarks (RoSE, HumanEval, GSM-Hard) and model-generated responses. Author annotation does not constitute human subject participation.",
    287           "source": "opus"
    288         },
    289         "data_pipeline_documented": {
    290           "applies": true,
    291           "answer": false,
    292           "justification": "While dataset construction is described at a high level (Section 5), there are gaps: how exactly FactPrompts were selected from Quora/TruthfulQA is unclear, the scientific prompt generation process lacks detail, and the annotation procedure ('the authors collectively annotate') provides no inter-annotator agreement or annotation protocol.",
    293           "source": "opus"
    294         }
    295       },
    296       "contamination": {
    297         "training_cutoff_stated": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "The paper does not state the training data cutoff dates for ChatGPT (gpt-3.5-turbo-0301) or GPT-4 (gpt-4-0314), which is necessary to assess whether benchmark data could be in the training set.",
    301           "source": "opus"
    302         },
    303         "train_test_overlap_discussed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "No discussion of whether HumanEval, GSM-Hard, RoSE, or TruthfulQA data appeared in the training data of ChatGPT or GPT-4.",
    307           "source": "opus"
    308         },
    309         "benchmark_contamination_addressed": {
    310           "applies": true,
    311           "answer": false,
    312           "justification": "HumanEval (2021), GSM8K/GSM-Hard (2021), and TruthfulQA (2022) were all published before the GPT-4 training cutoff and could be in the training data. This contamination risk is not discussed.",
    313           "source": "opus"
    314         }
    315       },
    316       "human_studies": {
    317         "pre_registered": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants in this study. Data is from benchmarks and model-generated responses; authors serve as annotators but not as study subjects.",
    321           "source": "opus"
    322         },
    323         "irb_or_ethics_approval": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants; IRB review is not applicable.",
    327           "source": "opus"
    328         },
    329         "demographics_reported": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants in the study.",
    333           "source": "opus"
    334         },
    335         "inclusion_exclusion_criteria": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants in the study.",
    339           "source": "opus"
    340         },
    341         "randomization_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants in the study.",
    345           "source": "opus"
    346         },
    347         "blinding_described": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants in the study.",
    351           "source": "opus"
    352         },
    353         "attrition_reported": {
    354           "applies": false,
    355           "answer": false,
    356           "justification": "No human participants in the study.",
    357           "source": "opus"
    358         }
    359       },
    360       "cost_and_practicality": {
    361         "inference_cost_reported": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "The framework makes extensive API calls to ChatGPT and GPT-4 for claim extraction, query generation, and agreement verification, plus external tool queries (Google Search, Google Scholar, code execution), but no inference cost, latency, or token consumption is reported.",
    365           "source": "opus"
    366         },
    367         "compute_budget_stated": {
    368           "applies": true,
    369           "answer": false,
    370           "justification": "No total API spend, computational budget, or wall-clock time is reported despite the framework requiring multiple LLM calls and tool queries per example.",
    371           "source": "opus"
    372         }
    373       },
    374       "experimental_rigor": {
    375         "seed_sensitivity_reported": {
    376           "applies": true,
    377           "answer": false,
    378           "justification": "All results appear to be from single runs. No analysis of sensitivity to random seeds or LLM sampling stochasticity is provided.",
    379           "source": "opus"
    380         },
    381         "number_of_runs_stated": {
    382           "applies": true,
    383           "answer": false,
    384           "justification": "The number of experimental runs is never stated. Results are presented without indicating how many runs produced them.",
    385           "source": "opus"
    386         },
    387         "hyperparameter_search_budget": {
    388           "applies": true,
    389           "answer": false,
    390           "justification": "Design choices such as the number of generated search queries (2), test cases (3), and potential solutions (3) appear tuned but no search budget or selection process is described.",
    391           "source": "opus"
    392         },
    393         "best_config_selection_justified": {
    394           "applies": true,
    395           "answer": false,
    396           "justification": "The selection of specific configurations (e.g., 2 queries per claim, 3 test cases, 3 potential solutions) is not justified. No comparison of different configurations is shown.",
    397           "source": "opus"
    398         },
    399         "multiple_comparison_correction": {
    400           "applies": true,
    401           "answer": false,
    402           "justification": "The paper makes many comparisons across 4 tasks × 6 methods × 2 levels (claim/response) without any correction for multiple comparisons, though no formal statistical tests are conducted either.",
    403           "source": "opus"
    404         },
    405         "self_comparison_bias_addressed": {
    406           "applies": true,
    407           "answer": false,
    408           "justification": "The authors designed and implemented both FacTool and the self-check baselines, then compared them, without acknowledging the potential bias of evaluating their own system. Baseline implementations may be suboptimal.",
    409           "source": "opus"
    410         },
    411         "compute_budget_vs_performance": {
    412           "applies": true,
    413           "answer": false,
    414           "justification": "FacTool uses significantly more compute than self-check baselines (multiple LLM calls plus external tool queries vs. a single LLM call), but this compute difference is never discussed or controlled for.",
    415           "source": "opus"
    416         },
    417         "benchmark_construct_validity": {
    418           "applies": true,
    419           "answer": false,
    420           "justification": "The paper does not discuss whether its benchmarks adequately measure factuality as defined. For example, whether code execution on synthetic test cases truly captures code factuality, or whether author/title matching captures scientific citation accuracy.",
    421           "source": "opus"
    422         },
    423         "scaffold_confound_addressed": {
    424           "applies": true,
    425           "answer": false,
    426           "justification": "FacTool uses a complex multi-stage pipeline with external tools while self-check baselines use a simple prompting approach. The confound between the tool/scaffold and the underlying LLM reasoning is not addressed — performance gains could come from the scaffolding rather than tool use per se.",
    427           "source": "opus"
    428         }
    429       },
    430       "data_leakage": {
    431         "temporal_leakage_addressed": {
    432           "applies": true,
    433           "answer": false,
    434           "justification": "No discussion of temporal leakage. Benchmarks like HumanEval (2021) and GSM-Hard (derived from GSM8K, 2021) existed before GPT-4's training, meaning the model may have seen solutions.",
    435           "source": "opus"
    436         },
    437         "feature_leakage_addressed": {
    438           "applies": true,
    439           "answer": false,
    440           "justification": "No discussion of whether the evaluation setup leaks information. For example, ChatGPT generates both the responses being evaluated and serves as the claim extractor and verifier.",
    441           "source": "opus"
    442         },
    443         "non_independence_addressed": {
    444           "applies": true,
    445           "answer": false,
    446           "justification": "No discussion of independence between training and test data. The same model that generated responses is used in FacTool's pipeline for verification.",
    447           "source": "opus"
    448         },
    449         "leakage_detection_method": {
    450           "applies": true,
    451           "answer": false,
    452           "justification": "No leakage detection or prevention methods (canary strings, membership inference, temporal splits, decontamination) are applied.",
    453           "source": "opus"
    454         }
    455       }
    456     }
    457   },
    458   "claims": [
    459     {
    460       "claim": "FACTOOL powered by GPT-4 outperforms all self-check baselines across all four task scenarios",
    461       "evidence": "Table 5 shows GPT-4 FACTOOL achieves higher F1 than Self-Check(0) and Self-Check(3) in KB-QA (71.79 vs 57.14), Code (92.11 vs 85.26), Math (80.36 vs 70.59), Scientific (94.74 vs 21.54) at response level",
    462       "supported": "strong"
    463     },
    464     {
    465       "claim": "Tool augmentation provides the largest benefit over self-checking in scientific literature review",
    466       "evidence": "Response-level F1 of 94.74 for FACTOOL vs 21.54 for Self-Check(3) with GPT-4 backbone on scientific task (Table 5); Google Scholar is described as highly robust for citation verification",
    467       "supported": "strong"
    468     },
    469     {
    470       "claim": "GPT-4 has the best factual accuracy among evaluated chatbots (GPT-4, ChatGPT, Claude-v1, Bard, Vicuna-13B)",
    471       "evidence": "Table 6 reports weighted claim-level accuracy: GPT-4 75.60%, ChatGPT 68.63%, Claude-v1 63.95%, Bard 61.15%, Vicuna-13B 50.35%, evaluated using FACTOOL as the golden evaluator",
    472       "supported": "moderate"
    473     },
    474     {
    475       "claim": "Self-check models tend to classify claims as True more frequently than FACTOOL, resulting in lower precision",
    476       "evidence": "Table 5 shows Self-Check(3)/GPT-4 response-level precision of 63.16 vs FACTOOL 87.50 on KB-QA; 12.73 vs 100.00 on Scientific literature review",
    477       "supported": "strong"
    478     },
    479     {
    480       "claim": "ChatGPT and GPT-4 perform similarly on claim extraction quality vs human ACU annotations",
    481       "evidence": "Table 4 shows similar ROUGE-1 F1: ChatGPT 0.7836, GPT-4 0.7860; ChatGPT is recommended for claim extraction due to cost efficiency",
    482       "supported": "moderate"
    483     },
    484     {
    485       "claim": "FACTOOL powered by ChatGPT is outperformed by Self-Check(0) on KB-QA at response level",
    486       "evidence": "Table 5 shows Self-Check(0)/ChatGPT achieves 54.90 response F1 vs FACTOOL/ChatGPT 52.63 on KB-QA, attributed to reasoning errors in agreement verification",
    487       "supported": "moderate"
    488     }
    489   ],
    490   "methodology_tags": [
    491     "benchmark-eval",
    492     "case-study"
    493   ],
    494   "key_findings": "FACTOOL is a five-step tool-augmented framework (claim extraction, query generation, tool querying, evidence collection, agreement verification) that outperforms LLM self-checking for factuality detection across KB-QA, code generation, math problem solving, and scientific literature review. GPT-4 powered FACTOOL achieves the best performance on all tasks, with the largest advantage over baselines in scientific literature review (94.74 vs 21.54 response-level F1). Applied as an evaluator for chatbot comparison, FACTOOL ranks GPT-4 highest in factual accuracy (75.60% weighted claim-level) followed by ChatGPT, Claude-v1, Bard, and Vicuna-13B. Failure analysis identifies reasoning errors, conflicting evidence, and ambiguous claims as the main failure modes for KB-QA, while limited test case diversity affects code generation.",
    495   "red_flags": [
    496     {
    497       "flag": "Circular self-evaluation in Exp-III",
    498       "detail": "Experiment III uses GPT-4-powered FACTOOL as the 'golden evaluator' to rank chatbots including GPT-4 itself, creating a circularity where GPT-4's own factuality judgments determine GPT-4's ranking."
    499     },
    500     {
    501       "flag": "No statistical significance testing",
    502       "detail": "All comparative claims ('outperforms all baselines across all scenarios') are made without statistical significance tests on small samples of 50-164 examples per task."
    503     },
    504     {
    505       "flag": "Benchmark contamination unaddressed",
    506       "detail": "HumanEval (2021), GSM8K, and RoSE are all pre-training cutoff for GPT-3.5 and GPT-4; the paper does not discuss whether contamination could inflate or deflate results."
    507     },
    508     {
    509       "flag": "Weak baselines",
    510       "detail": "Self-check (prompting the model to verify itself) is a natural but weak baseline; no comparison to other retrieval-augmented factuality systems like RARR is done in a head-to-head evaluation."
    511     },
    512     {
    513       "flag": "No confidence intervals or variance",
    514       "detail": "All results are single-run point estimates with no variance reporting; LLM stochasticity means results could vary significantly across runs."
    515     },
    516     {
    517       "flag": "No limitations section",
    518       "detail": "Despite claiming domain-agnostic generalization, the paper has no dedicated limitations or threats-to-validity section; the failure analysis is not a substitute for one."
    519     }
    520   ],
    521   "cited_papers": [
    522     {
    523       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    524       "relevance": "Core evaluation benchmark for code generation factuality task"
    525     },
    526     {
    527       "title": "RARR: Researching and Revising What Language Models Say, Using Language Models",
    528       "relevance": "Most closely related prior work on factuality detection without explicit claims/evidence"
    529     },
    530     {
    531       "title": "Training Verifiers to Solve Math Word Problems (GSM8K)",
    532       "relevance": "Base dataset for math problem evaluation task"
    533     },
    534     {
    535       "title": "Revisiting the Gold Standard: Grounding Summarization Evaluation with Robust Human Evaluation (RoSE)",
    536       "relevance": "Provides ground-truth ACUs used for claim extraction evaluation and KB-QA task"
    537     },
    538     {
    539       "title": "FEVER: A Large-Scale Dataset for Fact Extraction and VERification",
    540       "relevance": "Foundational prior work in fact verification that FACTOOL extends beyond"
    541     },
    542     {
    543       "title": "Evaluating the Factual Consistency of Abstractive Text Summarization (FactCC)",
    544       "relevance": "Prior factuality detection approach compared against in Table 1"
    545     },
    546     {
    547       "title": "Toolformer: Language Models Can Teach Themselves to Use Tools",
    548       "relevance": "Related work on tool use in LLMs that motivates FACTOOL's tool-augmented approach"
    549     },
    550     {
    551       "title": "TruthfulQA: Measuring How Models Mimic Human Falsehoods",
    552       "relevance": "Source of prompts used in FactPrompts dataset construction"
    553     },
    554     {
    555       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    556       "relevance": "Baseline reasoning approach used in self-check baselines and FACTOOL verification"
    557     }
    558   ],
    559   "engagement_factors": {
    560     "practical_relevance": {
    561       "score": 3,
    562       "justification": "Directly usable tool for fact-checking LLM outputs across multiple domains, released as open-source code with a ChatGPT plugin interface."
    563     },
    564     "surprise_contrarian": {
    565       "score": 1,
    566       "justification": "Confirms the expected finding that external tools help verify LLM-generated facts rather than challenging conventional wisdom."
    567     },
    568     "fear_safety": {
    569       "score": 1,
    570       "justification": "Addresses LLM hallucination concerns but does not demonstrate novel attacks or raise new safety issues."
    571     },
    572     "drama_conflict": {
    573       "score": 0,
    574       "justification": "No controversy or conflict angle; straightforward framework proposal and evaluation."
    575     },
    576     "demo_ability": {
    577       "score": 3,
    578       "justification": "Code released on GitHub with ChatGPT plugin interface, immediately tryable by developers."
    579     },
    580     "brand_recognition": {
    581       "score": 2,
    582       "justification": "Authors from CMU, NYU, and Meta AI; evaluates ChatGPT and GPT-4 which are high-profile, but paper is not from OpenAI/Anthropic/Google directly."
    583     }
    584   },
    585   "hn_data": {
    586     "threads": [
    587       {
    588         "hn_id": "35544388",
    589         "title": "Many bioinformatics programming tasks can be automated with ChatGPT",
    590         "points": 1,
    591         "comments": 0,
    592         "url": "https://news.ycombinator.com/item?id=35544388"
    593       },
    594       {
    595         "hn_id": "37317042",
    596         "title": "Two-way quantum computers – enhancement of 1WQC to solve NP problems",
    597         "points": 1,
    598         "comments": 1,
    599         "url": "https://news.ycombinator.com/item?id=37317042"
    600       }
    601     ],
    602     "top_points": 1,
    603     "total_points": 2,
    604     "total_comments": 1
    605   }
    606 }

Impressum · Datenschutz