ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (35089B)


      1 {
      2   "paper": {
      3     "title": "GPT-4 Technical Report",
      4     "authors": ["OpenAI"],
      5     "year": 2023,
      6     "venue": "arXiv",
      7     "arxiv_id": "2303.08774"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "checklist": {
     12     "artifacts": {
     13       "code_released": {
     14         "applies": true,
     15         "answer": true,
     16         "justification": "The paper open-sources OpenAI Evals (https://github.com/openai/evals), their evaluation framework. However, model weights, training code, and full reproduction pipelines are not released."
     17       },
     18       "data_released": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The evaluations use standard public benchmarks (MMLU, HumanEval, HellaSwag, ARC, WinoGrande, DROP, GSM-8K, TruthfulQA) that are all publicly available. The paper does not create new datasets."
     22       },
     23       "environment_specified": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "Section 2 explicitly states: 'this report contains no further details about the architecture (including model size), hardware, training compute, dataset construction, training method, or similar.' No environment specifications are provided."
     27       },
     28       "reproduction_instructions": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No step-by-step reproduction instructions are provided. The model is proprietary, architecture is withheld, and the specific evaluation scripts beyond the Evals framework are not released. The paper cannot be reproduced by external researchers."
     32       }
     33     },
     34     "statistical_methodology": {
     35       "confidence_intervals_or_error_bars": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Tables 1 and 2 report point estimates only (e.g., '298 / 400', '86.4%') with no confidence intervals or error bars. The only uncertainty quantification is the ECE in Figure 8's calibration plots."
     39       },
     40       "significance_tests": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper claims 'GPT-4 outperforms both previous large language models and most state-of-the-art systems' based solely on comparing numbers. No statistical significance tests (p-values, t-tests, bootstrap) are reported for any comparison."
     44       },
     45       "effect_sizes_reported": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper reports effect sizes with baselines: '19 percentage points higher than our latest GPT-3.5' (Section 5), '82% compared to GPT-3.5' (Section 6), and Tables 1-2 provide both GPT-4 and GPT-3.5 scores enabling magnitude comparison."
     49       },
     50       "sample_size_justified": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No justification is given for the number of exam questions, benchmark examples, or evaluation prompts used. The 5,214 prompts for the preference evaluation and 10 Codeforces contests are stated but not justified."
     54       },
     55       "variance_reported": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "Most results are single-run point estimates. The Codeforces section mentions averaging over 100 simulations but reports no standard deviation. Appendix A.3 explicitly states free-response questions were run 'only a single time.' No variance or standard deviation is reported across experimental runs."
     59       }
     60     },
     61     "evaluation_design": {
     62       "baselines_included": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "GPT-3.5 is the primary baseline throughout. Table 2 also compares to LM SOTA (PaLM, LLaMA, U-PaLM) and overall SOTA with benchmark-specific tuning."
     66       },
     67       "baselines_contemporary": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "Baselines include PaLM (2022), LLaMA (2023), Chinchilla (2022), U-PaLM (2022), and Flan-PaLM (2022), which were all contemporary or near-contemporary at time of publication."
     71       },
     72       "ablation_study": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Appendix B compares GPT-4 base model vs GPT-4 post-RLHF on all exam benchmarks (73.7% vs 74.0% average). Tables 9-10 show results with contaminated questions removed. Table 1 compares vision vs no-vision variants."
     76       },
     77       "multiple_metrics": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "The paper evaluates on dozens of different metrics across exams (bar exam, SAT, GRE, AP exams) and benchmarks (MMLU accuracy, HumanEval pass rate, HellaSwag, ARC, WinoGrande, DROP F1, GSM-8K, TruthfulQA, RealToxicityPrompts, human preference rate)."
     81       },
     82       "human_evaluation": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Human labelers judged GPT-4 vs GPT-3.5 responses on 5,214 prompts (70.2% preference for GPT-4, Section 4). Free-response exam answers were graded by 'qualified third-party contractors.' Over 50 expert red teamers evaluated model outputs."
     86       },
     87       "held_out_test_set": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Section 4: 'The evaluation setup was designed based on performance on a validation set of exams, and we report final results on held-out test exams.' Appendix A.2 describes the holdout/non-holdout exam pair methodology."
     91       },
     92       "per_category_breakdown": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Table 1 breaks down performance by individual exam. Table 2 by benchmark. Figure 5 by language. Figure 6 by factuality category (learning, technology, writing, etc.). Tables 9-10 by contamination per exam."
     96       },
     97       "failure_cases_discussed": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Section 5 discusses hallucinations, reasoning errors, and overconfidence. Table 4 shows an incorrect TruthfulQA answer. Codeforces rating is below 5th percentile. The System Card extensively catalogs failure modes including harmful content generation, bias, and jailbreaks (Figures 1, 10)."
    101       },
    102       "negative_results_reported": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Codeforces performance is poor (below 5th percentile). AP English scores are mediocre (2/5). RLHF degrades calibration (Figure 8, ECE from 0.007 to 0.074). Appendix B shows RLHF hurts performance on several individual exams. GPT-4 underperformed predictions on the easiest HumanEval bucket (Section 3.2)."
    106       }
    107     },
    108     "claims_and_evidence": {
    109       "abstract_claims_supported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The abstract claims 'human-level performance on various professional and academic benchmarks' (supported by Table 1), 'passing a simulated bar exam with a score around the top 10%' (298/400, ~90th percentile in Table 1), and 'accurately predict some aspects of GPT-4's performance' (supported by Figures 1-2)."
    113       },
    114       "causal_claims_justified": {
    115         "applies": true,
    116         "answer": false,
    117         "justification": "The paper claims 'The post-training alignment process results in improved performance on measures of factuality' (abstract) and 'We've decreased the model's tendency to respond to requests for disallowed content by 82%' (Section 6), attributing these to the combined RLHF + RBRM + data filtering pipeline without isolating individual intervention effects. The base vs RLHF ablation (Appendix B) only covers exam performance, not the emphasized safety/factuality claims."
    118       },
    119       "generalization_bounded": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Claims are generally bounded: 'human-level performance on various professional and academic benchmarks' (not general intelligence), 'on translated variants of MMLU' for multilingual claims. The abstract explicitly states 'While less capable than humans in many real-world scenarios.' Section 5 bounds capabilities clearly."
    123       },
    124       "alternative_explanations_discussed": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Appendices C-D extensively analyze contamination as an alternative explanation for benchmark performance, with per-exam contamination rates and decontaminated scores. Appendix E acknowledges GSM-8K was partially in training data. Figure 3 discusses inverse scaling as an alternative trend."
    128       },
    129       "proxy_outcome_distinction": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The abstract states 'While less capable than humans in many real-world scenarios' while claiming 'human-level performance on various professional and academic benchmarks,' explicitly distinguishing benchmark performance from real-world capability. Section 5 reinforces this by discussing hallucinations and reasoning errors despite strong exam scores."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper refers to 'GPT-4' and 'GPT-3.5' without version strings. Appendix A.7 gives internal snapshot dates (March 1, 2023; February 23, 2023; December 16, 2022) that are not externally reproducible identifiers. Architecture and model size are explicitly withheld (Section 2)."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "Appendix A.8 provides detailed few-shot prompts used for multiple-choice evaluation (AP Art History example with full chain-of-thought) and free-response evaluation, including actual prompt text, sampling parameters (temperature, max_tokens, stop tokens), and format."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Temperature settings are specified: 0.3 for MCQ explanations, 0.0 for letter extraction, 0.6 for free-response (Appendix A.2-A.3). Max tokens and stop tokens are shown in the example prompts. The Codeforces evaluation reports 10 attempts per problem."
    150       },
    151       "scaffolding_described": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "The main evaluations use direct prompting without agentic scaffolding. The ARC autonomous replication tests (System Card Section 2.9) use a scaffold described as a 'read-execute-print loop' but this is a secondary evaluation, not the main methodology."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Appendix A describes exam sourcing (publicly available materials, purchased study guides), contamination checking methodology, prompt formatting (separate for MCQ and free-response), image handling (text tags for non-vision models, embedded images for multimodal), and scoring methodology with specific references."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 5 is titled 'Limitations' and provides substantive discussion of hallucinations, knowledge cutoff, inability to learn from experience, reasoning errors, overconfidence, bias, and calibration degradation. The System Card (Appendix H, 60+ pages) provides extensive additional limitation analysis."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 5 discusses specific threats: hallucination ('can suffer from hallucinations'), temporal knowledge cutoff ('September 2021'), post-training calibration degradation (Figure 8, ECE increases from 0.007 to 0.074), reasoning errors on specific problem types, and security vulnerabilities in generated code. The System Card discusses jailbreak vulnerabilities."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 2 explicitly states: 'this report contains no further details about the architecture... hardware, training compute, dataset construction, training method.' Section 5 states GPT-4 'does not learn from its experience' and 'has a limited context window.' The System Card notes that custom fine-tuning and image capabilities are 'explicitly out of scope.'"
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No raw evaluation data is released. Individual model responses to exam questions, benchmark examples, or safety evaluations are not available. Training data is proprietary. Only aggregate results are reported."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Appendix A details exam sourcing ('most recent publicly-available official past exams, or practice exams in published third-party 2022-2023 study material'), contamination checking procedures, scoring methodologies per exam type, and model snapshot dates. Section 4 describes the preference evaluation data collection."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "Red teamer recruitment is described: 'over 50 experts from domains such as long-term AI alignment risks, cybersecurity, biorisk, and international security' (Section 6). Preference evaluation labelers were instructed per footnote 7. Exam graders described as 'qualified third-party contractors with relevant work experience' (Appendix A.3). The System Card notes selection bias in red teamer demographics."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The evaluation pipeline is documented: exam sourcing → contamination checking (50-character substring matching, Appendix C) → prompt formatting (MCQ vs free-response) → generation (with specified temperatures) → scoring (using published methodologies and rubrics). The safety evaluation pipeline (RBRM classifiers, Figure 6-9) is also described."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The paper acknowledges Microsoft's partnership: 'We thank Microsoft for their partnership, especially Microsoft Azure for supporting model training with infrastructure design and management.' This constitutes disclosure of a major funding/resource relationship."
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "The paper is clearly attributed to 'OpenAI' with extensive authorship credit listings. The company developing and commercially deploying GPT-4 is the same entity conducting the evaluation."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "OpenAI has direct financial interest in GPT-4 performing well — the model is their commercial product. Microsoft, acknowledged as a major partner, is a major investor in OpenAI. Neither entity is independent of the outcome."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests or financial interests statement is included. The authors are employees of OpenAI, which commercially deploys GPT-4. No disclosure of equity, patents, or other financial interests related to the findings."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "Section 5: 'GPT-4 generally lacks knowledge of events that have occurred after the vast majority of its pre-training data cuts off in September 2021.' Footnote 10 adds: 'The pre-training and post-training data contain a small amount of more recent data.'"
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": true,
    231         "answer": true,
    232         "justification": "Appendix C describes a detailed contamination checking methodology using 50-character substring matching. Tables 9-11 report per-exam and per-benchmark contamination rates. BIG-bench was 'inadvertently mixed into the training set' and excluded (footnote 5). Results with contaminated questions removed are reported alongside full results."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": true,
    236         "answer": true,
    237         "justification": "Tables 9-10 provide per-exam contamination rates ranging from 0% to 100%. Table 11 covers academic benchmarks. The paper reports both contaminated and non-contaminated scores, and concludes 'contamination overall has very little effect on the reported results.' Appendix E explicitly addresses GSM-8K partial inclusion in training."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "The paper does not conduct a human subjects study. Human labelers and red teamers are evaluators of model outputs, not study participants."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human subjects study is conducted. Labelers and red teamers serve as evaluators, not participants in a research study on humans."
    250       },
    251       "demographics_reported": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human subjects study. The System Card notes red teamers have 'ties to English-speaking, Western countries' and have 'specific educational and professional backgrounds,' but these are evaluators, not study participants."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human subjects study is conducted."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human subjects experiment. Preference evaluation does randomize response order (footnote 7) but this is an evaluation methodology detail, not a human subjects study design."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human subjects study. The preference evaluation uses blinding ('labelers were not told which response was generated by which model,' footnote 7) but this is part of model evaluation, not a study of human subjects."
    270       },
    271       "attrition_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human subjects study is conducted."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": false,
    281         "justification": "No inference costs, API costs, token consumption, or latency numbers are reported for any evaluation. The paper does not quantify the cost of running the exam or benchmark evaluations."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "Section 2 explicitly withholds this: 'this report contains no further details about the architecture (including model size), hardware, training compute.' Figure 1's x-axis is normalized compute (GPT-4 = 1) with no absolute values."
    287       }
    288     },
    289     "experimental_rigor": {
    290       "seed_sensitivity_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No seed sensitivity analysis is reported. Exam evaluations appear to be single-run. Codeforces simulations are averaged over 100 iterations but no variance across seeds is shown. Main benchmark results (MMLU, HumanEval, etc.) do not report seed sensitivity."
    294       },
    295       "number_of_runs_stated": {
    296         "applies": true,
    297         "answer": true,
    298         "justification": "Appendix A.3 states free-response questions were run 'only a single time.' Appendix A.6 states Codeforces used '10 attempts per problem' with '100 times' simulation per contest. Appendix A.2 states holdout exams were run 'once for a final score.' While not always explicit for every benchmark, run counts are stated for the major evaluations."
    299       },
    300       "hyperparameter_search_budget": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "Section 4 states 'The evaluation setup was designed based on performance on a validation set of exams' but does not report how many configurations were tried, what search method was used, or what compute was spent on tuning the evaluation methodology."
    304       },
    305       "best_config_selection_justified": {
    306         "applies": true,
    307         "answer": true,
    308         "justification": "Appendix A.2: 'We iterated on our methodology using the nonholdout exam, and then ran each holdout exam once for a final score.' This holdout methodology for configuration selection is clearly described and avoids overfitting to the test set."
    309       },
    310       "multiple_comparison_correction": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "The paper makes dozens of comparisons across exams and benchmarks (Tables 1-2, Figures 4-9) with no correction for multiple comparisons. No Bonferroni, Holm, or other family-wise error rate corrections are applied."
    314       },
    315       "self_comparison_bias_addressed": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "OpenAI evaluates its own commercial product without acknowledging self-evaluation bias. While standard benchmarks and third-party exam graders mitigate this somewhat, the paper does not discuss the inherent bias of a company evaluating its own model, nor does it include independent third-party evaluation of the main results."
    319       },
    320       "compute_budget_vs_performance": {
    321         "applies": true,
    322         "answer": true,
    323         "justification": "Figures 1 and 2 are core contributions showing performance as a function of normalized compute budget. The scaling law analysis (Section 3) explicitly plots loss and capability vs compute, demonstrating predictable scaling relationships."
    324       },
    325       "benchmark_construct_validity": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The paper uses professional exams and academic benchmarks without questioning whether they measure the claimed capabilities. No discussion of whether bar exam scores reflect legal reasoning ability, whether MMLU scores measure understanding vs pattern matching, or whether these proxies have construct validity for 'human-level performance.'"
    329       },
    330       "scaffold_confound_addressed": {
    331         "applies": false,
    332         "answer": false,
    333         "justification": "The main evaluations use direct prompting without agentic scaffolding. Model comparisons use consistent prompting methodology across GPT-4 and GPT-3.5. No scaffold confound is present in the primary results."
    334       }
    335     },
    336     "data_leakage": {
    337       "temporal_leakage_addressed": {
    338         "applies": true,
    339         "answer": true,
    340         "justification": "The paper states the September 2021 training cutoff and systematically checks whether exam/benchmark questions appeared in training data. Appendices C-D provide detailed contamination analysis, and results are reported both with and without contaminated questions."
    341       },
    342       "feature_leakage_addressed": {
    343         "applies": true,
    344         "answer": false,
    345         "justification": "The paper does not discuss whether the evaluation format itself leaks information. No analysis of whether few-shot prompting examples, answer choice formatting, or exam structure provides hints not available in real deployment. The multiple-choice format ('present all answers ABCD') could differ from real-world usage."
    346       },
    347       "non_independence_addressed": {
    348         "applies": true,
    349         "answer": true,
    350         "justification": "The contamination analysis (Appendix C) uses substring matching to check for overlap between evaluation and training data. The paper identifies and excludes BIG-bench due to inadvertent training set contamination. Per-exam contamination rates are reported with decontaminated results."
    351       },
    352       "leakage_detection_method": {
    353         "applies": true,
    354         "answer": true,
    355         "justification": "Appendix C describes a concrete method: 'We measure cross-contamination between our evaluation dataset and the pre-training data using substring match... we randomly select three substrings of 50 characters.' This yields per-example contamination labels. Limitations of the method (false positives and negatives) are acknowledged."
    356       }
    357     }
    358   },
    359   "claims": [
    360     {
    361       "claim": "GPT-4 exhibits human-level performance on various professional and academic benchmarks, including passing a simulated bar exam with a score around the top 10% of test takers.",
    362       "evidence": "Table 1 shows 298/400 (~90th percentile) on the Uniform Bar Exam. GPT-4 achieves top scores on many AP exams, SAT, GRE, and professional exams. Figure 4 visualizes performance across exams.",
    363       "supported": "strong"
    364     },
    365     {
    366       "claim": "GPT-4's final loss can be accurately predicted from models trained with 1,000x to 10,000x less compute using a scaling law.",
    367       "evidence": "Figure 1 shows the predicted vs observed loss on an internal codebase, with the power law fit from smaller models accurately predicting GPT-4's final loss. Section 3.1 describes the methodology.",
    368       "supported": "strong"
    369     },
    370     {
    371       "claim": "GPT-4 outperforms existing language models on all benchmarks tested, and beats SOTA with benchmark-specific training on all datasets except DROP.",
    372       "evidence": "Table 2 shows GPT-4 achieving 86.4% on MMLU (vs 70.7% LM SOTA), 95.3% on HellaSwag (vs 84.2%), 96.3% on ARC (vs 85.2%), 67.0% on HumanEval (vs 26.2%), and 92.0% on GSM-8K (vs 58.8%). DROP is the exception (80.9 vs 88.4 SOTA).",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "RLHF post-training does not substantially alter base model capability on exam benchmarks.",
    377       "evidence": "Appendix B, Table 8: averaged across all exams, the base model achieves 73.7% while the RLHF model achieves 74.0%. Individual exam comparisons show variation in both directions.",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "GPT-4 significantly reduces hallucinations relative to previous GPT-3.5 models, scoring 19 percentage points higher on internal factuality evaluations.",
    382       "evidence": "Figure 6 shows GPT-4 outperforming ChatGPT versions across nine factuality categories. The evaluation is internal and adversarially designed.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Safety mitigations have decreased the model's tendency to respond to requests for disallowed content by 82% compared to GPT-3.5.",
    387       "evidence": "Figure 9 shows incorrect behavior rates on disallowed and sensitive prompts for text-davinci-003, gpt-3.5-turbo, and gpt-4. Section 6 reports the 82% figure and 29% improvement on sensitive requests.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "GPT-4 surpasses the English-language performance of existing language models on MMLU in 24 of 26 languages tested.",
    392       "evidence": "Figure 5 shows GPT-4's accuracy across 26 languages on MMLU compared to Chinchilla (67.0%), PaLM (69.3%), and GPT-3.5 (70.1%) English performance. GPT-4 exceeds these in 24 languages.",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "Post-training (RLHF) hurts model calibration significantly.",
    397       "evidence": "Figure 8 shows the pre-trained model has ECE of 0.007 (near-perfect calibration) while the post-trained model has ECE of 0.074 on an MMLU subset.",
    398       "supported": "strong"
    399     }
    400   ],
    401   "methodology_tags": ["benchmark-eval"],
    402   "key_findings": "GPT-4 achieves human-level or better performance on most professional and academic exams tested, including top 10% on the bar exam. Scaling laws allow accurate prediction of GPT-4's loss and capabilities from models trained with 1,000x-10,000x less compute. RLHF post-training minimally affects base model exam capability (73.7% vs 74.0%) but, together with the other safety mitigations, significantly improves safety metrics (82% reduction in disallowed content responses relative to GPT-3.5) while degrading calibration (ECE from 0.007 to 0.074). The paper withholds architecture, model size, training data, and compute details, citing competitive and safety concerns.",
    403   "red_flags": [
    404     {
    405       "flag": "Company evaluating its own product",
    406       "detail": "OpenAI evaluates GPT-4, its commercial product, without independent third-party verification of the main results. The company has direct financial interest in GPT-4 appearing capable. While standard benchmarks provide some objectivity, internal evaluations (factuality, safety) have no external validation."
    407     },
    408     {
    409       "flag": "Architecture and training details completely withheld",
    410       "detail": "Section 2 explicitly states: 'this report contains no further details about the architecture (including model size), hardware, training compute, dataset construction, training method, or similar.' This makes independent verification or reproduction impossible."
    411     },
    412     {
    413       "flag": "No error bars or variance on main results",
    414       "detail": "Tables 1 and 2 report only point estimates across dozens of exams and benchmarks. No confidence intervals, error bars, or variance across runs. Free-response questions were run only once (Appendix A.3). This makes it impossible to assess result reliability."
    415     },
    416     {
    417       "flag": "Contamination methodology limitations acknowledged but not addressed",
    418       "detail": "Appendix C acknowledges the 50-character substring method has false negatives (paraphrased content would be missed) and false positives. RLHF post-training data was 'not checked explicitly' for contamination. GSM-8K was acknowledged to be partially in training data (Appendix E)."
    419     },
    420     {
    421       "flag": "Safety improvement claims conflate multiple interventions",
    422       "detail": "The 82% reduction in disallowed content is attributed to the combined post-training pipeline (RLHF + RBRMs + data filtering + SFT), but no ablation isolates individual intervention effects. The paper does not report which safety improvements come from which intervention."
    423     },
    424     {
    425       "flag": "Selective presentation acknowledged in System Card",
    426       "detail": "The System Card states: 'the examples included throughout this system card are not zero-shot and are cherry picked from our evaluation efforts to illustrate specific types of safety concerns or harms.' This applies to both positive and negative examples."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "Language models are few-shot learners",
    432       "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
    433       "year": 2020,
    434       "relevance": "GPT-3 paper establishing few-shot prompting capabilities of large language models; foundational to GPT-4's lineage."
    435     },
    436     {
    437       "title": "Evaluating large language models trained on code",
    438       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    439       "year": 2021,
    440       "relevance": "Introduces HumanEval benchmark for code generation used in GPT-4's capability evaluation and scaling prediction."
    441     },
    442     {
    443       "title": "Training language models to follow instructions with human feedback",
    444       "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"],
    445       "year": 2022,
    446       "arxiv_id": "2203.02155",
    447       "relevance": "InstructGPT paper establishing the RLHF methodology used for GPT-4's post-training alignment."
    448     },
    449     {
    450       "title": "PaLM: Scaling language modeling with pathways",
    451       "authors": ["Aakanksha Chowdhery", "Sharan Narang", "Jacob Devlin"],
    452       "year": 2022,
    453       "arxiv_id": "2204.02311",
    454       "relevance": "Major competing LLM used as baseline in GPT-4's benchmark evaluations."
    455     },
    456     {
    457       "title": "Training compute-optimal large language models",
    458       "authors": ["Jordan Hoffmann", "Sebastian Borgeaud", "Arthur Mensch"],
    459       "year": 2022,
    460       "arxiv_id": "2203.15556",
    461       "relevance": "Chinchilla scaling laws paper relevant to GPT-4's predictable scaling approach and used as baseline."
    462     },
    463     {
    464       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    465       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    466       "year": 2022,
    467       "relevance": "Chain-of-thought prompting methodology used in GPT-4's GSM-8K evaluation."
    468     },
    469     {
    470       "title": "TruthfulQA: Measuring how models mimic human falsehoods",
    471       "authors": ["Stephanie Lin", "Jacob Hilton", "Owain Evans"],
    472       "year": 2022,
    473       "relevance": "Benchmark used to evaluate GPT-4's factuality improvements over GPT-3.5 (Figure 7)."
    474     },
    475     {
    476       "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback",
    477       "authors": ["Yuntao Bai", "Andy Jones", "Kamal Ndousse"],
    478       "year": 2022,
    479       "arxiv_id": "2204.05862",
    480       "relevance": "Anthropic's RLHF work on AI safety alignment, directly compared to GPT-4's approach on TruthfulQA."
    481     },
    482     {
    483       "title": "Red teaming language models with language models",
    484       "authors": ["Ethan Perez", "Saffron Huang", "H. Francis Song"],
    485       "year": 2022,
    486       "arxiv_id": "2202.03286",
    487       "relevance": "Red teaming methodology for LLM safety evaluation, related to GPT-4's expert adversarial testing approach."
    488     },
    489     {
    490       "title": "Scaling laws for neural language models",
    491       "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"],
    492       "year": 2020,
    493       "arxiv_id": "2001.08361",
    494       "relevance": "Foundational scaling laws work that GPT-4's predictable scaling capability prediction builds upon."
    495     },
    496     {
    497       "title": "Measuring massive multitask language understanding",
    498       "authors": ["Dan Hendrycks", "Collin Burns", "Steven Basart"],
    499       "year": 2021,
    500       "relevance": "MMLU benchmark used extensively in GPT-4's capability evaluation including multilingual testing (Figure 5)."
    501     },
    502     {
    503       "title": "LLaMA: Open and efficient foundation language models",
    504       "authors": ["Hugo Touvron", "Thibaut Lavril", "Gautier Izacard"],
    505       "year": 2023,
    506       "arxiv_id": "2302.13971",
    507       "relevance": "Open-source LLM baseline used in GPT-4's benchmark comparisons (Table 2, HellaSwag)."
    508     },
    509     {
    510       "title": "Ethical and social risks of harm from Language Models",
    511       "authors": ["Laura Weidinger", "John Mellor", "Maribeth Rauh"],
    512       "year": 2021,
    513       "relevance": "Framework for understanding societal risks of LLMs that informed GPT-4's System Card analysis."
    514     },
    515     {
    516       "title": "Deep reinforcement learning from human preferences",
    517       "authors": ["Paul F Christiano", "Jan Leike", "Tom Brown"],
    518       "year": 2017,
    519       "relevance": "Original RLHF algorithm paper forming the basis of GPT-4's post-training methodology."
    520     }
    521   ]
    522 }

Impressum · Datenschutz