scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28151B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "GPT-4 Technical Report",
      6     "authors": [
      7       "OpenAI",
      8       "Josh Achiam",
      9       "Steven Adler",
     10       "Sandhini Agarwal",
     11       "L. Ahmad",
     12       "Ilge Akkaya"
     13     ],
     14     "year": 2023,
     15     "venue": "arXiv",
     16     "arxiv_id": "2303.08774",
     17     "doi": null
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract claims of bar exam top 10%, human-level performance on benchmarks, and predictable scaling are all backed by Table 1, Figures 1-2, and Table 2 in the paper.",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "RLHF improvement claims are supported by ablation in Appendix B (base vs. RLHF model comparisons); scaling predictions are validated by fitting power laws to smaller models and confirming on GPT-4.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The abstract claims 'human-level performance' broadly based on exam simulations; while the paper acknowledges 'less capable than humans in many real-world scenarios,' this caveat is buried relative to the headline framing throughout.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "Architecture, training data, and compute details are all withheld, so alternative explanations for capability improvements (scale vs. data quality vs. architectural choices vs. RLHF quality) are never considered.",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "The paper equates benchmark and exam performance with 'human-level capability' without adequately distinguishing standardized test scores from real-world competence.",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 5 is a dedicated limitations section discussing hallucinations, knowledge cutoffs, reasoning errors, biases, and calibration reduction after RLHF post-training.",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Specific threats discussed include calibration degradation after RLHF (ECE from 0.007 to 0.074, Figure 8), per-exam contamination rates (Appendices C/D), and the model's tendency to double-down on incorrect information.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Section 2 explicitly states the report omits architecture, model size, hardware, training compute, and dataset construction; scope is bounded to capabilities, limitations, and safety properties.",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No formal funding disclosure is present; Microsoft compute support is acknowledged but there is no funding statement disclosing OpenAI's self-funding or commercial interests in the product.",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "All authors are identified as OpenAI employees; the paper is attributed to 'OpenAI' with extensive individual contributor roles listed in the acknowledgments.",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "OpenAI is evaluating its own flagship commercial product; the organization has direct financial interest in GPT-4 being perceived as capable and safe.",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests statement or financial interests declaration appears anywhere in the paper; OpenAI's commercial stake in GPT-4 is not formally disclosed.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "RLHF is explained, 'hallucination' is defined with a footnote acknowledging anthropomorphization concerns, and 'human-level performance' is operationalized via specific exam percentiles.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper clearly states it reports GPT-4's capabilities, limitations, and safety properties including predictable scaling methodology and RLHF safety mitigations.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper explicitly compares to GPT-3.5, Chinchilla, PaLM, and LLaMA; Table 2 positions results against both LLM SOTA and task-specific SOTA systems across multiple benchmarks.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": false,
    125           "justification": "Model weights, training code, and architecture are not released; OpenAI Evals is open-sourced but without the model itself, reproduction of the main results is impossible.",
    126           "source": "haiku"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": false,
    131           "justification": "Training data and internal safety/factuality evaluation datasets are not released; only standard public benchmarks (MMLU, HumanEval) used in the paper are publicly available.",
    132           "source": "haiku"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "Architecture, hardware, and training infrastructure are explicitly withheld; no environment specifications (requirements, Docker, etc.) are provided for any component.",
    138           "source": "haiku"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No reproduction instructions are provided; the model and training details are not disclosed, making replication of any main result impossible.",
    144           "source": "haiku"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "No confidence intervals or error bars are reported for main benchmark results; scaling law predictions show fit lines but with no formal uncertainty quantification.",
    152           "source": "haiku"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "No statistical significance tests are reported for any comparative claims between GPT-4 and GPT-3.5 or other baselines.",
    158           "source": "haiku"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Percentage improvements with baseline context are reported: 82% reduction in disallowed content vs. GPT-3.5, 19pp higher adversarial factuality score, 70.2% human preference rate.",
    164           "source": "haiku"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "Sample sizes are reported (e.g., 5,214 prompts for preference evaluation) but no power analysis or adequacy justification is provided.",
    170           "source": "haiku"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "Variance across runs is not reported for benchmark results; Codeforces averaging 100 simulations is mentioned but without standard deviation.",
    176           "source": "haiku"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "GPT-3.5 is the primary baseline throughout; Table 2 compares to both LLM SOTA evaluated few-shot and task-specific SOTA systems.",
    184           "source": "haiku"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Baselines include GPT-3.5, PaLM, Chinchilla, and LLaMA — all contemporary models from 2022-2023 published shortly before GPT-4.",
    190           "source": "haiku"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Appendix B compares GPT-4 base model vs. RLHF model across all exams, finding minimal average difference (73.7% base vs. 74.0% RLHF) with bidirectional per-exam variation.",
    196           "source": "haiku"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Evaluation uses dozens of exam types, NLP benchmarks (MMLU, HellaSwag, ARC, HumanEval, GSM-8K), safety metrics (toxicity, factuality, refusal rates), and human preference rates.",
    202           "source": "haiku"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Human labelers evaluate response preferences for 5,214 prompts; expert third-party contractors grade free-response exam answers; 50+ domain experts conduct red teaming.",
    208           "source": "haiku"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "Section 4 states 'We ran a variant with [contaminated] questions removed and report the lower score'; Appendix A describes held-out vs. non-holdout exam methodology.",
    214           "source": "haiku"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Results are broken down by exam type (Table 1), by language (Figure 5), by question difficulty bucket (Figure 2), and by safety category (Figures 6 and 9).",
    220           "source": "haiku"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Section 5 discusses hallucinations, reasoning errors, and biases; the System Card includes extensive failure case examples for harmful content, bias, and jailbreaks (Figures 1-4, 10).",
    226           "source": "haiku"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "Codeforces performance is below the 5th percentile; AMC 10/12 results are mediocre; Appendix B shows RLHF sometimes reduces performance on individual exams (e.g., SAT Math 91.4% → 86.2%).",
    232           "source": "haiku"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": true,
    239           "justification": "Specific model snapshots are identified: March 1, 2023 (MCQ evaluations), February 23, 2023 (free-response), and December 16, 2022 (USABO); GPT-3.5 results use a standard ChatGPT snapshot.",
    240           "source": "haiku"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Appendix A.8 provides complete few-shot prompt templates with full chain-of-thought examples for both multiple-choice and free-response evaluation formats.",
    246           "source": "haiku"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "Temperature settings are reported: 0.3 for multiple-choice extraction, 0.6 for free-response generation, 0.0 for some follow-up answer sampling.",
    252           "source": "haiku"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "The main evaluations involve direct prompting of a black-box model without agentic scaffolding; the tool-use illustration in the system card is not a systematic evaluation component.",
    258           "source": "haiku"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Contamination-checking methodology is documented in Appendices C and D; exam preprocessing (image transcription, scoring methodology, percentile calculation) is described in Appendix A.",
    264           "source": "haiku"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": false,
    271           "justification": "Internal factuality and safety evaluation datasets, human preference data, and red teaming results are not publicly available for independent verification.",
    272           "source": "haiku"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Human preference data collection is described in footnotes 7 and 30: prompts from ChatGPT/API users, filtering criteria for PII and disallowed content, labeler instructions, and randomization.",
    278           "source": "haiku"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": true,
    282           "answer": true,
    283           "justification": "Red teamer selection criteria are described (domain expertise in alignment, cybersecurity, biorisk, etc.); labeler management practices reference industry standards (footnote 28).",
    284           "source": "haiku"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": false,
    289           "justification": "The evaluation pipeline for preference data is described, but the full pipeline from training data collection through final model is not documented due to the deliberately withheld architecture and training details.",
    290           "source": "haiku"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": true,
    297           "justification": "Section 5 states 'GPT-4 generally lacks knowledge of events that have occurred after the vast majority of its pre-training data cuts off in September 2021.'",
    298           "source": "haiku"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": true,
    303           "justification": "Appendices C and D provide detailed contamination analysis using substring matching; contamination rates and their effect on scores are reported per exam and per benchmark.",
    304           "source": "haiku"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": true,
    309           "justification": "Tables 9, 10, and 11 report per-exam contamination rates; results on non-contaminated subsets are shown, generally confirming contamination is not a major confounder.",
    310           "source": "haiku"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": true,
    316           "answer": false,
    317           "justification": "No pre-registration is mentioned for the human preference evaluations, red teaming studies, or expert adversarial testing.",
    318           "source": "haiku"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": true,
    322           "answer": false,
    323           "justification": "No IRB or ethics approval is mentioned for human labeler work or the human participant interactions in the ARC autonomous replication tests.",
    324           "source": "haiku"
    325         },
    326         "demographics_reported": {
    327           "applies": true,
    328           "answer": false,
    329           "justification": "Demographics of human labelers are not reported; the system card notes red teamers are primarily from English-speaking Western countries but provides no detailed demographic breakdown.",
    330           "source": "haiku"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": true,
    334           "answer": false,
    335           "justification": "Criteria for filtering prompts (no PII, no disallowed content) are described but participant inclusion/exclusion criteria for human labelers are not stated.",
    336           "source": "haiku"
    337         },
    338         "randomization_described": {
    339           "applies": true,
    340           "answer": true,
    341           "justification": "Footnote 30 states 'the order in which the responses were presented was randomised' for the human preference evaluation.",
    342           "source": "haiku"
    343         },
    344         "blinding_described": {
    345           "applies": true,
    346           "answer": true,
    347           "justification": "Footnote 30 states 'The labelers were not told which response was generated by which model' — labelers were blind to model identity throughout.",
    348           "source": "haiku"
    349         },
    350         "attrition_reported": {
    351           "applies": true,
    352           "answer": false,
    353           "justification": "No attrition or dropout information is reported for human labelers, expert red teamers, or any other human participant group.",
    354           "source": "haiku"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "Inference cost and latency are not reported anywhere in the technical report or system card.",
    362           "source": "haiku"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "Training compute is explicitly withheld: 'this report contains no further details about the architecture (including model size), hardware, training compute, dataset construction, training method, or similar.'",
    368           "source": "haiku"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "GPT-4 achieves a score in the top 10% of test takers on a simulated Uniform Bar Exam",
    376       "evidence": "Table 1 shows 298/400 (~90th percentile); Table 9 contamination analysis confirms the result holds on the non-contaminated subset",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Smaller models can accurately predict GPT-4's performance via power law scaling using 1000-10000x less compute",
    381       "evidence": "Figures 1 and 2 show fitted power laws accurately predict GPT-4 loss and HumanEval pass rate; predictions registered before training completed",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "RLHF post-training reduces disallowed content generation by 82% compared to GPT-3.5",
    386       "evidence": "Figure 9 shows incorrect behavior rates; the 82% figure is stated in Section 6 but relies on internal evaluations with unreleased data",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "GPT-4 outperforms all prior LLMs on MMLU (86.4% vs. 70.7% previous best few-shot LM)",
    391       "evidence": "Table 2 shows GPT-4 at 86.4% vs. GPT-3.5 70.0% and best external LM at 70.7% on 5-shot evaluation",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "GPT-4 responses preferred over GPT-3.5 responses on 70.2% of user prompts",
    396       "evidence": "Section 4 describes evaluation of 5,214 prompts with blinded human labelers; randomization and filtering procedure described in footnote 30",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "RLHF post-training does not substantially alter GPT-4 base model capability on average",
    401       "evidence": "Appendix B shows 73.7% (base) vs. 74.0% (RLHF) average across exam benchmarks, though individual exams show bidirectional changes up to ±10pp",
    402       "supported": "moderate"
    403     },
    404     {
    405       "claim": "GPT-4 produces toxic content only 0.73% of the time vs. GPT-3.5's 6.48% on RealToxicityPrompts",
    406       "evidence": "Section 6 cites this external benchmark result; RealToxicityPrompts is a public dataset enabling independent verification unlike internal safety metrics",
    407       "supported": "strong"
    408     }
    409   ],
    410   "methodology_tags": [
    411     "benchmark-eval",
    412     "observational"
    413   ],
    414   "key_findings": "GPT-4 demonstrates substantially improved capabilities over GPT-3.5 across professional and academic benchmarks, achieving top 10% performance on bar exam simulations and outperforming prior LLMs on MMLU and other NLP benchmarks. The paper introduces a predictable scaling methodology whereby power law fits to models using 1000-10000x less compute accurately forecast GPT-4 performance, enabling pre-training predictions. RLHF-based safety interventions substantially reduce harmful content generation (82% reduction vs. GPT-3.5), though the paper explicitly withholds architectural details, training data, and compute information, making independent reproduction impossible and limiting scientific transparency.",
    415   "red_flags": [
    416     {
    417       "flag": "Zero reproducibility",
    418       "detail": "Architecture, model size, training compute, and dataset construction are explicitly withheld, making reproduction of any core result impossible — this is arguably the defining limitation of the paper."
    419     },
    420     {
    421       "flag": "Unverifiable internal safety metrics",
    422       "detail": "Key safety improvement claims (82% reduction in disallowed content, 19pp factuality improvement) are based on internal evaluations with unreleased prompts and scoring rubrics."
    423     },
    424     {
    425       "flag": "Self-evaluation conflict of interest",
    426       "detail": "OpenAI evaluates its own commercial product using red teamers hired and managed by OpenAI; no independent third-party verification of safety or capability claims is included."
    427     },
    428     {
    429       "flag": "RLHF degrades calibration",
    430       "detail": "Figure 8 shows ECE increases from 0.007 (base) to 0.074 (RLHF model), meaning post-training makes the model substantially less well calibrated; this is acknowledged but underemphasized relative to safety improvements."
    431     },
    432     {
    433       "flag": "Benchmark scores equated with capability",
    434       "detail": "Exam performance is framed as 'human-level performance' throughout without adequate discussion of the gap between standardized test scores and real-world task competence."
    435     }
    436   ],
    437   "cited_papers": [
    438     {
    439       "title": "Language Models are Few-Shot Learners",
    440       "relevance": "GPT-3 paper; primary predecessor and baseline model for capability comparisons throughout"
    441     },
    442     {
    443       "title": "Training language models to follow instructions with human feedback",
    444       "relevance": "InstructGPT/RLHF methodology foundational to GPT-4 alignment and safety training pipeline"
    445     },
    446     {
    447       "title": "Scaling laws for neural language models",
    448       "relevance": "Power law scaling methodology underpinning GPT-4 capability prediction framework"
    449     },
    450     {
    451       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    452       "relevance": "Chain-of-thought prompting used for GSM-8K evaluation; key technique enabling GPT-4 reasoning results"
    453     },
    454     {
    455       "title": "Measuring massive multitask language understanding",
    456       "relevance": "MMLU benchmark used as primary capability evaluation including multilingual extension"
    457     },
    458     {
    459       "title": "Evaluating large language models trained on code",
    460       "relevance": "HumanEval benchmark used for coding capability evaluation and predictive scaling demonstration"
    461     },
    462     {
    463       "title": "Training compute-optimal large language models",
    464       "relevance": "Chinchilla paper; baseline model for benchmark comparisons and scaling law framework"
    465     },
    466     {
    467       "title": "TruthfulQA: Measuring how models mimic human falsehoods",
    468       "relevance": "Benchmark used to evaluate GPT-4 factuality improvements and calibration after RLHF"
    469     }
    470   ],
    471   "engagement_factors": {
    472     "practical_relevance": {
    473       "score": 3,
    474       "justification": "GPT-4 is one of the most widely deployed AI models globally; directly accessible to practitioners via API at time of publication."
    475     },
    476     "surprise_contrarian": {
    477       "score": 1,
    478       "justification": "Results largely confirm expected improvements over GPT-3.5; the predictable scaling result is notable but built on prior scaling law work rather than contradicting conventional wisdom."
    479     },
    480     "fear_safety": {
    481       "score": 2,
    482       "justification": "System card raises specific concerns about CBRN proliferation risks, cybersecurity threat surface, disinformation capabilities, and preliminary evaluation of power-seeking emergent behaviors."
    483     },
    484     "drama_conflict": {
    485       "score": 2,
    486       "justification": "Significant community controversy around deliberate architectural opacity; the explicit decision not to disclose model details was criticized as departing from scientific norms."
    487     },
    488     "demo_ability": {
    489       "score": 3,
    490       "justification": "GPT-4 was immediately available via ChatGPT Plus and API at paper release; millions of users could directly verify capability claims."
    491     },
    492     "brand_recognition": {
    493       "score": 3,
    494       "justification": "OpenAI/GPT-4 is among the most widely recognized AI products globally with massive media coverage at launch."
    495     }
    496   },
    497   "hn_data": {
    498     "threads": [
    499       {
    500         "hn_id": "35804556",
    501         "title": "SparseGPT: Language Models Can Be Accurately Pruned in One-Shot",
    502         "points": 211,
    503         "comments": 62,
    504         "url": "https://news.ycombinator.com/item?id=35804556",
    505         "created_at": "2023-05-03T16:44:19Z"
    506       },
    507       {
    508         "hn_id": "37285396",
    509         "title": "PMET: Precise Model Editing in a Transformer",
    510         "points": 119,
    511         "comments": 13,
    512         "url": "https://news.ycombinator.com/item?id=37285396",
    513         "created_at": "2023-08-27T18:35:14Z"
    514       },
    515       {
    516         "hn_id": "37712713",
    517         "title": "Fake News Detectors Are Biased Against Texts Generated by Large Language Models",
    518         "points": 17,
    519         "comments": 13,
    520         "url": "https://news.ycombinator.com/item?id=37712713",
    521         "created_at": "2023-09-30T04:10:26Z"
    522       },
    523       {
    524         "hn_id": "47335095",
    525         "title": "I designed a bfloat16/FP8 alternative in a week using LLMs",
    526         "points": 3,
    527         "comments": 4,
    528         "url": "https://news.ycombinator.com/item?id=47335095",
    529         "created_at": "2026-03-11T13:08:40Z"
    530       },
    531       {
    532         "hn_id": "35431143",
    533         "title": "GPT-4 Technical Report",
    534         "points": 2,
    535         "comments": 1,
    536         "url": "https://news.ycombinator.com/item?id=35431143",
    537         "created_at": "2023-04-03T20:54:25Z"
    538       },
    539       {
    540         "hn_id": "35191967",
    541         "title": "OpenAI: GPT-4 Technical Report",
    542         "points": 2,
    543         "comments": 0,
    544         "url": "https://news.ycombinator.com/item?id=35191967",
    545         "created_at": "2023-03-17T02:07:00Z"
    546       },
    547       {
    548         "hn_id": "38646734",
    549         "title": "ClimSim: A large multi-scale dataset for hybrid physics-ML climate emulation",
    550         "points": 1,
    551         "comments": 1,
    552         "url": "https://news.ycombinator.com/item?id=38646734",
    553         "created_at": "2023-12-14T20:22:28Z"
    554       },
    555       {
    556         "hn_id": "40108707",
    557         "title": "Development of \"Cangaru\" GAI, GPT, LLM Accountable Reporting and Use Guidelines",
    558         "points": 1,
    559         "comments": 0,
    560         "url": "https://news.ycombinator.com/item?id=40108707",
    561         "created_at": "2024-04-21T19:57:36Z"
    562       },
    563       {
    564         "hn_id": "34868424",
    565         "title": "Binary Embedding-Based Retrieval at Tencent",
    566         "points": 1,
    567         "comments": 0,
    568         "url": "https://news.ycombinator.com/item?id=34868424",
    569         "created_at": "2023-02-20T14:34:05Z"
    570       }
    571     ],
    572     "top_points": 211,
    573     "total_points": 357,
    574     "total_comments": 94
    575   }
    576 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs