scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26121B)
      1 {
      2   "paper": {
      3     "title": "Prover-Verifier Games improve legibility of LLM outputs",
      4     "authors": ["Jan Hendrik Kirchner", "Yining Chen", "Harri Edwards", "Jan Leike", "Nat McAleese", "Yuri Burda"],
      5     "year": 2024,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2407.13692",
      8     "doi": "10.48550/arXiv.2407.13692"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval", "theoretical"],
     13   "key_findings": "Training LLM provers for checkability by smaller LLM verifiers using a Prover-Verifier Game (PVG) improves the legibility of solutions to humans, demonstrating a surprising transfer from model-checkability to human-checkability. Optimizing solely for correctness via RL produces solutions that become unintelligible to time-constrained humans despite maintaining high accuracy. The Signed Relative Convincingness (SRC) reward yields better training dynamics than alternatives (CGC, goodharting). Results reveal a 'legibility tax' — a tension between solution correctness and human understandability.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL, code archive, or link to released code is provided anywhere in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The authors use the public GSM dataset, augmented with 100k synthetic examples. The augmented dataset, human evaluation data, and model outputs are not released."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, dependency lists, or hardware details are provided. Models are described only as 'from the GPT4 family.'"
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No reproduction instructions are provided. The algorithmic description is detailed but insufficient for reproduction without model access and implementation details."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Figures show point estimates (mean accuracy) without confidence intervals or error bars. No uncertainty quantification on main results."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper makes comparative claims (e.g., 'human accuracy increases') based on comparing numbers across conditions without any statistical significance tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Results are reported with baseline context — e.g., human accuracy from ~0.75 at initialization to ~0.85 after round 5 (Fig 8b), model accuracy from ~0.4 to ~0.65 (Fig 8a). Percentage points of change are interpretable from the figures."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "75 human trainers and 15,983 datapoints are stated, but no justification is given for why these sample sizes are adequate. No power analysis."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No standard deviations, variance across runs, or spread measures are reported for training runs or human evaluation results."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper compares against a correctness-only RL baseline (Section 4.1) and the pretrained initialization. Alternative reward formulations (CGC, goodharting) are also compared in Section 5."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The task (legibility training) is novel, so no prior methods exist. The baselines (RL for correctness, initialization, alternative reward variants) are appropriate internal comparisons."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section 5 ablates reward variants (SRC vs CGC vs goodharting). Additional ablations on verifier size (Figs S4, S5, 5c) and initialization (Fig S9) are provided."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Model accuracy, human judge accuracy, time-to-solve, verifier robustness (steps to exploit), balanced best-of-n accuracy, GPT-4 validity scores, and ROC AUC for prover separability are all reported."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "75 human contractors evaluated solution correctness under a 45-second time limit (Appendix I). Results in Figs 4, 7b, 8."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The GSM test set is used for evaluation. Training data is partitioned into separate prover and verifier splits (Section 3.2)."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by training round (Figs 5-8), verifier size (Figs 6a, 10), reward variant (Section 5), and prover role (helpful/sneaky)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Failure cases with very small verifiers (Fig S4) and very large verifiers (Fig S5) are discussed. Appendix B shows degenerate solutions. CGC reward failure dynamics are analyzed."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "CGC reward leads to worse training dynamics (Section 5.1). Goodharting training does not improve human legibility (Section 5.2, Fig S6). Very small and very large verifiers fail (Appendix D, E)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims about accuracy-legibility tradeoff, verifier robustness increasing, helpful prover legibility increasing, and sneaky prover becoming more subtle are all supported by experimental results in Sections 4.1-4.3."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims ('checkability training improves legibility') are supported by controlled comparisons: same initialization with different training objectives (correctness-only vs. checkability), ablations across reward variants, and round-by-round progression."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The abstract claims 'legibility training against small verifiers as a practical avenue for increasing legibility of large LLMs to humans' and 'could help with alignment of superhuman models' — generalizing well beyond the tested setting of grade-school math. While limitations (Section 6.1) acknowledge single-dataset scope, the framing is broader than the evidence."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper discusses weight-sharing confounds (Section 4.2), capability gap effects (Section 4.2), separation as an alternative explanation for verifier improvement (Appendix C), and initialization effects (Appendix F)."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper defines legibility formally, acknowledges that human accuracy under time constraints is 'only one of many ways to measure human legibility' (Section 2), and discusses the gap between model-checkability and true human legibility."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Models are described only as 'from the GPT4 family' (Section 3.2) with relative compute sizes (10^-4 to 10^0). No specific model version, checkpoint, or snapshot date is provided."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "System prompts for helpful and sneaky provers are provided in Appendix G (Fig S10) and few-shot prompts in Appendix H (Fig S11)."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Section 6.1 states 'We used the default ChatGPT post-training hyperparameters such as the optimizer, learning rate, or KL regularization coefficient' without providing specific values. V0=-2 is mentioned for CGC reward."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The setup involves standard RL training of language models."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 3.2 describes dataset augmentation with 100k synthetic examples, random partitioning into equal Dπ and DV splits, and balanced sampling of correct/incorrect solutions for verifier training."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6.1 is a dedicated Limitations subsection with 7 specific limitation categories."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Limitations are specific: single dataset (GSM), conflating CoT with legible justifications, reliance on ground truth, heterogeneous stopping criteria across experiments, non-optimal hyperparameter tuning, and initialization with math-finetuned models."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 6.1 explicitly states what the paper does not show: no out-of-domain generalization, no unsupervised setup, only one dataset, only math domain, only one way to measure legibility."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw data (human evaluations, model outputs, training logs) is made available for verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Appendix I describes human data collection: 75 trainers, 45-second time limit, 15,983 datapoints, balanced assignment, up to 4 hours per trainer, instructions shown in Fig S12."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "Contractors are referred to as 'human contractors' and 'human trainers' with no description of how they were recruited, from what platform, or what qualifications they had."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Appendix I documents: sampling solutions from each checkpoint → randomizing assignment to 75 trainers with balanced conditions → collecting judgments → excluding 4 trainers at chance level → 15,983 final datapoints."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding disclosure. All authors are from OpenAI but no explicit funding statement is provided."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors are listed as affiliated with OpenAI on the first page."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "OpenAI has a direct commercial interest in demonstrating that alignment techniques work for their models. The paper evaluates OpenAI models using OpenAI infrastructure."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is provided."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The GPT-4 family model's training data cutoff is not stated. The system prompt mentions 'Knowledge cutoff: 2023-04' but this is the prompt setting, not a discussion of contamination risk."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether GSM test set problems appeared in GPT-4's pretraining data, despite GSM being published in 2021 and GPT-4 being trained on data through at least 2023."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "GSM8K was published in 2021. GPT-4 was trained on data well past that date. No contamination analysis is performed or discussed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No pre-registration mentioned for the human evaluation study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No IRB or ethics board approval is mentioned despite the study involving human participants."
    253       },
    254       "demographics_reported": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "No demographic information about the 75 human contractors is provided — no experience level, background, or other characteristics."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "No inclusion criteria for contractor selection are stated. The only exclusion criterion mentioned is post-hoc removal of 4 trainers performing at chance."
    263       },
    264       "randomization_described": {
    265         "applies": true,
    266         "answer": true,
    267         "justification": "Appendix I states: 'We randomized assignment of those samples to 75 trainers (making sure each person see balanced correct vs. incorrect samples from each checkpoint).'"
    268       },
    269       "blinding_described": {
    270         "applies": true,
    271         "answer": true,
    272         "justification": "Appendix I states trainers 'are not told whether they judged correctly' and samples were randomized across conditions, so trainers didn't know which training round produced each solution."
    273       },
    274       "attrition_reported": {
    275         "applies": true,
    276         "answer": true,
    277         "justification": "Appendix I reports starting with 75 trainers and excluding 4 whose performance was statistically indistinguishable from chance, leaving 71."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference costs, API costs, or wall-clock times are reported for any component of the system."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total compute budget is stated. Verifier sizes are described only as relative fractions of prover compute (10^-4 to 10^0), with no absolute numbers."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No seed sensitivity analysis. Results appear to be from single training runs per configuration."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of independent training runs is not stated. It appears each configuration was run once."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Section 6.1 acknowledges 'We did not tune all hyperparameters thoroughly' and describes tuning sample ratios for one verifier size, but no search budget is reported."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "Section 4.2 and Section 5 justify why SRC reward is preferred, with explicit comparisons to CGC and goodharting alternatives showing worse dynamics."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "OpenAI authors evaluate their own training method on their own models. No discussion of author-evaluation bias or independent evaluation."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Performance is shown as a function of RL steps (Figs 4a, 5) and across verifier sizes spanning 4 orders of magnitude (Fig 10, 6a), providing compute-performance relationships."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "Section 6.1 acknowledges GSM is 'relatively easy and the solutions are quite standardized' and discusses the desire to extend to 'harder math datasets and coding, as well as fuzzier domains.'"
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No agentic scaffolding is used in this work."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether GPT-4's pretraining data includes GSM solutions or similar grade-school math content."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of feature leakage in the evaluation setup."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": true,
    353         "justification": "Section 3.2 describes partitioning training data into disjoint Dπ and DV subsets so 'the verifier and the prover are never optimized on the same prompt.'"
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection method (canary strings, membership inference, etc.) is applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Direct RL optimization for correctness produces solutions that are difficult for time-constrained humans to evaluate.",
    365       "evidence": "Section 4.1, Fig 4b: Human accuracy drops from ~0.82 (initialization) to ~0.70 (7k RL steps). Fig 4c shows evaluation time increases. Fig S1 shows degenerate solutions.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Checkability training achieves decent performance while retaining legibility to humans.",
    370       "evidence": "Section 4.3.3, Fig 8: Human accuracy increases from ~0.75 to ~0.87 across rounds while model accuracy increases from ~0.42 to ~0.65. Fig 1 shows the performance-legibility tradeoff.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Verifiers become increasingly robust to adversarial attacks over training rounds.",
    375       "evidence": "Section 4.3.1, Fig 6a: Steps to exploit increase across rounds. Fig 6b: Balanced best-of-n accuracy improves. Cross-attack generalization (SRC, CGC, goodharting) confirmed.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "The sneaky prover produces increasingly subtle flaws that are harder for humans to detect.",
    380       "evidence": "Section 4.3.2, Fig 7a: GPT-4 average validity score increases. Fig 7b: Human accuracy decreases on sneaky solutions. However, Fig 7b is from a pilot study with acknowledged design flaws.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Legibility to smaller LLM verifiers transfers to legibility to humans.",
    385       "evidence": "Section 4.3.3: Both human accuracy and model verifier robustness improve across rounds. This is shown for the helpful prover (Fig 8) and sneaky prover (Fig 7b, with caveats).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "The existence of a verifier-leading Stackelberg equilibrium is necessary and sufficient for completeness and soundness.",
    390       "evidence": "Theorem 1 with full proof in Appendix A. The proof assumes H is the set of all deterministic mappings, which does not hold for neural networks in practice.",
    391       "supported": "strong"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "Company evaluating its own product",
    397       "detail": "All authors are from OpenAI evaluating an alignment technique on OpenAI's GPT-4 family models. No independent verification or external evaluation."
    398     },
    399     {
    400       "flag": "Pilot study with acknowledged design flaws",
    401       "detail": "Some human evaluation results (Fig 7b, S5c, S6) come from a pilot study where solutions were shown in round order across a month, introducing learning/fatigue effects. The authors acknowledge this but still present the results."
    402     },
    403     {
    404       "flag": "No model versions or compute details",
    405       "detail": "Models are described only as 'GPT-4 family' with relative compute sizes. Absolute model sizes, versions, and total compute budget are withheld, making reproduction impossible."
    406     },
    407     {
    408       "flag": "Single dataset evaluation",
    409       "detail": "All results are on GSM8K (grade-school math), a relatively easy benchmark. Claims about alignment of superhuman models extrapolate far beyond this setting."
    410     },
    411     {
    412       "flag": "No uncertainty quantification",
    413       "detail": "No error bars, confidence intervals, statistical tests, or multi-seed results are reported despite being a training-dynamics study where stochasticity could significantly affect conclusions."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Learning to give checkable answers with prover-verifier games",
    419       "authors": ["C. Anil", "G. Zhang", "Y. Wu", "R. Grosse"],
    420       "year": 2021,
    421       "arxiv_id": "2108.12099",
    422       "relevance": "Original prover-verifier game framework that this paper extends to LLMs at scale."
    423     },
    424     {
    425       "title": "Weak-to-strong generalization: Eliciting strong capabilities with weak supervision",
    426       "authors": ["C. Burns", "P. Izmailov", "J. H. Kirchner"],
    427       "year": 2023,
    428       "arxiv_id": "2312.09390",
    429       "relevance": "Closely related alignment paradigm of using weak models to supervise strong ones."
    430     },
    431     {
    432       "title": "Let's Verify Step by Step",
    433       "authors": ["H. Lightman", "V. Kosaraju", "Y. Burda"],
    434       "year": 2024,
    435       "relevance": "Process reward models for math reasoning — alternative approach to improving reasoning verification."
    436     },
    437     {
    438       "title": "Self-critiquing models for assisting human evaluators",
    439       "authors": ["W. Saunders", "C. Yeh", "J. Wu"],
    440       "year": 2022,
    441       "arxiv_id": "2206.05802",
    442       "relevance": "Scalable oversight via LLM self-critique, a complementary approach to prover-verifier games."
    443     },
    444     {
    445       "title": "AI safety via debate",
    446       "authors": ["G. Irving", "P. Christiano", "D. Amodei"],
    447       "year": 2018,
    448       "arxiv_id": "1805.00899",
    449       "relevance": "Foundational scalable oversight proposal; PVG is a single-turn variant of debate."
    450     },
    451     {
    452       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    453       "authors": ["J. Wei", "X. Wang", "D. Schuurmans"],
    454       "year": 2022,
    455       "relevance": "CoT prompting that this paper aims to make more legible through adversarial training."
    456     },
    457     {
    458       "title": "Open problems and fundamental limitations of reinforcement learning from human feedback",
    459       "authors": ["S. Casper", "X. Davies", "C. Shi"],
    460       "year": 2023,
    461       "relevance": "Systematic analysis of RLHF limitations motivating complementary approaches like PVG."
    462     },
    463     {
    464       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    465       "authors": ["E. Hubinger", "C. Denison", "J. Mu"],
    466       "year": 2024,
    467       "arxiv_id": "2401.05566",
    468       "relevance": "Demonstrates persistent deceptive behavior in LLMs, motivating legibility-based oversight."
    469     },
    470     {
    471       "title": "Sycophancy to Subterfuge: Investigating Reward-Tampering in Large Language Models",
    472       "authors": ["C. Denison", "M. MacDiarmid", "F. Barez"],
    473       "year": 2024,
    474       "arxiv_id": "2406.10162",
    475       "relevance": "Studies reward hacking and deceptive alignment in LLMs, directly relevant to safety."
    476     },
    477     {
    478       "title": "Scaling laws for reward model overoptimization",
    479       "authors": ["L. Gao", "J. Schulman", "J. Hilton"],
    480       "year": 2023,
    481       "relevance": "Characterizes reward hacking/goodharting which the PVG approach aims to mitigate."
    482     },
    483     {
    484       "title": "LLM Critics Help Catch LLM Bugs",
    485       "authors": ["N. McAleese", "Rai", "J. F. C. Uribe"],
    486       "year": 2024,
    487       "relevance": "Complementary scalable oversight method using LLM critics for bug detection."
    488     },
    489     {
    490       "title": "Debating with More Persuasive LLMs Leads to More Truthful Answers",
    491       "authors": ["A. Khan", "J. Hughes", "D. Valentine"],
    492       "year": 2024,
    493       "arxiv_id": "2402.06782",
    494       "relevance": "Shows debate with persuasive LLMs improves truthfulness — related scalable oversight method."
    495     }
    496   ]
    497 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs