scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26241B)
      1 {
      2   "paper": {
      3     "title": "ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on Class-level Code Generation",
      4     "authors": [
      5       "Xueying Du",
      6       "Mingwei Liu",
      7       "Kaixin Wang",
      8       "Hanlin Wang",
      9       "Junwei Liu",
     10       "Yixuan Chen",
     11       "Jiayi Feng",
     12       "Chaofeng Sha",
     13       "Xin Peng",
     14       "Yiling Lou"
     15     ],
     16     "year": 2023,
     17     "venue": "arXiv",
     18     "arxiv_id": "2308.01861"
     19   },
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The benchmark is publicly available at https://github.com/FudanSELab/ClassEval, referenced in the abstract and Section 1."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The benchmark data (100 class-level coding tasks with test suites and canonical solutions) is released via the GitHub repository. The paper states 'Our benchmark is available at https://github.com/FudanSELab/ClassEval'."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Section 4.5 mentions 'eight A800-80G GPUs' and that open-source LLMs were run 'based on the documentation', but no requirements.txt, Dockerfile, or detailed environment specification with library versions is provided in the paper."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper describes the experimental setup in Sections 4.3-4.5 but does not provide step-by-step reproduction instructions, README with commands, or scripts to replicate the main experiments."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Table 7 and Figure 5 report only point estimates (e.g., '37.6% Pass@1') without confidence intervals or error bars."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper makes multiple comparative claims (e.g., 'GPT-4 and GPT-3.5 still exhibit dominate superior', 'WizardCoder outperforms Instruct-CodeGen by 27.5%') but uses no statistical significance tests to support any of these comparisons."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "The paper reports percentage point differences with baseline context throughout. For example, Section 5.1 states 'GPT-4 and GPT-3.5 achieve 85.4%/68.9% correctness on method-level tasks in HumanEval, but only 37.0%/27.0% correctness on class-level tasks in ClassEval', providing both absolute values and magnitudes of difference."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The benchmark has 100 tasks and n=5 samples per task, but no justification is given for why these sizes are sufficient for the comparative claims made. No power analysis is discussed."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper uses the unbiased Pass@k estimator (Eq. 1, following Chen et al.), but does not report standard deviations or variance across runs. Only point estimates are provided."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The paper compares 11 LLMs against each other and also compares ClassEval performance against HumanEval performance for each model (Figure 5, Table 7). Multiple models serve as mutual baselines."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The 11 studied LLMs include models released in 2022-2023 (e.g., GPT-4, WizardCoder, StarCoder), which were state-of-the-art at the time of the study (July 2023). The paper explicitly states they 'focus on recent models released since 2022' (Section 4.1)."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "While not a traditional ablation, the comparison of three generation strategies (holistic, incremental, compositional) in Section 5.2/RQ2 effectively ablates the impact of generation approach on each model, isolating the contribution of different strategies."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The paper uses Pass@1, Pass@3, Pass@5 at both class-level and method-level granularity (Table 7, Figure 5), plus DEP(F) and DEP(M) metrics for dependency generation (Section 4.4, Figure 7)."
     90       },
     91       "human_evaluation": {
     92         "applies": false,
     93         "answer": false,
     94         "justification": "This is a benchmark evaluation paper where correctness is determined by automated test suites (unit tests with pass/fail outcomes). Human evaluation of generated code quality beyond test-passing is not relevant to the core claims about Pass@k correctness."
     95       },
     96       "held_out_test_set": {
     97         "applies": false,
     98         "answer": false,
     99         "justification": "This is a benchmark evaluation, not a training study. There is no tuning on the benchmark data; models are evaluated directly. The concept of held-out vs. dev sets does not apply in this context."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper provides breakdowns by generation strategy (Figure 6), by dependency type (Figure 7, Table 5), by error type (Figure 9), and by number of method dependencies (Figure 8). It also compares class-level vs. method-level Pass@k separately."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 5.4 (RQ4: Bad Case Analysis) explicitly analyzes error types (AttributeError, TypeError, KeyError, etc.) in Figure 9, and provides a concrete code example of a KeyError caused by field dependency misinterpretation (Figure 10)."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper reports that holistic generation is worse for most models (Section 5.2), that incremental strategy can negatively affect some models like Instruct-StarCoder and WizardCoder (Section 5.2), and that all models show 'much worse performance on class-level code generation' (Finding 1)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract's four main claims are all supported: (1) worse class-level vs. method-level performance (Figure 5), (2) GPT-4/GPT-3.5 dominance (Table 7, Figure 5), (3) holistic strategy best only for GPT models (Figure 6), (4) limited dependent code generation (Figure 7). Each is backed by experimental results."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper makes causal-like claims about why certain strategies work better, e.g., 'most models exhibit rather limited capability of utilizing long input contexts' (Section 5.2), citing Liu et al. [47] as support but without directly testing this explanation. The claim that performance drops are 'attributed to the complexity of generating code that depends on other context' (Section 5.1) is offered as an explanation without controlled experimentation isolating that factor."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper titles itself 'Evaluating LLMs on Class-level Code Generation' broadly, but results are limited to Python only and 100 tasks. While Section 6 acknowledges 'limited size and programming languages', the title, abstract, and findings (e.g., Finding 1: 'all existing LLMs show much worse performance on class-level code generation') generalize beyond the tested setting without consistent qualification."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Section 5.2 discusses alternative explanations for why holistic generation works better for GPT models (better long-context understanding vs. others' limited capability), and considers both the 'helpful hints' and 'misleading information' effects of incremental generation. Section 6 also acknowledges that prompt design choices might impact findings."
    137       }
    138     },
    139     "setup_transparency": {
    140       "model_versions_specified": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "Section 4.5 states 'we use the OpenAI API interface, specifically the \"gpt-4\" and \"gpt-3.5-turbo\" model interface, in July 2023' but does not provide exact snapshot versions (e.g., 'gpt-4-0613'). For open-source models, Table 6 lists model names and sizes but not exact checkpoint versions or commit hashes."
    144       },
    145       "prompts_provided": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Section 4.3 provides the full prompt templates for all three generation strategies for both IF and non-IF models, including system prompt, Instruction-H, Instruction-I, and Instruction-C with their variable placeholders. Since the class skeleton format is fully defined (Table 2, Figure 2), the prompts are reconstructable."
    149       },
    150       "hyperparameters_reported": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 4.5 reports: temperature of 0.2 for nucleus sampling, default top_p, do_sample=false for greedy decoding (temperature 0), maximum window length of 2,048 tokens, n=5 samples for nucleus sampling."
    154       },
    155       "scaffolding_described": {
    156         "applies": false,
    157         "answer": false,
    158         "justification": "No agentic scaffolding is used. The models are prompted directly without tool use, feedback loops, or multi-step orchestration beyond the generation strategies themselves."
    159       },
    160       "data_preprocessing_documented": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Section 3.2 documents the full benchmark construction pipeline: task selection (three sources with exclusion criteria in Section 3.2.1), class skeleton construction with design principles (Section 3.2.2), test construction methodology (Section 3.2.3), and canonical solution construction (Section 3.2.4). Each step describes the process and quality controls."
    164       }
    165     },
    166     "limitations_and_scope": {
    167       "limitations_section_present": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 6 'Threats to Validity' provides a dedicated discussion of threats in both benchmark construction and the empirical study."
    171       },
    172       "threats_to_validity_specific": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 6 discusses specific threats: (1) potential data leakage mitigated by manual construction, (2) limited benchmark size and Python-only scope affecting generalizability, (3) prompt sensitivity addressed by a pilot study on prompt candidates, (4) randomness addressed by greedy decoding results."
    176       },
    177       "scope_boundaries_stated": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "While Section 6 mentions 'limited size and programming languages' and plans to extend, the paper does not explicitly state what the results do NOT show (e.g., does not claim results apply only to Python, does not bound conclusions to the specific 100 tasks). The findings are stated broadly (e.g., 'all existing LLMs show much worse performance on class-level code generation') without scope qualifiers."
    181       }
    182     },
    183     "data_integrity": {
    184       "raw_data_available": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "The benchmark data (class skeletons, test suites, canonical solutions) is publicly available at the GitHub repository. Generated code samples and raw evaluation logs are not mentioned as being released, but the benchmark itself enables independent verification."
    188       },
    189       "data_collection_described": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "Section 3.2 provides detailed description of the data collection/construction process: task selection from three sources (existing benchmarks, PyPI, brainstorming), explicit exclusion criteria, and the four-step construction pipeline."
    193       },
    194       "recruitment_methods_described": {
    195         "applies": false,
    196         "answer": false,
    197         "justification": "No human participants are recruited for a study. The benchmark constructors are the paper's authors. This is a benchmark construction + automated evaluation paper."
    198       },
    199       "data_pipeline_documented": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Figure 3 illustrates the full construction pipeline. Each step is documented: task selection (100 tasks from 3 sources with exclusion criteria), class skeleton construction (5 participants, double-checking), test construction (method-level + class-level tests, coverage metrics), and canonical solution construction (4 separate participants, double-checking)."
    203       }
    204     },
    205     "conflicts_of_interest": {
    206       "funding_disclosed": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "No funding information or acknowledgments section is visible in the paper text. There is no mention of grants, sponsors, or funding sources."
    210       },
    211       "affiliations_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "All authors are listed with their affiliation at Fudan University, Shanghai, China, along with their email addresses. Since they are evaluating third-party models (GPT-4, WizardCoder, etc.) and not their own products, there is no affiliation conflict."
    215       },
    216       "funder_independent_of_outcome": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No funding source is disclosed, so independence of the funder cannot be assessed. The absence of disclosure is not the same as absence of conflict."
    220       },
    221       "financial_interests_declared": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No competing interests or financial interests statement is present in the paper."
    225       }
    226     },
    227     "contamination": {
    228       "training_cutoff_stated": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "While Table 6 lists model release dates, the actual training data cutoff dates for the models are not stated. The paper does not specify when training data collection ended for any model."
    232       },
    233       "train_test_overlap_discussed": {
    234         "applies": true,
    235         "answer": true,
    236         "justification": "Section 3.2 explicitly addresses this: 'To avoid the coding tasks being seen by LLMs during their training, our benchmark is constructed completely manually, so as to mitigate potential data leakages from existing code sources.' This is the central contamination mitigation strategy."
    237       },
    238       "benchmark_contamination_addressed": {
    239         "applies": true,
    240         "answer": true,
    241         "justification": "The paper's entire benchmark design philosophy addresses contamination. Section 3.2 states the benchmark was 'constructed completely manually' with '500 person-hours' specifically to avoid overlap with training data. Section 6 also lists data leakage as a threat and the manual construction as mitigation."
    242       }
    243     },
    244     "human_studies": {
    245       "pre_registered": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants study. The paper constructs a benchmark and evaluates LLMs automatically."
    249       },
    250       "irb_or_ethics_approval": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants study. The benchmark constructors are the authors themselves."
    254       },
    255       "demographics_reported": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants study. The paper notes benchmark constructors have '2-8 years of Python development experience' but this characterizes workers, not study participants."
    259       },
    260       "inclusion_exclusion_criteria": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants study."
    264       },
    265       "randomization_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants study."
    269       },
    270       "blinding_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants study."
    274       },
    275       "attrition_reported": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants study."
    279       }
    280     },
    281     "cost_and_practicality": {
    282       "inference_cost_reported": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "The paper uses OpenAI APIs (GPT-4, GPT-3.5) and runs 11 models with nucleus sampling (5 samples per task) and greedy sampling across 100 tasks and 3 strategies, but reports no API costs, tokens consumed, or wall-clock time."
    286       },
    287       "compute_budget_stated": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "Section 4.5 mentions 'eight A800-80G GPUs' as infrastructure but does not report total GPU hours, total API spend, or training/inference time."
    291       }
    292     }
    293   },
    294   "claims": [
    295     {
    296       "claim": "All existing LLMs show much worse performance on class-level code generation compared to standalone method-level code generation benchmarks like HumanEval.",
    297       "evidence": "Figure 5 shows GPT-4 drops from 85.4% (HumanEval) to 37.0% (ClassEval class-level), GPT-3.5 from 68.9% to 27.0%, WizardCoder from 59.8% to 11.0%, with consistent drops across all 11 models. Method-level Pass@1 on ClassEval is also lower than on HumanEval (Section 5.1).",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "GPT-4 and GPT-3.5 dominate other LLMs on class-level code generation, with Instruct-StarCoder, Instruct-CodeGen, and WizardCoder forming a similar second tier.",
    302       "evidence": "Table 7 and Figure 5: GPT-4 achieves 37.6% class-level Pass@1 (nucleus), GPT-3.5 at 29.6%, while third-ranked WizardCoder is at 12.2%. Second-tier models (WizardCoder, Instruct-StarCoder, Instruct-CodeGen) range from 10.0%-11.1% in greedy Pass@1 (Section 5.1).",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "Holistic generation is the best strategy only for GPT-4 and GPT-3.5, while method-by-method generation is better for other models.",
    307       "evidence": "Figure 6 shows holistic generation achieves 6%-9% higher class-level Pass@5 for GPT-4 and 4%-14% for GPT-3.5 compared to other strategies. For other models like CodeGeeX and SantaCoder, incremental strategy produces 9% and 7% more correct classes than holistic (Section 5.2).",
    308       "supported": "strong"
    309     },
    310     {
    311       "claim": "The disparity in generation strategy effectiveness stems from models' limited capability of understanding long instructions and utilizing middle information.",
    312       "evidence": "Section 5.2 cites Liu et al. [47] showing LLMs become less effective with increasing input length and tend to use beginning/end information better. This is offered as a potential explanation, not directly tested.",
    313       "supported": "weak"
    314     },
    315     {
    316       "claim": "It is easier for all models to generate field-accessing code than method-invoking code.",
    317       "evidence": "Figure 7 shows DEP(F) consistently higher than DEP(M) across all 11 models. GPT-4 leads with at least 12.6%/6.3% improvement over other models in DEP(F)/DEP(M) (Section 5.3).",
    318       "supported": "strong"
    319     },
    320     {
    321       "claim": "Method-level coding ability cannot equivalently reflect class-level coding ability among LLMs.",
    322       "evidence": "Section 5.1: WizardCoder and Instruct-StarCoder have much higher HumanEval scores (59.8% and 34.1%) than SantaCoder (14.6%), but all three perform similarly on ClassEval (around 10-11% Pass@1). Rankings do not transfer.",
    323       "supported": "strong"
    324     }
    325   ],
    326   "methodology_tags": [
    327     "benchmark-eval"
    328   ],
    329   "key_findings": "ClassEval is a manually-crafted benchmark of 100 class-level Python code generation tasks requiring ~500 person-hours to construct. All 11 evaluated LLMs perform substantially worse on class-level code generation than on method-level benchmarks like HumanEval, with GPT-4 dropping from 85.4% to 37.0% Pass@1. The optimal generation strategy varies by model: holistic generation works best for GPT-4/GPT-3.5, while method-by-method generation is better for weaker models. Models consistently find it easier to generate field-dependent code than method-dependent code.",
    330   "red_flags": [
    331     {
    332       "flag": "No statistical significance tests",
    333       "detail": "The paper makes numerous comparative claims about model performance differences (e.g., 'GPT-4 and GPT-3.5 still exhibit dominate superior') without any statistical significance testing. With only 100 tasks and n=5 samples, the differences between closely-ranked models (e.g., WizardCoder at 11.0% vs. Instruct-StarCoder at 10.0%) may not be statistically significant."
    334     },
    335     {
    336       "flag": "No variance or uncertainty quantification",
    337       "detail": "Despite using nucleus sampling with temperature=0.2, no standard deviations, confidence intervals, or error bars are reported. The Pass@k estimator is unbiased but its variance is not quantified, making it impossible to assess the reliability of the reported numbers."
    338     },
    339     {
    340       "flag": "Causal explanations without causal evidence",
    341       "detail": "The paper attributes performance differences to 'limited capability of understanding long instructions and utilizing the middle information' (Section 5.2) and 'complexity of generating code that depends on other context' (Section 5.1) but these are post-hoc explanations citing related work, not experimentally validated causal claims."
    342     }
    343   ],
    344   "cited_papers": [
    345     {
    346       "title": "Evaluating large language models trained on code",
    347       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    348       "year": 2021,
    349       "arxiv_id": "2107.03374",
    350       "relevance": "Introduces HumanEval and the Pass@k metric, which is the primary baseline benchmark and evaluation metric used in this study."
    351     },
    352     {
    353       "title": "Program synthesis with large language models",
    354       "authors": ["J. Austin", "A. Odena", "M. I. Nye"],
    355       "year": 2021,
    356       "arxiv_id": "2108.07732",
    357       "relevance": "Introduces the MBPP benchmark for code generation evaluation, one of the key baselines for comparing benchmark complexity."
    358     },
    359     {
    360       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    361       "authors": ["J. Liu", "C. S. Xia", "Y. Wang", "L. Zhang"],
    362       "year": 2023,
    363       "arxiv_id": "2305.01210",
    364       "relevance": "Introduces HumanEval+ with enhanced test suites, relevant to benchmark quality and rigor in code generation evaluation."
    365     },
    366     {
    367       "title": "WizardCoder: Empowering code large language models with evol-instruct",
    368       "authors": ["Z. Luo", "C. Xu", "P. Zhao"],
    369       "year": 2023,
    370       "arxiv_id": "2306.08568",
    371       "relevance": "One of the evaluated code LLMs; represents instruction-tuned code generation models."
    372     },
    373     {
    374       "title": "StarCoder: May the source be with you!",
    375       "authors": ["R. Li", "L. B. Allal", "Y. Zi"],
    376       "year": 2023,
    377       "arxiv_id": "2305.06161",
    378       "relevance": "Foundation model for several evaluated LLMs (WizardCoder, Instruct-StarCoder); large-scale open code model."
    379     },
    380     {
    381       "title": "CoderEval: A benchmark of pragmatic code generation with generative pre-trained models",
    382       "authors": ["H. Yu", "B. Shen", "D. Ran"],
    383       "year": 2023,
    384       "arxiv_id": "2302.00288",
    385       "relevance": "Related benchmark for non-standalone code generation that motivates ClassEval's focus on method dependencies."
    386     },
    387     {
    388       "title": "Lost in the middle: How language models use long contexts",
    389       "authors": ["N. F. Liu", "K. Lin", "J. Hewitt"],
    390       "year": 2023,
    391       "arxiv_id": "2307.03172",
    392       "relevance": "Cited to explain why holistic generation strategy fails for most models -- relevant to understanding LLM context utilization limitations."
    393     },
    394     {
    395       "title": "GPT-4 technical report",
    396       "authors": ["OpenAI"],
    397       "year": 2023,
    398       "arxiv_id": "2303.08774",
    399       "relevance": "The top-performing model in the ClassEval evaluation; foundational reference for frontier LLM capabilities."
    400     },
    401     {
    402       "title": "A survey on evaluation of large language models",
    403       "authors": ["Y. Chang", "X. Wang", "J. Wang"],
    404       "year": 2023,
    405       "arxiv_id": "2307.03109",
    406       "relevance": "Comprehensive survey of LLM evaluation approaches, contextualizing ClassEval within the broader evaluation landscape."
    407     },
    408     {
    409       "title": "Competition-level code generation with AlphaCode",
    410       "authors": ["Y. Li", "D. H. Choi", "J. Chung"],
    411       "year": 2022,
    412       "arxiv_id": "2203.07814",
    413       "relevance": "Competitive programming code generation approach; represents a different type of challenging code generation benchmark."
    414     },
    415     {
    416       "title": "Large language models meet NL2Code: A survey",
    417       "authors": ["D. Zan", "B. Chen", "F. Zhang"],
    418       "year": 2023,
    419       "relevance": "Survey of code generation with LLMs, providing broader context for the ClassEval evaluation."
    420     },
    421     {
    422       "title": "Multi-lingual evaluation of code generation models",
    423       "authors": ["B. Athiwaratkun", "S. K. Gouda", "Z. Wang"],
    424       "year": 2023,
    425       "relevance": "Multi-lingual code generation benchmark (MBXP, Multi-HumanEval); extends HumanEval-style evaluation to multiple languages."
    426     }
    427   ]
    428 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs