scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26265B)
      1 {
      2   "paper": {
      3     "title": "CodeReviewQA: The Code Review Comprehension Assessment for Large Language Models",
      4     "authors": [
      5       "Hong Yi Lin",
      6       "Chunhua Liu",
      7       "Haoyu Gao",
      8       "Patanamon Thongtanunam",
      9       "Christoph Treude"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv",
     13     "arxiv_id": "2503.16167"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The dataset is released on Hugging Face (https://huggingface.co/datasets/Tomo-Melb/CodeReviewQA) as stated in the abstract footnote. However, no source code repository URL is provided for the evaluation pipeline itself. The Hugging Face link is a dataset release. Treating this as YES because the benchmark data artifact is released."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The benchmark dataset of 900 curated examples is publicly available on Hugging Face at https://huggingface.co/datasets/Tomo-Melb/CodeReviewQA, as stated in the footnote on page 1."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "Appendix E mentions hardware (64 cores Intel Xeon Platinum 8462Y+, 928GB RAM, 4x NVIDIA H100-80GB SXM5) and that vLLM was used with Hugging Face models, but no requirements.txt, Dockerfile, or specific library versions are provided."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper describes the experimental setup and prompts used, but does not provide step-by-step reproduction instructions, scripts, or a README with commands to replicate the experiments."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "Results are reported as point estimates (e.g., '50.3% exact match rate', '78.8% invariant accuracy') throughout Tables 2-6 and the appendix tables. No confidence intervals or error bars are provided."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper makes comparative claims like 'performs vastly better (> 10% increase)' and 'only achieves a 3.7% increase' without statistical significance tests. Model comparisons are based solely on comparing numbers."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "The paper consistently reports performance differences with baseline context, e.g., 'Llama-3.1-70B-Instruct only achieves a 3.7% increase over Qwen2.5-Coder-14B-Instruct, despite wielding a 5-fold increase in parameter count' (Section 6). Absolute percentages with comparisons are provided throughout."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The benchmark consists of 900 examples (100 per language) but no justification is given for why this number was chosen, no power analysis is provided, and no discussion of whether 900 is sufficient for the comparative claims made."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "Results are single-run numbers using greedy decoding (temperature=0). No variance, standard deviation, or spread measures across runs are reported. The invariance testing (N! permutations) is about answer order robustness, not experimental variance."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The paper compares 72 LLMs across five scale classes and also compares its MCQA evaluation approach against traditional ACR text-matching evaluation. Existing benchmarks (CodeReviewer, CodeReview-New) are compared in Table 4."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The 72 evaluated models include recent state-of-the-art models as of March 2025, including DeepSeek-R1 (Jan 2025), Falcon3 (Dec 2024), and QwQ-32B. The comparison benchmarks include CodeReview-New (2024)."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper includes ablation-like analysis: (1) comparing easy vs. hard distractor difficulty levels for CL and SI tasks, (2) analyzing advanced prompting strategies (zero-shot, one-shot, two-shot, chain-of-thought) in Table 6/Appendix G, and (3) decomposing ACR failures into constituent probe failures in Table 3."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The paper uses multiple metrics: exact match rate for ACR, invariant accuracy for MCQA probes, Proportion of Plurality Agreement (PPA) for symbol binding, perplexity scores and 5-gram accuracy for contamination evaluation."
     85       },
     86       "human_evaluation": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "Human annotation was used to curate the benchmark dataset (manual verification with Cohen's Kappa = 0.89), but no human evaluation of the LLMs' outputs or code review comprehension was conducted. Evaluation of model performance is entirely automated."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "The 900 curated examples serve as the test set. For few-shot experiments (Appendix G), examples were 'held out from the benchmark' for use as demonstrations, indicating separation."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Results are broken down per scale class (5 classes), per task (CTR, CL easy/hard, SI easy/hard), and per model. Full results for all 72 models across all tasks are provided in Appendix F (Tables 7-13). The benchmark also covers 9 programming languages."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Section 7 ('Insights from Probing ACR Failures') is entirely dedicated to analyzing failure cases. Table 3 quantifies failure percentages across probes for non-exact match cases, and the paper discusses specific model weaknesses."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Several negative results are reported: (1) advanced prompting (chain-of-thought, few-shot) does not outperform zero-shot for most tasks (Table 6), (2) CTR capability plateaus with scale, (3) models that achieve exact match often cannot answer MCQA probes correctly (49%-99.6% of successful cases, suggesting memorization rather than comprehension)."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract claims that CodeReviewQA 'enables fine-grained assessment of model capabilities' (supported by Section 7's probe analysis), 'mitigates data contamination risks' (supported by Section 8's perplexity/n-gram analysis), and evaluates '72 recently released LLMs on 900 manually curated examples across nine programming languages' (all verified in the paper)."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper makes causal-adjacent claims like 'the inability to improve on CTR presents a fundamental limit to downstream ACR performance' (Section 6) and that exact match without probe success 'alludes to prior exposure and rote memorisation' (Section 7). These causal interpretations are not formally justified; the probe analysis is correlational, not controlled manipulation."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The Limitations section (Section 10) explicitly bounds findings: modest dataset size, line-level rather than token-level localization, reliance on specific surrogate LLM for distractors, same prompt for all models, and MCQA only mitigates surface-level not semantic contamination. Claims are largely bounded to the tested models and tasks."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The paper discusses alternative explanations: exact match success without probe success may indicate memorization (Section 7); lower perplexity on older benchmarks may indicate training inclusion (Section 8); the 'Interaction among capabilities' limitation acknowledges that the authors did not investigate causal relationships between the probes."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Table 7 in Appendix F lists all 72 models with exact Hugging Face model identifiers (e.g., 'meta-llama/Llama-3.1-70B-Instruct', 'Qwen/Qwen2.5-Coder-14B-Instruct'), parameter counts, organizations, and release dates. These are specific open-source model versions."
    139       },
    140       "prompts_provided": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Full prompt templates for all tasks (ACR, CTR, CL, SI) are provided in Figure 3 (zero-shot) and Figures 4-6 (few-shot). The templates include placeholders with clear definitions of what fills them (e.g., {code_snippet}, {code_review}, {option_a})."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Appendix E states 'we set the temperature to zero for greedy decoding' for all tasks. Section 5.1 specifies 'max output length of one' for MCQA. Section 3.4 mentions the surrogate LLM used 'temperature of 3.5' for distractor generation."
    149       },
    150       "scaffolding_described": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "No agentic scaffolding is used. The evaluation involves direct prompting of LLMs with single-turn MCQA and generation tasks."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 4 documents the data pipeline in detail: source dataset (9,367 examples from Guo et al., 2024), heuristic filtering (Appendix C), manual verification by two annotators with Cohen's Kappa = 0.89, stratified sampling per language to 100 each, with 3,761 examples discarded (13% retention rate). The pipeline from raw data to final benchmark is well documented."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section 10 is a dedicated 'Limitations' section with substantive discussion spanning five specific limitation areas: dataset size, distractor construction, interaction among capabilities, diversity of prompts, and semantic-level data contamination."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The limitations are specific to this study: 'our benchmark has a relatively modest size due to the difficulty of scaling rigorous manual verification', 'Codestral-22B-v0.1 is considered competitive at the time of this writing, benchmark saturation remains inevitable', 'we did not investigate the causal relationships between the capabilities tested in the probes', 'we used the same prompt and hyperparameters for each task'."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "The paper explicitly states boundaries: only open-source models up to 72B parameters, MCQA only mitigates surface-level not semantic contamination, probes are designed independently rather than causally linked, same prompt for all models rather than optimized per model, line-level rather than token-level localization."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The benchmark dataset is publicly available on Hugging Face (https://huggingface.co/datasets/Tomo-Melb/CodeReviewQA), allowing independent verification of the 900 curated examples."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 4 describes the data source (Guo et al., 2024 dataset of code reviews after January 1, 2022, from 259 most-starred GitHub repositories), filtering to 9 languages, and the full curation pipeline. Appendices A-C provide additional detail on noise types and heuristic filters."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": false,
    191         "answer": false,
    192         "justification": "No human participants were involved in the study beyond the two annotator-authors who curated the dataset. The study evaluates LLMs on a benchmark, not human subjects."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The pipeline is documented with counts: 9,367 initial examples, 3,761 discarded by heuristic filtering and manual verification, 900 final examples (100 per language). The 13% retention rate is explicitly stated. Cohen's Kappa (mean 0.89, std 0.11) for inter-annotator agreement across 3k independently annotated examples and 46 conflict resolution rounds is reported."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "No funding source or acknowledgments section is present in the paper. There is no mention of grants, corporate sponsors, or funding agencies."
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Author affiliations are clearly listed: University of Melbourne and Singapore Management University. These are academic institutions with no apparent conflict of interest with the evaluated models."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding is disclosed, so it is impossible to assess funder independence. The absence of funding disclosure means this cannot be verified."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No competing interests or financial interests statement is present in the paper."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "While the paper extensively discusses data contamination (Section 8) and notes code reviews were from after January 1, 2022, the specific training data cutoff dates for the 72 evaluated models are not stated. Model release dates are listed in Table 7 but training cutoffs are not."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "Section 8 is entirely dedicated to evaluating data contamination. The paper measures perplexity and 5-gram accuracy to assess whether models have memorized benchmark examples, comparing original ACR form with MCQA probe form. The paper explicitly discusses the contamination risk from public GitHub code reviews."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": true,
    235         "answer": true,
    236         "justification": "Data contamination is a central concern of the paper. Section 8 compares contamination metrics across benchmarks (Table 4), demonstrating that MCQA reformulation yields higher perplexity and lower n-gram accuracy than the ACR format on the same examples, mitigating contamination effects. The paper also identifies that models achieving exact match without probe success likely memorized examples (Section 7)."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants were involved. The study evaluates LLMs on a benchmark."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants were involved. The study evaluates LLMs on publicly available code review data."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants were involved in the study."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants were involved in the study."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants were involved in the study."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants were involved in the study."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants were involved in the study."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "The paper evaluates 72 models with N! permutation runs per question per model (6 or 24 runs per question, i.e. 3! or 4!, depending on the number of answer options), which is computationally substantial, but no inference cost, latency, or wall-clock time is reported."
    281       },
    282       "compute_budget_stated": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "Appendix E specifies the hardware (4x NVIDIA H100-80GB SXM5) but does not report total GPU hours, wall-clock time, or any quantification of the computational budget required to run all 72 models."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "CodeReviewQA enables fine-grained assessment of model capabilities disentangled from generative ACR results.",
    292       "evidence": "Section 7 and Table 3 show that MCQA probe failures can expose specific weaknesses (e.g., CL failure accounts for 74-99.4% of ACR failures in smaller models) that are not visible from ACR exact match rates alone. Models with similar ACR scores (e.g., Qwen2.5-Coder-14B, gemma-2-27b-it, Mistral-Small within 1%) show substantially different probe performance.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "MCQA reformulation mitigates data contamination effects compared to the original ACR format.",
    297       "evidence": "Table 4 shows MCQA format yields higher perplexity (6.0 vs 4.5 for Llama-3.1-70B) and lower 5-gram accuracy (25.1% vs 40.3%) compared to the same examples in ACR format, suggesting reduced memorization.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "Llama-3.1-70B-Instruct achieved the highest exact match rate of 50.3% on ACR.",
    302       "evidence": "Table 2 directly reports this result.",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "Models can achieve ACR exact match without comprehending the code review, suggesting memorization.",
    307       "evidence": "Section 7 reports that models could not accurately answer all probes for 49.0%-99.6% of their successful exact-match cases, with a strict inverse relationship with model size.",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "Change localisation is the most difficult comprehension task, with proficiency depending heavily on model scale.",
    312       "evidence": "Section 6 CL Results and Table 2 show most <=3B models achieve 0-3% invariant accuracy on CL, while <=72B models achieve >70% on CL easy (CLE) and >60% on CL hard (CLH). Table 10 in Appendix F provides full results.",
    313       "supported": "strong"
    314     },
    315     {
    316       "claim": "Advanced prompting (few-shot, chain-of-thought) does not outperform zero-shot for most MCQA probes.",
    317       "evidence": "Table 6 in Appendix G shows chain-of-thought consistently degrades performance, and few-shot only outperforms zero-shot on CTR and CL easy. Zero-shot remains best for SI tasks.",
    318       "supported": "strong"
    319     }
    320   ],
    321   "methodology_tags": [
    322     "benchmark-eval"
    323   ],
    324   "key_findings": "CodeReviewQA decomposes automated code refinement evaluation into three MCQA probes (change type recognition, change localisation, solution identification), revealing that models with similar ACR exact-match scores have substantially different comprehension capabilities. Change localisation is the most difficult task, with most small models (<=3B) achieving near-zero accuracy. The MCQA format reduces data contamination signals compared to the original generative format, and analysis shows 49-99.6% of models' successful exact-match cases correspond to failures on at least one comprehension probe, suggesting memorization over genuine understanding.",
    325   "red_flags": [
    326     {
    327       "flag": "No statistical significance testing",
    328       "detail": "All model comparisons across 72 LLMs on 4 tasks are based on point estimates without any statistical significance tests, confidence intervals, or error bars. Claims like '>10% increase' and 'substantially different' are made without formal testing."
    329     },
    330     {
    331       "flag": "Single-run evaluation with deterministic decoding",
    332       "detail": "All experiments use temperature=0 (greedy decoding), producing a single deterministic output. While the invariance testing permutes answer order, there is no assessment of result stability across different configurations or seeds."
    333     },
    334     {
    335       "flag": "No inference cost disclosure for large-scale evaluation",
    336       "detail": "Evaluating 72 models with N! permutations per question (900 questions x multiple probes) on 4x H100 GPUs represents substantial compute, but the total cost, GPU hours, and wall-clock time are never reported."
    337     }
    338   ],
    339   "cited_papers": [
    340     {
    341       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    342       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R. Narasimhan"],
    343       "year": 2024,
    344       "relevance": "Major benchmark for evaluating LLMs on real-world software engineering tasks; directly relevant to code generation and agentic programming evaluation."
    345     },
    346     {
    347       "title": "Evaluating Large Language Models Trained on Code",
    348       "authors": ["Mark Chen", "Jerry Tworek"],
    349       "year": 2021,
    350       "arxiv_id": "2107.03374",
    351       "relevance": "Introduces HumanEval benchmark for code generation, foundational to LLM code evaluation methodology."
    352     },
    353     {
    354       "title": "Exploring the Potential of ChatGPT in Automated Code Refinement: An Empirical Study",
    355       "authors": ["Qi Guo", "Junming Cao", "Xiaofei Xie", "Shangqing Liu", "Xiaohong Li", "Bihuan Chen", "Xin Peng"],
    356       "year": 2024,
    357       "relevance": "Source dataset for CodeReviewQA; evaluates ChatGPT on automated code refinement, directly relevant to LLM code review capabilities."
    358     },
    359     {
    360       "title": "Breaking the Silence: The Threats of Using LLMs in Software Engineering",
    361       "authors": ["June Sallou", "Thomas Durieux", "Annibale Panichella"],
    362       "year": 2024,
    363       "relevance": "Discusses data contamination threats in LLM-based software engineering evaluation, a core concern of this paper."
    364     },
    365     {
    366       "title": "Benchmarking Benchmark Leakage in Large Language Models",
    367       "authors": ["Ruijie Xu", "Zengzhi Wang", "Run-Ze Fan", "Pengfei Liu"],
    368       "year": 2024,
    369       "arxiv_id": "2404.18824",
    370       "relevance": "Provides the n-gram accuracy contamination detection methodology used in this paper's evaluation."
    371     },
    372     {
    373       "title": "Fine-Tuning and Prompt Engineering for Large Language Models-Based Code Review Automation",
    374       "authors": ["Chanathip Pornprasit", "Chakkrit Tantithamthavorn"],
    375       "year": 2024,
    376       "relevance": "Evaluates LLM fine-tuning and prompting for code review automation, directly relevant to AI-assisted code review."
    377     },
    378     {
    379       "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
    380       "authors": ["Terry Yue Zhuo"],
    381       "year": 2025,
    382       "relevance": "Recent code generation benchmark evaluating LLMs, relevant to the broader landscape of LLM code evaluation."
    383     },
    384     {
    385       "title": "Automating Code Review Activities by Large-Scale Pre-Training",
    386       "authors": ["Zhiyu Li", "Shuai Lu", "Daya Guo"],
    387       "year": 2022,
    388       "relevance": "Introduces CodeReviewer benchmark used as a comparison baseline for data contamination evaluation in this paper."
    389     },
    390     {
    391       "title": "Code Review Automation: Strengths and Weaknesses of the State of the Art",
    392       "authors": ["Rosalia Tufano", "Ozren Dabic", "Antonio Mastropaolo", "Matteo Ciniselli", "Gabriele Bavota"],
    393       "year": 2024,
    394       "relevance": "Identifies critical quality issues in code review benchmarks, motivating the manual curation approach of CodeReviewQA."
    395     },
    396     {
    397       "title": "CLEAN-EVAL: Clean Evaluation on Contaminated Large Language Models",
    398       "authors": ["Wenhong Zhu"],
    399       "year": 2024,
    400       "relevance": "Addresses data contamination in LLM evaluation through task reformulation, directly related to CodeReviewQA's MCQA approach."
    401     },
    402     {
    403       "title": "Program Synthesis with Large Language Models",
    404       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell I. Nye"],
    405       "year": 2021,
    406       "arxiv_id": "2108.07732",
    407       "relevance": "Introduces MBPP benchmark for code generation, used as a model selection criterion in this work."
    408     },
    409     {
    410       "title": "Too Noisy to Learn: Enhancing Data Quality for Code Review Comment Generation",
    411       "authors": ["Chunhua Liu", "Hong Yi Lin", "Patanamon Thongtanunam"],
    412       "year": 2025,
    413       "relevance": "Identifies noise issues in code review datasets; by some of the same authors, directly related to the data quality motivation of CodeReviewQA."
    414     }
    415   ]
    416 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs