scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26184B)
      1 {
      2   "paper": {
      3     "title": "Auto-Cypher: Improving LLMs on Cypher generation via LLM-supervised generation-verification framework",
      4     "authors": [
      5       "Aman Tiwari",
      6       "Shiva Krishna Reddy Malay",
      7       "Vikas Yadav",
      8       "Masoud Hashemi",
      9       "Sathwik Tejaswi Madhusudhan"
     10     ],
     11     "year": 2024,
     12     "venue": "arXiv preprint",
     13     "arxiv_id": "2412.12612"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "No GitHub link or code repository is provided. The paper mentions a HuggingFace dataset link (https://huggingface.co/datasets/ServiceNow-AI/SynthCypher) but does not release the pipeline code, training scripts, or evaluation code."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The SynthCypher dataset is publicly available on HuggingFace at https://huggingface.co/datasets/ServiceNow-AI/SynthCypher, as stated on the first page of the paper."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No requirements.txt, Dockerfile, conda environment file, or detailed environment setup section is provided. The paper mentions training hyperparameters (learning rate, batch size, epochs) but not the software environment or library versions."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No step-by-step reproduction instructions, README with commands, or scripts to replicate experiments are provided. The pipeline is described at a high level, and prompts are in the appendix, but there are no instructions for a researcher to reproduce the full pipeline or training runs."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "Table 2 and Figure 3 report only point estimates (e.g., '71.4' accuracy) with no confidence intervals or error bars."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper claims SynthCypher leads to 'significant improvements' and '40% absolute improvement' but provides no statistical significance tests (no p-values, t-tests, or similar). Comparisons are based solely on comparing point estimates."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "The paper reports absolute percentage improvements with baseline context, e.g., 'up to 40% accuracy improvement on 7B & 8B models' and Table 2 shows both baseline and fine-tuned scores (e.g., Llama-3.1-8B goes from 30.9 to 71.4 on SynCy-test), allowing readers to assess the magnitude of improvement."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No justification is provided for the training set size of 29.8k or test set size of 2k/4k. The paper includes no power analysis or discussion of whether these sizes are adequate for the claims made."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No standard deviation, variance, or spread measures are reported across experimental runs. Results appear to be from single training runs with no indication of multiple seeds or runs."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The paper includes three baseline setups: (1) Base IFT (models fine-tuned on generic instruction data without SynthCypher) and (2) off-the-shelf Instruct model versions, both in Table 2, and (3) a comparison with the Neo4j Labs (tomasonjo_gpt4o) dataset in Figure 3."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The baselines use contemporary models (Llama 3.1, Qwen 2.5, Mistral v0.2) and the only other existing Text2Cypher dataset (Neo4j Labs/tomasonjo_gpt4o from 2024). The paper acknowledges that Text2Cypher is an underexplored area with limited prior work."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "No ablation study is presented. The pipeline has multiple components (schema generation, LLM-As-Database-Filler, chain-of-thought Cypher generation, validation), but no experiments isolate the contribution of individual components. For example, no experiment shows what happens without the validation step or without the database-filling strategy."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "Only a single metric is used: LLM-as-Judge exact match (GPT-4o gives 1 if all requested information is present, 0 otherwise). No other metrics such as syntactic validity rate, execution success rate, or partial credit are reported."
     85       },
     86       "human_evaluation": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No human evaluation of the generated Cypher queries or model outputs is performed. The only human involvement is manual vetting of 25% of generated schemas during data creation (Appendix B.2), which evaluates the dataset itself, not the fine-tuned model's outputs."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Section 4 states: 'For testing, we employed a separate dataset of 4k samples, covering all 109 query types across 165 schemas not included in train.' Additionally, the SPIDER-Cypher benchmark is a separate external test set."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "Despite having 109 query types of varying complexity, the paper only reports overall accuracy numbers in Table 2 and Figure 3. No per-query-type or per-complexity-level breakdown is provided."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "The paper provides no error analysis, qualitative examples of failures, or discussion of where the fine-tuned models produce incorrect Cypher queries. It only shows aggregate success rates."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": false,
    109         "justification": "Every experiment shows improvement from SynthCypher training. No approaches that were tried and abandoned, or configurations that failed, are discussed. No negative results or unsuccessful variations are reported."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract claims 'up to 40% on the Text2Cypher test split and 30% on the SPIDER benchmark.' Table 2 supports these: Llama-3.1-8B improves from 30.9 to 71.4 on SynCy-test (~40%) and from 30.8 to 62.2 on SPIDER (~31%). The other abstract claims about pipeline quality and dataset diversity are supported by the methodology description."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The paper's causal claims are of the form 'fine-tuning on SynthCypher improves performance.' This is supported by a controlled comparison: same models fine-tuned with vs. without SynthCypher data (Table 2, comparing Base IFT vs. Base + SynCy rows). This is a standard ablation-style controlled comparison adequate for this type of causal claim."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The title 'Improving LLMs on Cypher generation' is broader than what was tested. Results are only for 7B/8B parameter models; the Limitations section acknowledges 'we have limited this research to smaller models and it is not clear if the same strategy would work on larger models,' but neither the title nor the abstract bounds claims to this model size range."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "No discussion of alternative explanations for the improvements. For example, no consideration of whether the improvement comes from the additional training data volume rather than data quality, or whether the LLM-as-Judge metric might favor certain query styles. The Limitations section discusses bias risks and model size constraints but not alternative explanations for the observed results."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "Some models are partially versioned (e.g., 'Mistral-v0.2-7B', 'Llama-3.1-8B', 'Qwen2-7B', 'QwenCoder-2.5-7B'). However, 'GPT-4' used in the pipeline (Section 3) and 'GPT-4o' used as LLM-as-Judge have no snapshot date or API version specified. These are critical components whose behavior varies across versions."
    139       },
    140       "prompts_provided": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The appendix (Figures 4-14) provides the full prompt text for all major pipeline steps: schema generation (Fig 4-5), question generation (Fig 6), ground truth generation (Fig 7), database infilling (Fig 8-9), Cypher generation steps (Fig 10-13), and the LLM-as-Judge prompt (Fig 14). These are actual prompts with placeholders whose fill values are specified."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "Training hyperparameters are partially reported: 'learning rate=1e-05, batch size of 128 over three epochs' (Section 4). However, temperature/sampling settings for GPT-4 and GPT-4o used in the pipeline and evaluation are not stated, nor are temperature settings for Mixtral during schema/question generation (Appendix B mentions 'higher temperature (0.8)' for domain generation only)."
    149       },
    150       "scaffolding_described": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The multi-step pipeline is described in detail in Section 3 (Steps 1-5) and the appendix. The chain-of-thought Cypher generation process using four sequential LLM calls (analyze question, identify nodes, incorporate best practices, generate final Cypher) is documented in Appendix F and Figures 10-13."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "The full data generation pipeline is documented in Section 3 with five explicit steps. Table 1 shows final dataset sizes (29,838 train, 2,000 test). The validation step describes how queries are retried up to 5 times before discarding, and only execution-correct queries are retained. Schema generation, question generation, and database population processes are all documented with prompts in the appendix."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section 7 'Limitations' is present with substantive discussion of synthetic data biases, model size limitations, and benchmark contamination risks."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The Limitations section (Section 7) discusses specific threats: (1) synthetic data may reinforce model biases and not model real-world scenarios, (2) research is limited to smaller (7B/8B) models and may not transfer to larger models, (3) SPIDER test dataset was publicly released and may be in pre-training data. These are specific to this study."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "While the Limitations section mentions smaller models and synthetic data risks, it does not explicitly state what the results do NOT show. For example, it does not state that results do not generalize to production Neo4j databases with real (not synthetic) schemas, or that the LLM-as-Judge metric may not capture all quality dimensions."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The SynthCypher dataset is publicly available on HuggingFace (https://huggingface.co/datasets/ServiceNow-AI/SynthCypher), allowing independent verification of the training and test data."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "The data collection procedure is thoroughly described in Section 3 (Steps 1-5) and Appendices B-F, covering schema generation, question generation, ground truth generation, database population, Cypher generation, and validation."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": false,
    191         "answer": false,
    192         "justification": "No human participants. Data is synthetically generated using LLMs and standard benchmark datasets."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The full pipeline from schema generation through validation is documented in Section 3 with five explicit steps. Table 1 shows final dataset counts. The paper explains that only queries producing correct results are retained after up to 5 retries. However, the exact attrition numbers at each step (how many were generated vs. retained) are not fully reported."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "No funding source or acknowledgments section is present in the paper. All authors are from ServiceNow, but no funding disclosure is made."
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "All five authors are listed as affiliated with ServiceNow, with ServiceNow email addresses, clearly stated on the first page."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "All authors are ServiceNow employees. ServiceNow offers graph database products and AI capabilities, giving them a potential commercial interest in demonstrating strong Text2Cypher performance. No statement of funder independence is made."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No competing interests statement, patent disclosures, or financial interest declarations are present in the paper."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No training data cutoff dates are stated for any of the models used (Llama 3.1, Mistral, Qwen, CodeLlama, GPT-4, GPT-4o). This is relevant because the models are evaluated on SPIDER-Cypher, derived from the public SPIDER benchmark."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "The Limitations section (Section 7) states: 'SPIDER test dataset has been publicly released as of Feb-2024 and it is not clear if any of that data went into the pre-training of base models or the Instruct models we considered.' This explicitly flags potential train/test overlap."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "While the paper acknowledges the SPIDER contamination risk in the Limitations section, no mitigation steps are taken (e.g., no canary strings, no temporal analysis, no comparison with contamination-safe benchmarks). The acknowledgment is present but the contamination is not addressed — just flagged."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants in this study."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants in this study."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No inference costs, latency, or per-example costs are reported for either the data generation pipeline (which uses GPT-4 and Mixtral) or the fine-tuned models at inference time."
    281       },
    282       "compute_budget_stated": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "No total computational budget is stated. The paper does not report GPU hours for training, API costs for GPT-4/GPT-4o calls during data generation and evaluation, or hardware specifications used."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "SynthCypher fine-tuning leads to up to 40% absolute accuracy improvement on the Text2Cypher test split over base IFT models.",
    292       "evidence": "Table 2 shows Llama-3.1-8B improves from 30.9 (Base IFT) to 71.4 (Base + SynCy) on SynCy-test, an absolute improvement of 40.5 percentage points. Similar improvements are seen across other models (Section 5).",
    293       "supported": "moderate"
    294     },
    295     {
    296       "claim": "SynthCypher fine-tuning leads to up to 30% improvement on the SPIDER benchmark adapted for Cypher.",
    297       "evidence": "Table 2 shows Llama-3.1-8B improves from 30.8 (Base IFT) to 62.2 (Base + SynCy) on SPIDER, an absolute improvement of 31.4 percentage points (Section 5).",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "SynthCypher outperforms the Neo4j Labs (tomasonjo_gpt4o) dataset when training data size is controlled.",
    302       "evidence": "Figure 3 shows Llama-3.1-8B fine-tuned on down-sampled SynthCypher (~7.7k) outperforms the same model fine-tuned on Neo4j Labs data (7.7k) on both SynthCypher and SPIDER test splits (Section 5).",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "Off-the-shelf instruct LLMs and base IFT models achieve very low performance on Text2Cypher, demonstrating the need for task-specific data.",
    307       "evidence": "Table 2 shows instruct models range from 27.7% to 40.2% on SynCy-test and 25.2% to 50.8% on SPIDER. Base IFT models range from 14.6% to 50.85% on SynCy-test (Section 5).",
    308       "supported": "strong"
    309     },
    310     {
    311       "claim": "The LLM-As-Database-Filler is a novel strategy not explored in prior literature.",
    312       "evidence": "Section 3, Step 3 states: 'To the best of our knowledge, this strategy of filling the database conditioned on an arbitrarily chosen dummy ground truth has not been explored in literature before.' No prior work citation contradicts this claim.",
    313       "supported": "weak"
    314     }
    315   ],
    316   "methodology_tags": [
    317     "benchmark-eval"
    318   ],
    319   "key_findings": "Auto-Cypher presents a synthetic data generation pipeline for Text2Cypher that produces the SynthCypher dataset (29.8k training instances across 700 domains and 109 query types). Fine-tuning 7B/8B LLMs on SynthCypher yields up to 40% absolute accuracy improvement on the Text2Cypher test set and 30% on an adapted SPIDER benchmark, compared to models fine-tuned only on generic instruction data. The pipeline's key innovation is the LLM-As-Database-Filler strategy, which generates synthetic Neo4j databases conditioned on expected answers to validate Cypher query correctness through execution.",
    320   "red_flags": [
    321     {
    322       "flag": "No variance or error bars",
    323       "detail": "All results appear to be single-run numbers with no standard deviations, confidence intervals, or multiple seeds. Given the well-known variance in LLM fine-tuning outcomes across random seeds, the reported improvements may not be robust."
    324     },
    325     {
    326       "flag": "Single evaluation metric",
    327       "detail": "The only metric is an LLM-as-Judge (GPT-4o) binary match. This metric itself is not validated against human judgment for Cypher queries, and it may systematically favor certain query styles or miss partial correctness."
    328     },
    329     {
    330       "flag": "No ablation study",
    331       "detail": "The pipeline has multiple components (schema diversity, database filling, chain-of-thought generation, execution validation), but no ablation study isolates which components drive the improvement. The improvement could come simply from having more in-domain training data rather than from the pipeline's specific innovations."
    332     },
    333     {
    334       "flag": "Evaluating on own synthetic test set",
    335       "detail": "The primary evaluation (SynCy-test) is on a test split of the authors' own synthetically generated dataset, using the same pipeline that generated the training data. The test set may share distributional biases with the training set, inflating reported performance. The SPIDER-Cypher results partially mitigate this concern."
    336     },
    337     {
    338       "flag": "Undisclosed costs",
    339       "detail": "The pipeline uses GPT-4 for multiple steps (schema validation, database population code, Cypher generation, validation) and GPT-4o as the evaluation judge, but no API costs are reported. This is relevant for assessing whether the pipeline is practically scalable."
    340     },
    341     {
    342       "flag": "Corporate affiliation without conflict disclosure",
    343       "detail": "All authors are ServiceNow employees. ServiceNow has commercial interests in AI and data management products. No competing interests statement or conflict of interest disclosure is present."
    344     }
    345   ],
    346   "cited_papers": [
    347     {
    348       "title": "Can LLM Already Serve as a Database Interface? A Big Bench for Large-Scale Database Grounded Text-to-SQLs",
    349       "authors": ["Jinyang Li", "Binyuan Hui", "Ge Qu"],
    350       "year": 2023,
    351       "arxiv_id": "2305.03111",
    352       "relevance": "Major benchmark (BIRD) for evaluating LLMs on text-to-SQL, directly relevant to understanding LLM code generation capabilities for database queries."
    353     },
    354     {
    355       "title": "Spider: A Large-Scale Human-Labeled Dataset for Complex and Cross-Domain Semantic Parsing and Text-to-SQL Task",
    356       "authors": ["Tao Yu", "Rui Zhang", "Kai Yang"],
    357       "year": 2018,
    358       "relevance": "Foundational text-to-SQL benchmark adapted for Cypher evaluation in this paper; widely used in code generation evaluation."
    359     },
    360     {
    361       "title": "GPT-4 Technical Report",
    362       "authors": ["OpenAI"],
    363       "year": 2023,
    364       "arxiv_id": "2303.08774",
    365       "relevance": "GPT-4 is used as a core component of the data generation pipeline, relevant to understanding LLM capabilities in code generation tasks."
    366     },
    367     {
    368       "title": "WizardCoder: Empowering Code Large Language Models with Evol-Instruct",
    369       "authors": ["Ziyang Luo", "Can Xu", "Pu Zhao"],
    370       "year": 2023,
    371       "arxiv_id": "2306.08568",
    372       "relevance": "Influential code LLM training approach using synthetic instruction evolution, relevant to LLM code generation methodology."
    373     },
    374     {
    375       "title": "Mistral 7B",
    376       "authors": ["Albert Q Jiang", "Alexandre Sablayrolles", "Arthur Mensch"],
    377       "year": 2023,
    378       "arxiv_id": "2310.06825",
    379       "relevance": "One of the base models fine-tuned in this work; relevant as a widely-used open-source LLM for code tasks."
    380     },
    381     {
    382       "title": "The Llama 3 Herd of Models",
    383       "authors": ["Laurens Van Der Maaten et al."],
    384       "year": 2024,
    385       "arxiv_id": "2407.21783",
    386       "relevance": "Llama 3.1 is the best-performing base model in the paper's experiments; relevant as a state-of-the-art open-source LLM."
    387     },
    388     {
    389       "title": "Qwen2.5-Coder Technical Report",
    390       "authors": ["Binyuan Hui", "Jian Yang", "Zeyu Cui"],
    391       "year": 2024,
    392       "arxiv_id": "2409.12186",
    393       "relevance": "QwenCoder is one of the code-specialized models evaluated, relevant to understanding specialized code LLMs."
    394     },
    395     {
    396       "title": "Mixtral of Experts",
    397       "authors": ["Albert Q. Jiang", "Alexandre Sablayrolles"],
    398       "year": 2024,
    399       "arxiv_id": "2401.04088",
    400       "relevance": "Mixtral 8x22B is used for schema and question generation in the pipeline; relevant as a mixture-of-experts architecture for code generation."
    401     },
    402     {
    403       "title": "Training Language Models to Follow Instructions with Human Feedback",
    404       "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"],
    405       "year": 2022,
    406       "relevance": "Foundational RLHF paper relevant to understanding instruction-following capabilities of LLMs used in code generation."
    407     },
    408     {
    409       "title": "SyntheT2C: Generating Synthetic Data for Fine-Tuning Large Language Models on the Text2Cypher Task",
    410       "authors": ["Ziije Zhong", "Linqing Zhong", "Zhaoze Sun"],
    411       "year": 2024,
    412       "arxiv_id": "2406.10710",
    413       "relevance": "Concurrent work on synthetic Text2Cypher data generation; directly comparable approach limited to medical domain."
    414     },
    415     {
    416       "title": "A Survey on Employing Large Language Models for Text-to-SQL Tasks",
    417       "authors": ["Liang Shi", "Zhengju Tang", "Nan Zhang"],
    418       "year": 2024,
    419       "arxiv_id": "2407.15186",
    420       "relevance": "Survey of LLM text-to-SQL approaches, relevant to understanding the broader landscape of LLM-based database query generation."
    421     }
    422   ]
    423 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs