scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25344B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Decomposed Prompting: A Modular Approach for Solving Complex Tasks",
      6     "authors": [
      7       "Tushar Khot",
      8       "Harsh Trivedi",
      9       "Matthew Finlayson",
     10       "Yao Fu",
     11       "Kyle Richardson",
     12       "Peter Clark",
     13       "Ashish Sabharwal"
     14     ],
     15     "year": 2022,
     16     "venue": "International Conference on Learning Representations",
     17     "arxiv_id": "2210.02406",
     18     "doi": "10.48550/arXiv.2210.02406"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "All abstract claims about DECOMP outperforming prior few-shot prompting on symbolic and textual tasks are backed by Figures 7-16 across 8 datasets; modular structure, recursive decomposition, and symbolic integration are all demonstrated empirically.",
     26         "source": "haiku"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The 'CoT w/ rollout' ablation uses the identical reasoning procedure as DECOMP but in a monolithic prompt, isolating modularization as the causal factor; alternative decomposition schemes in Appendix E further support robustness.",
     32         "source": "haiku"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The title and conclusion claim DECOMP as a general approach for 'complex tasks' but evaluations cover only 8 NLP benchmarks; no explicit discussion of where DECOMP would not generalize.",
     38         "source": "haiku"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper does not discuss whether improvements stem from higher-quality prompt engineering for DECOMP, greater computation per query, or other confounds beyond modular structure.",
     44         "source": "haiku"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Exact Match and Answer F1 are used as direct measures of task correctness and match the granularity of the claims; no conflation of measurement with broader capabilities.",
     50         "source": "haiku"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "There is no dedicated limitations or threats-to-validity section; the conclusion paragraph is brief and does not systematically discuss shortcomings.",
     58         "source": "haiku"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No specific threats are discussed, such as sensitivity to prompt wording choices, benchmark contamination in GPT-3 training, or limited dataset diversity.",
     64         "source": "haiku"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The paper does not explicitly state what results do not show or which task types DECOMP would be unsuitable for.",
     70         "source": "haiku"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Acknowledgements state: 'This work was supported in part by the National Science Foundation under grants IIS2007290.'",
     78         "source": "haiku"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Author affiliations are clearly listed on the title page: Allen Institute for AI, Stony Brook University, and University of Edinburgh.",
     84         "source": "haiku"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "NSF is an independent government funding agency with no financial stake in whether DECOMP outperforms CoT prompting.",
     90         "source": "haiku"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests statement, patent disclosures, or equity declarations appear anywhere in the paper.",
     96         "source": "haiku"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 3 formally defines 'decomposer,' 'sub-task handler,' 'prompting program,' and the inference procedure with mathematical notation (P = (f1,Q1,A1),...) and illustrative figures.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper explicitly states it contributes DECOMP, a new modular prompting approach supporting hierarchical decomposition, recursion, and symbolic module integration.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 2 engages substantively with CoT, Least-to-Most, Successive Prompting, and Neural Modular Networks, explaining specifically how DECOMP differs from and extends each approach.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": true,
    126           "justification": "Footnote 1 states: 'Datasets, Code and Prompts available at https://github.com/allenai/DecomP.'",
    127           "source": "haiku"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "All benchmarks used (HotpotQA, 2WikiMultihopQA, MuSiQue, CommaQA, GSM8K, MultiArith) are standard publicly available datasets.",
    133           "source": "haiku"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "No requirements.txt, Dockerfile, or software dependency specifications are mentioned; only model names are identified.",
    139           "source": "haiku"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": true,
    144           "justification": "Appendix G reproduces all prompts verbatim across 50+ pages, Section 3.2 describes the inference procedure step-by-step with Figure 3, and code is released on GitHub.",
    145           "source": "haiku"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "Results are point estimates averaged over 3 prompts; no standard deviations, confidence intervals, or error bars are reported anywhere in the paper.",
    153           "source": "haiku"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "No statistical significance tests are applied despite multiple comparative claims between DECOMP and baselines.",
    159           "source": "haiku"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Absolute score differences are reported throughout (e.g., 14-17 pt math QA improvement, EM going from 22.7% to 98% for letter concatenation at N=3).",
    165           "source": "haiku"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "Sample sizes (100, 200, 300 examples) are chosen for API cost reasons without power analysis or formal justification.",
    171           "source": "haiku"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "Results are averaged over 3 prompts but standard deviation is not reported; Appendix D shows per-prompt results without variance statistics.",
    177           "source": "haiku"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Multiple baselines: standard prompting, CoT, CoT w/ rollout, Least-to-Most w/ rollout; for open-domain QA also no-context and no-decomposition retrieval baselines.",
    185           "source": "haiku"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "CoT (Wei et al., 2022) and Least-to-Most (Zhou et al., 2023) were the leading few-shot prompting approaches at the time of submission.",
    191           "source": "haiku"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "'CoT w/ rollout' ablation uses DECOMP's identical reasoning steps in a single prompt to isolate the effect of modularity; Appendix E tests alternative decomposition schemes.",
    197           "source": "haiku"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Exact Match is used for symbolic and CommaQA tasks; Answer F1 is used for open-domain QA datasets; task-appropriate metrics throughout.",
    203           "source": "haiku"
    204         },
    205         "human_evaluation": {
    206           "applies": false,
    207           "answer": false,
    208           "justification": "Human evaluation is not applicable; all tasks use standard automated metrics on NLP benchmarks with ground-truth answers.",
    209           "source": "haiku"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "For open-domain QA, results are on '300 held-out dev questions in each dataset' separate from the 100-question hyperparameter tuning set; symbolic tasks use separate test sets.",
    215           "source": "haiku"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Results broken down by dataset (8 datasets), input length (N=3,4,5 for letter concatenation), and decomposition granularity (coarse vs. fine for CommaQA); per-prompt breakdowns in Appendix D.",
    221           "source": "haiku"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Appendix F provides explicit error analysis with concrete examples of failure modes for both DECOMP and CoT on letter concatenation and CommaQA tasks.",
    227           "source": "haiku"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": true,
    232           "justification": "The paper reports DECOMP is only 'comparable' to the retrieval baseline on HotpotQA with Codex, and that performance drops to near-zero for smaller models (curie-001).",
    233           "source": "haiku"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "Specific model identifiers are given: text-davinci-002, code-davinci-002, davinci-001, text-curie-001, Flan-T5-Large/XL/XXL with parameter counts (0.7B, 3B, 11B).",
    241           "source": "haiku"
    242         },
    243         "prompts_provided": {
    244           "applies": true,
    245           "answer": true,
    246           "justification": "Appendix G provides all prompts verbatim — decomposer prompts and every sub-task handler prompt for every task — covering 50+ pages of the paper.",
    247           "source": "haiku"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": false,
    252           "justification": "Temperature, top-p, and other generation hyperparameters are not reported; only the retrieval count K is described as a tuned hyperparameter.",
    253           "source": "haiku"
    254         },
    255         "scaffolding_described": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "Section 3.2 and Figure 3 describe the inference procedure in detail: how the controller iteratively passes inputs/outputs between the decomposer and sub-task handlers until EOQ.",
    259           "source": "haiku"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "Appendix A describes retrieval corpus creation (430,225 paragraphs for 2WikiMultihopQA, 139,416 for MuSiQue) and CommaQA truncation to fit GPT-3 context limits.",
    265           "source": "haiku"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": true,
    272           "justification": "All benchmarks (HotpotQA, 2WikiMultihopQA, MuSiQue, CommaQA, GSM8K, MultiArith) are publicly available; code for generating test examples is released.",
    273           "source": "haiku"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "Symbolic task test examples described (names from popularity lists, 100 examples per condition); open-domain QA corpus construction from train/dev/test paragraphs described in Appendix A.",
    279           "source": "haiku"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "No human participants were recruited; all evaluation uses standard NLP benchmarks.",
    285           "source": "haiku"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": true,
    290           "justification": "The pipeline from corpus creation through hyperparameter tuning on a 100-question held-out set to final evaluation on 300 questions is described in Appendix A.",
    291           "source": "haiku"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "Training data cutoffs for GPT-3 (text-davinci-002, code-davinci-002) are not stated in the paper.",
    299           "source": "haiku"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "Potential overlap between GPT-3 training data and benchmark test sets is not discussed anywhere in the paper.",
    305           "source": "haiku"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "Several benchmarks (HotpotQA 2018, MultiArith 2015) were publicly available before GPT-3's training cutoff; this contamination risk is not acknowledged.",
    311           "source": "haiku"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants involved.",
    319           "source": "haiku"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants involved.",
    325           "source": "haiku"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants involved.",
    331           "source": "haiku"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants involved.",
    337           "source": "haiku"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants involved.",
    343           "source": "haiku"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants involved.",
    349           "source": "haiku"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participants involved.",
    355           "source": "haiku"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "API costs are acknowledged implicitly (subsampling to 300/200 examples 'due to costs') but no actual cost figures or call counts are reported.",
    363           "source": "haiku"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "No total compute budget, API call counts, or wall-clock time estimates are provided.",
    369           "source": "haiku"
    370         }
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "DECOMP outperforms CoT and Least-to-Most prompting on kth letter concatenation, particularly for longer inputs (N=4,5 words)",
    377       "evidence": "Figure 7: DECOMP achieves 96-98% EM across N=3,4,5 vs. CoT 22.7/12.0/6.0% and L2M 74.7/70.5/66.0%",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "Recursive DECOMP enables length generalization for list reversal far beyond what CoT achieves",
    382       "evidence": "Figure 8: DECOMP achieves 42% EM at N=10 items vs. CoT 4.5%; base CoT 'does not generalize at all to longer sequences'",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "DECOMP outperforms CoT on long-context multi-hop QA (CommaQA-E) including compositional generalization",
    387       "evidence": "Figure 10: DecomP(fine) 64.2% vs. CoT 55% on IID; 59.7% vs. 33.8% on compositional generalization split",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "DECOMP with retrieval (Decomp-Ctxt) outperforms retrieval baselines on open-domain multi-hop QA",
    392       "evidence": "Figure 12: Decomp-Ctxt outperforms NoDecomp-Ctxt on MuSiQue and 2WikiMultihopQA; HotpotQA with Codex is 'comparable' rather than better",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "DECOMP-based error correction improves CoT math QA by 14-17 points through a targeted answer-extraction sub-task",
    397       "evidence": "Figure 16: GSM8K 36.0→50.7% (+14.7), MultiArith 78.0→95.0% (+17) by adding a GPT-3 answer-extraction sub-module",
    398       "supported": "strong"
    399     },
    400     {
    401       "claim": "Modular structure itself (not just the reasoning procedure) drives DECOMP's improvements over CoT",
    402       "evidence": "Figure 7: CoT w/ rollout (same reasoning, monolithic) scores 74.7/70.5/66.0% vs. DECOMP 98/96/97% for N=3,4,5; rolled-out reasoning fails without modularity",
    403       "supported": "strong"
    404     }
    405   ],
    406   "methodology_tags": [
    407     "benchmark-eval"
    408   ],
    409   "key_findings": "Decomposed Prompting (DECOMP) outperforms Chain-of-Thought and Least-to-Most prompting across symbolic reasoning and multi-hop QA tasks by decomposing complex tasks into modular sub-tasks with dedicated few-shot prompts. The central finding is that separate sub-task prompts are more effective than unrolling the same reasoning steps into a single CoT — demonstrating that modularity itself drives improvements, not just the reasoning procedure. DECOMP uniquely enables recursive decomposition for length generalization on list reversal, hierarchical decomposition for sub-tasks too hard for few-shot prompting, and seamless integration of symbolic systems like ElasticSearch for open-domain QA; it also achieves 14-17 point gains on math QA through targeted error-correction post-processing.",
    410   "red_flags": [
    411     {
    412       "flag": "No confidence intervals or significance tests",
    413       "detail": "All results are point estimates averaged over 3 prompts with no standard deviations or statistical significance testing, making it impossible to assess whether improvements are reliable."
    414     },
    415     {
    416       "flag": "Benchmark contamination unaddressed",
    417       "detail": "Several benchmarks (HotpotQA 2018, MultiArith 2015) were publicly available before GPT-3's training cutoff; no discussion of potential contamination."
    418     },
    419     {
    420       "flag": "Subsampled evaluation due to API costs",
    421       "detail": "GSM8K subsampled to 300 examples and MultiArith to 200 'due to costs with API usage' without power analysis; may reduce result reliability."
    422     },
    423     {
    424       "flag": "No limitations section",
    425       "detail": "The paper lacks any dedicated discussion of limitations, of failure modes beyond the error analysis in Appendix F, or of conditions under which DECOMP would be expected to underperform."
    426     },
    427     {
    428       "flag": "Generation hyperparameters unreported",
    429       "detail": "Temperature, top-p, and other generation hyperparameters for GPT-3 API calls are not reported, impeding exact reproduction."
    430     }
    431   ],
    432   "cited_papers": [
    433     {
    434       "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models",
    435       "relevance": "Primary baseline and motivation; DECOMP is explicitly designed to overcome CoT's limitations on complex multi-step tasks"
    436     },
    437     {
    438       "title": "Least-to-Most Prompting Enables Complex Reasoning in Large Language Models",
    439       "relevance": "Closest related work; directly compared as baseline with rollout variant; DECOMP differs by allowing non-linear decomposition structures"
    440     },
    441     {
    442       "title": "Language Models are Few-Shot Learners (GPT-3)",
    443       "relevance": "Foundation model used throughout experiments; establishes the few-shot in-context learning paradigm DECOMP builds on"
    444     },
    445     {
    446       "title": "Successive Prompting for Decomposing Complex Questions",
    447       "relevance": "Related decomposition approach; DECOMP extends with diverse and recursive decomposition structures beyond sequential question generation"
    448     },
    449     {
    450       "title": "PAL: Program-aided Language Models",
    451       "relevance": "Related work on integrating symbolic computation with LLM reasoning; context for DECOMP's symbolic module integration"
    452     },
    453     {
    454       "title": "MuSiQue: Multi-hop Questions via Single-hop Question Composition",
    455       "relevance": "Key evaluation benchmark for multi-hop open-domain QA"
    456     },
    457     {
    458       "title": "HotpotQA: A Dataset for Diverse, Explainable Multi-hop Question Answering",
    459       "relevance": "Key evaluation benchmark; results show DECOMP comparable but not clearly better than retrieval baseline with Codex"
    460     },
    461     {
    462       "title": "Text Modular Networks: Learning to Decompose Tasks in the Language of Existing Models",
    463       "relevance": "Direct precursor to DECOMP using supervised training for decomposition; DECOMP replaces supervised next-question generator with few-shot LLM"
    464     },
    465     {
    466       "title": "Training Verifiers to Solve Math Word Problems (GSM8K)",
    467       "relevance": "Math QA benchmark on which DECOMP's error-correction step improves CoT accuracy by 14.7 points (36.0→50.7%)"
    468     },
    469     {
    470       "title": "Training Language Models to Follow Instructions with Human Feedback (InstructGPT)",
    471       "relevance": "Primary model (text-davinci-002) used in most experiments"
    472     }
    473   ],
    474   "engagement_factors": {
    475     "practical_relevance": {
    476       "score": 3,
    477       "justification": "Code and all prompts released on GitHub; technique directly usable by any developer with GPT-3 API access; demonstrates improvements on real NLP tasks."
    478     },
    479     "surprise_contrarian": {
    480       "score": 2,
    481       "justification": "Counterintuitive finding that modular prompts outperform CoT even when CoT uses identical reasoning steps (rollout); modularity matters independently of the reasoning procedure."
    482     },
    483     "fear_safety": {
    484       "score": 0,
    485       "justification": "No AI safety or risk concerns raised; purely a performance improvement paper on NLP benchmarks."
    486     },
    487     "drama_conflict": {
    488       "score": 1,
    489       "justification": "Mild competitive framing against CoT which was a dominant paradigm at the time; no major controversy."
    490     },
    491     "demo_ability": {
    492       "score": 3,
    493       "justification": "Code on GitHub, all prompts provided in the paper appendix; can be replicated with GPT-3 API access; worked examples in paper are immediately tryable."
    494     },
    495     "brand_recognition": {
    496       "score": 2,
    497       "justification": "Allen Institute for AI (AI2) is a well-known NLP research lab; uses GPT-3 (text-davinci-002) which was the flagship model at publication time."
    498     }
    499   },
    500   "hn_data": {
    501     "threads": [
    502       {
    503         "hn_id": "37816614",
    504         "title": "Language Agent Tree Search Unifies Reasoning Acting and Planning in LMs",
    505         "points": 79,
    506         "comments": 11,
    507         "url": "https://news.ycombinator.com/item?id=37816614",
    508         "created_at": "2023-10-09T03:24:13Z"
    509       },
    510       {
    511         "hn_id": "25773418",
    512         "title": "Adversarial Grammatical Error Correction",
    513         "points": 3,
    514         "comments": 0,
    515         "url": "https://news.ycombinator.com/item?id=25773418",
    516         "created_at": "2021-01-14T07:48:57Z"
    517       },
    518       {
    519         "hn_id": "33182502",
    520         "title": "Code Librarian: A Software Package Recommendation System",
    521         "points": 2,
    522         "comments": 0,
    523         "url": "https://news.ycombinator.com/item?id=33182502",
    524         "created_at": "2022-10-12T20:19:58Z"
    525       },
    526       {
    527         "hn_id": "39202830",
    528         "title": "Low-Resource Languages Jailbreak GPT-4",
    529         "points": 1,
    530         "comments": 0,
    531         "url": "https://news.ycombinator.com/item?id=39202830",
    532         "created_at": "2024-01-31T12:11:05Z"
    533       }
    534     ],
    535     "top_points": 79,
    536     "total_points": 85,
    537     "total_comments": 11
    538   }
    539 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs