scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25860B)
      1 {
      2   "paper": {
      3     "title": "LimAgents: Multi-Agent LLMs for Generating Research Limitations",
      4     "authors": ["Ibrahim Al Azher", "Zhishuai Guo", "Hamed Alhoori"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2601.11578",
      8     "doi": "10.48550/arXiv.2601.11578"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "GitHub link provided in the abstract: https://github.com/IbrahimAlAzhar/LimAgents."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "HuggingFace dataset link provided in the abstract: https://huggingface.co/datasets/iaadlab/LimAgents."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No environment specifications, requirements files, or dependency details are mentioned in the paper."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions are provided in the paper. The code link is given but no README or reproduction guide is described."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "All results in Tables I-VII are reported as point estimates with no confidence intervals or error bars."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper claims improvements (e.g., '+15.51%', '+4.41%') based solely on comparing numbers with no statistical significance tests."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Effect sizes are reported as percentage point improvements over baselines with context, e.g., '+15.51 point improvement over its zero-shot baseline' (from 49.43% to 64.94%), '+4.41 point improvement over the zero-shot baseline (62.04%)' in Section VII."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The dataset comprises 2,700 NeurIPS papers but no justification is given for why this number was chosen or whether it provides sufficient statistical power."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No variance, standard deviation, or spread measures are reported for any experimental results. It is unclear whether experiments were run multiple times."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Zero-shot prompting baselines are included for both Llama 3 8B and GPT-4o mini (Table I)."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "The only baselines are zero-shot prompting with the same models. No comparison against other limitation generation systems (e.g., BAGELS framework [1], LimGen [14], or other multi-agent review systems like AgentReview or MARG) is provided."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Extensive ablation studies in Section VIII: agent quantity, citation agent context, input granularity, core agent contributions (Tables VI, VII), feedback iterations, and chunk enrichment."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Multiple metrics used: Ground Truth Coverage, ROUGE-L, BLEU, Cosine Similarity, Jaccard Similarity, plus qualitative metrics (Faithfulness, Soundness, Importance)."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Human evaluation conducted: 500-sample annotation study with 3 graduate students for data extraction verification, and a 100-pair validation study with 2 independent annotators for LLM-as-Judge reliability (Sections III and V.A)."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No mention of train/dev/test splits. It is unclear whether the evaluation was performed on data separate from any tuning or development decisions."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Per-agent breakdowns provided in Tables IV, V, VI showing performance of individual agents across multiple quality dimensions."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Failure cases extensively discussed: Graph Agent caused -50.74 point performance collapse, Gemini 1.5 Flash failed in Extractor role, DeepSeek was incompatible, 9-agent setup was counterproductive, feedback degraded Llama 3 performance (Sections VII-VIII)."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
     106         "justification": "Multiple negative results reported: the feedback loop reduced coverage by 12.62 points (Table II), the 9-agent config scored 12.23 points below the zero-shot baseline, the Graph Agent caused a 50.74-point drop, dual-stage RAG was unsuccessful, and specialized agents consistently underperformed (Sections VII-VIII)."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims of '+15.51% coverage gain' for GPT-4o mini 4-agent and '+4.41% improvement' for Llama 3 8B 3-agent are supported by Table I results."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": false,
    118         "justification": "The paper makes causal claims like 'agent-based design distributes complex reasoning' and attributes improvements to the framework design. However, as an observational comparison (not controlled for confounds like prompt length, number of LLM calls, or token budget), these causal attributions are not adequately justified."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The title and abstract frame this as a general framework for 'generating research limitations' but experiments are limited to NeurIPS 2022-2023 papers with only two models (Llama 3 8B and GPT-4o mini). The limitations section acknowledges this but the framing throughout is broad."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "No discussion of alternative explanations for the improvements. For example, the multi-agent setup uses more tokens/compute than zero-shot — the improvement could be due to increased inference budget rather than agent decomposition. This confound is not addressed."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper measures 'Ground Truth Coverage' but frames results as showing the framework generates 'more meaningful and contextually grounded limitations.' The ground truth itself is constructed via LLM extraction from author statements and OpenReview, yet this proxy is treated as measuring limitation quality without discussing the gap."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "Models referred to as 'Llama 3 8B', 'GPT-4o mini', 'DeepSeek R1 Qwen Distil', 'Gemini' without specific version identifiers, snapshot dates, or API versions."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Full prompt text for all agents (Extractor, Analyzer, Reviewer, Citation, Master, Judge, Evaluation) provided in Figures 2-9 in the Appendix."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": false,
     150         "justification": "Only 'zero temperature for consistency' (for extraction) and a feedback threshold of 8/10 are mentioned. No temperature, top-p, or max tokens settings are reported for the worker agents, master agent, or evaluation."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
     155         "justification": "Detailed scaffolding description in Section IV and Figure 1: sequential agent pipeline, RAG system with hybrid BM25+FAISS retrieval, LLM re-ranker, self-feedback loop, quality thresholds, master agent consolidation."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Data preprocessing described in Section III: ScienceParse for text extraction, Selenium for OpenReview scraping, rule-based limitation extraction with keyword scanning and section filtering, LLM refinement, and merger/deduplication steps."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section XI 'Limitations and Future Work' provides a dedicated limitations discussion."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Specific threats discussed: findings based solely on NeurIPS dataset, only two models tested, input truncation for Llama, human evaluation confined to extraction task, prompt incompatibility with certain reasoning models (Section XI)."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section XI explicitly states boundaries: 'findings are based solely on the NeurIPS dataset and only two models (Llama 3 8B and GPT-4o mini), which restricts generalizability.'"
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Dataset released on HuggingFace (https://huggingface.co/datasets/iaadlab/LimAgents) enabling independent verification."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section III describes data collection: 2,700 NeurIPS papers from 2022-2023, ScienceParse for parsing, Selenium for OpenReview scraping, two-stream extraction (author-stated and reviewer-stated), with detail on the pipeline."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": true,
    193         "answer": false,
     194         "justification": "For the human evaluation, 3 graduate students with ML/NLP expertise were recruited, but there is no description of how they were selected, potential biases, or the recruitment process. The 2 annotators for the validation study are also undescribed."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "While the pipeline is described qualitatively, key numbers are missing. The paper states 2,700 papers yielded 51,300 limitations total but doesn't document how many were filtered at each stage or how many papers failed parsing."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding source, acknowledgments section, or grant information is mentioned anywhere in the paper."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "All three authors' affiliations at Northern Illinois University Department of Computer Science are clearly listed."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding information disclosed, so independence of funder cannot be assessed."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement is present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No training data cutoff dates stated for Llama 3 8B or GPT-4o mini. These models were evaluated on NeurIPS 2022-2023 papers which could be in their training data."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No discussion of whether NeurIPS 2022-2023 papers (and their OpenReview comments) appeared in the training data of the LLMs used."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "NeurIPS 2022-2023 papers and OpenReview comments were publicly available before Llama 3 and GPT-4o mini training. The models may have seen both the papers and the ground truth limitations. This is not addressed."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human subjects study — the human annotators evaluated system outputs rather than being study participants."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human subjects study in the traditional sense. Annotators evaluated extraction quality."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human subjects study. Annotators are described only as 'graduate students with ML/NLP expertise.'"
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human subjects study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human subjects study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human subjects study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human subjects study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No API costs, token counts, or wall-clock time reported despite the framework making many LLM calls per paper (multiple agents × 2,700 papers)."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No total computational budget, GPU hours, or API spend mentioned."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No mention of random seeds or seed sensitivity. Results appear to be from single runs."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The number of experimental runs is never stated. It is unclear whether results are from single or multiple runs."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No hyperparameter search budget reported despite multiple design choices (feedback threshold 8/10, RAG top-20, relevance score ≥8, etc.)."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "Many configurations were tried (Tables I, VII) but no clear validation set or principled selection procedure is described. The 'proposed' configurations appear selected based on test set performance."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Many comparisons across configurations, models, and metrics with no multiple comparison correction or even statistical tests."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "Authors evaluate their own framework against their own zero-shot baselines without acknowledging self-comparison bias or seeking independent evaluation."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The multi-agent framework makes far more LLM calls than zero-shot baselines (4+ agent calls + judge + self-feedback + master vs. 1 call). This compute difference is never discussed or controlled for."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The ground truth is constructed via LLM extraction from author statements and OpenReview comments, then used to evaluate LLM-generated outputs. No discussion of whether this LLM-constructed ground truth is valid or complete for measuring limitation generation quality."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The RAG pipeline, retrieval system, and agent scaffolding are intertwined with the model comparison. When comparing Llama 3 8B vs GPT-4o mini, the scaffold confound (different optimal configs) is not isolated."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "NeurIPS 2022-2023 papers and their OpenReview comments were publicly available before model training. The LLMs may have memorized the ground truth limitations. This temporal leakage is not discussed."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of whether the evaluation setup leaks information. The Citation Agent accesses cited/citing papers which may contain limitation-related content overlapping with the ground truth."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of potential non-independence between papers (e.g., papers from the same research group, papers citing each other, shared limitations)."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No concrete leakage detection or prevention method is used."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "RAG + multi-agent GPT-4o mini achieves +15.51% coverage gain over zero-shot baseline",
    363       "evidence": "Table I: GPT-4o mini 4-Agent achieves 64.94% CGT vs 49.43% zero-shot baseline (Section VII.A)",
    364       "supported": "strong"
    365     },
    366     {
    367       "claim": "Llama 3 8B 3-agent setup yields +4.41% improvement over zero-shot baseline",
    368       "evidence": "Table I: Llama 3 8B 3-Agent achieves 66.45% vs 62.04% zero-shot (Section VII.A)",
    369       "supported": "moderate"
    370     },
    371     {
    372       "claim": "Optimized 3-Agent Llama 3 8B outperforms GPT-4o mini's optimal 4-Agent configuration by +1.51 points",
    373       "evidence": "Table I: Llama 3 8B 3-Agent 66.45% vs GPT-4o mini 4-Agent 64.94% (Section VII.A)",
    374       "supported": "weak"
    375     },
    376     {
    377       "claim": "Feedback-driven refinement enhances quality but reduces coverage breadth",
    378       "evidence": "Table II: Coverage drops from 66.45% to 53.83% with feedback, while LLM-generated text coverage improves from 36.59% to 44.77% (Section VII.B)",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Excessive agent decomposition (9 agents) is counterproductive, causing -12.23 point drop",
    383       "evidence": "Table I: 9-agent config achieves 49.81% vs 62.04% zero-shot (Section VIII)",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Human-LLM judge agreement is 0.98 and 0.95 for the two annotators",
    388       "evidence": "Section V.A: validation on 100 random pairs with two independent annotators",
    389       "supported": "moderate"
    390     }
    391   ],
    392   "methodology_tags": ["benchmark-eval"],
    393   "key_findings": "LimAgents, a multi-agent LLM framework for generating research limitations, outperforms zero-shot prompting for both Llama 3 8B (+4.41pp) and GPT-4o mini (+15.51pp) on ground truth coverage. Optimal agent configuration depends on model capacity: smaller models benefit from focused 3-agent setups while larger models leverage 4-agent configurations including a Citation Agent with RAG. Feedback-driven refinement improves quality at the expense of coverage breadth, and excessive agent decomposition (9 agents) degrades performance.",
    394   "red_flags": [
    395     {
    396       "flag": "Compute confound in comparisons",
    397       "detail": "Multi-agent configurations make 4-6+ LLM calls per paper vs 1 for zero-shot baselines. The improvement may be attributable to increased inference budget rather than agent decomposition. No compute-matched baselines are included."
    398     },
    399     {
    400       "flag": "No statistical tests for any claims",
    401       "detail": "All claims of improvement are based on comparing single point estimates with no significance tests, confidence intervals, or variance reporting across the 2,700-paper corpus."
    402     },
    403     {
    404       "flag": "LLM-constructed ground truth evaluated by LLM",
    405       "detail": "Ground truth limitations are extracted by GPT-4o mini, and the pointwise evaluation judge is also GPT-4o mini. This circular dependency could inflate coverage scores if the judge and generator share similar biases."
    406     },
    407     {
    408       "flag": "Contamination risk unaddressed",
    409       "detail": "NeurIPS 2022-2023 papers and OpenReview comments were public before model training. LLMs may have memorized limitations, inflating both ground truth extraction quality and generation coverage."
    410     },
    411     {
    412       "flag": "Cross-model comparison not controlled",
    413       "detail": "The claim that Llama 3 8B 3-agent outperforms GPT-4o mini 4-agent (+1.51 points) compares different input configurations and model capacities without controlling for confounds."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "BAGELS: Benchmarking the Automated Generation and Extraction of Limitations from Scholarly Text",
    419       "authors": ["Ibrahim Al Azher"],
    420       "year": 2025,
    421       "arxiv_id": "2505.18207",
    422       "relevance": "Benchmark framework for limitation generation/extraction that this paper builds upon."
    423     },
    424     {
    425       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    426       "authors": ["Jason Wei"],
    427       "year": 2022,
    428       "relevance": "Foundational prompting technique that inspired the 'chain of limitations' approach used in LimAgents."
    429     },
    430     {
    431       "title": "Why do multi-agent LLM systems fail?",
    432       "authors": ["Mert Cemri"],
    433       "year": 2025,
    434       "arxiv_id": "2503.13657",
    435       "relevance": "Directly relevant to understanding failure modes of multi-agent LLM systems."
    436     },
    437     {
    438       "title": "AgentReview: Exploring Peer Review Dynamics with LLM Agents",
    439       "authors": ["Yiqiao Jin"],
    440       "year": 2024,
    441       "arxiv_id": "2406.12708",
    442       "relevance": "Multi-agent LLM system for peer review that models reviewers, authors, and area chairs."
    443     },
    444     {
    445       "title": "Are we there yet? Revealing the risks of utilizing large language models in scholarly peer review",
    446       "authors": ["Rui Ye"],
    447       "year": 2024,
    448       "arxiv_id": "2412.01708",
    449       "relevance": "Evaluates risks and shortcomings of LLM-based peer review including hallucination and bias."
    450     },
    451     {
    452       "title": "Can large language models provide useful feedback on research papers? A large-scale empirical analysis",
    453       "authors": ["Weixin Liang"],
    454       "year": 2024,
    455       "relevance": "Large-scale study of LLM feedback quality on research papers."
    456     },
    457     {
    458       "title": "Is LLM a reliable reviewer? A comprehensive evaluation of LLM on automatic paper reviewing tasks",
    459       "authors": ["Ruiyang Zhou"],
    460       "year": 2024,
    461       "relevance": "Evaluation of LLM reliability for automatic paper reviewing."
    462     },
    463     {
    464       "title": "OpenReviewer: A specialized large language model for generating critical scientific paper reviews",
    465       "authors": ["Maximilian Idahl"],
    466       "year": 2024,
    467       "arxiv_id": "2412.11948",
    468       "relevance": "Fine-tuned LLM for structured review generation, directly relevant to AI-assisted peer review."
    469     },
    470     {
    471       "title": "DeepReview: Improving LLM-based paper review with human-like deep thinking process",
    472       "authors": ["Minjun Zhu"],
    473       "year": 2025,
    474       "arxiv_id": "2503.08569",
    475       "relevance": "Multi-stage reasoning framework for LLM-based review generation addressing hallucinations."
    476     },
    477     {
    478       "title": "MARG: Multi-Agent Review Generation for Scientific Papers",
    479       "authors": ["Mike D'Arcy"],
    480       "year": 2024,
    481       "arxiv_id": "2401.04259",
    482       "relevance": "Multi-agent approach to scientific paper review generation."
    483     },
    484     {
    485       "title": "Dynamic LLM-Agent Network: An LLM-agent collaboration framework with agent team optimization",
    486       "authors": ["Zijun Liu"],
    487       "year": 2023,
    488       "arxiv_id": "2310.02170",
    489       "relevance": "Framework for LLM agent collaboration and team optimization relevant to multi-agent system design."
    490     },
    491     {
    492       "title": "Can LLMs Identify Critical Limitations within Scientific Research? A Systematic Evaluation on AI Research Papers",
    493       "authors": ["Zhijian Xu"],
    494       "year": 2025,
    495       "arxiv_id": "2507.02694",
    496       "relevance": "Systematic evaluation of LLMs for identifying research limitations in AI papers."
    497     }
    498   ]
    499 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs