scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26641B)
      1 {
      2   "paper": {
      3     "title": "Humanity's Last Exam",
      4     "authors": ["Long Phan", "Alice Gatti", "Ziwen Han", "Nathaniel Li", "Josephina Hu", "Hugh Zhang", "Chen Bo Calvin Zhang", "Mohamed Shaaban", "John Ling", "Sean Shi", "Michael Choi", "Anish Agrawal", "Arnav Chopra", "Adam Khoja", "Ryan Kim", "Richard Ren", "Jason Hausenloy", "Oliver Zhang", "Mantas Mazeika", "Summer Yue", "Alexandr Wang", "Dan Hendrycks"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2501.14249"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No source code repository URL is provided in the paper. The paper releases the dataset at https://lastexam.ai but does not mention releasing evaluation code or scripts."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The dataset of 2,500 questions is publicly released at https://lastexam.ai. Section 1 states: 'we publicly release HLE at https://lastexam.ai.' A private held-out test set is also maintained."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specifications, requirements files, or dependency lists are provided. The paper describes using various model APIs but does not provide setup details for reproducing the evaluation pipeline."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided. While Section C.1.1 provides evaluation prompts and Section C.5 lists model versions, there are no instructions for running the full evaluation pipeline."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Tables 1-3 report only point estimates of accuracy and calibration error (e.g., '8.0%' accuracy for o1) with no confidence intervals, error bars, or uncertainty measures."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper compares model performance across models and categories but uses no statistical significance tests. Differences between models are presented as raw numbers without any test of whether differences are meaningful."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No effect sizes are reported. Model performance is presented as raw accuracy percentages and calibration errors without contextualizing the magnitude of differences between models."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The benchmark consists of 2,500 questions but no justification is given for why this number was chosen. No power analysis or statistical reasoning for the dataset size is provided."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance or standard deviation across experimental runs is reported. Results appear to be from single evaluation runs per model. Section 4.2 acknowledges 'inherent noise in model inference' but does not quantify it with repeated runs."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares 8 frontier models against each other (Table 1) and compares HLE difficulty against prior benchmarks like MMLU, GPQA, ARC Prize, and others in Figure 1."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The evaluated models are contemporary frontier models including GPT-4o, Claude 3.5 Sonnet, Gemini 2.0 Flash Thinking, o1, DeepSeek-R1, and o3-mini (high), all from late 2024 to early 2025."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "This is a benchmark paper, not a system paper. There is no system with components to ablate. The paper's contribution is the dataset itself."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper reports both accuracy (%) and RMS calibration error (%) for each model in Table 1. Token count analysis is also provided in Figure 5 and Section C.4."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Expert human reviewers evaluated question quality through two rounds of review (Section 3.2). An audit process with university students verified answers (Section B.2). Expert disagreement rates are reported in Section B.3."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 3 states: 'we publicly release these questions, while maintaining a private test set of held out questions to assess model overfitting.' Figure 4 also confirms a private held-out set."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 3 (Section C.3) provides category-wise performance breakdowns across Math, Biology/Medicine, Physics, CS/AI, Humanities, Chemistry, Engineering, and Other, for both text-only and full dataset evaluations."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "The paper does not provide qualitative analysis of specific failure cases or error patterns. It notes low accuracy and high calibration error but does not analyze what types of errors models make or why they fail on specific questions."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports several negative findings: all models achieve very low accuracy, all models show poor calibration (high RMS calibration error above 70%), and models confidently provide incorrect answers. Section B.3 reports a 15.4% expert disagreement rate, acknowledging dataset limitations."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims that 'State-of-the-art LLMs demonstrate low accuracy and calibration on HLE,' which is supported by Table 1 showing accuracy ranging from 2.7% to 13.4% and calibration errors of 73-89%. The claim of 'broad subject coverage' is supported by Figure 3 and Section B.4."
    111       },
    112       "causal_claims_justified": {
    113         "applies": false,
    114         "answer": false,
    115         "justification": "The paper makes no causal claims. It presents a benchmark and reports descriptive performance results without claiming causal relationships."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section 5 explicitly bounds claims: 'High accuracy on HLE would demonstrate expert-level performance on closed-ended, verifiable questions...but it would not alone suggest autonomous research capabilities or \"artificial general intelligence.\" HLE tests structured academic problems rather than open-ended research or creative problem-solving abilities.'"
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 4.2 discusses that low accuracy is 'partially by design' due to the filtering process and acknowledges 'inherent noise in model inference.' Section B.3 discusses expert disagreement as a factor affecting measured accuracy. The paper also acknowledges that about 15.4% of questions may have expert disagreements."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Section C.5 (Table 4) provides exact model versions: gpt-4o-2024-11-20, claude-3-5-sonnet-20241022, gemini-1.5-pro-002, o1-2024-12-17, o3-mini-2025-01-31, and specifies January 20, 2025 release for DeepSeek-R1. Grok 2 uses 'grok-2-latest' which is less precise."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Full evaluation prompts are provided in Section C.1.1, including both the system prompt for model evaluation and the judge prompt used for o3-mini answer verification. An example structured judge response is also included."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Table 4 states: 'All models use temperature 0.0 when configurable and not otherwise stated. o3-mini and o1 models only support temperature 1.0.' Gemini 2.0 Flash Thinking is noted as 'sampled at temperature 0.7.'"
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. Models are evaluated in a straightforward zero-shot prompting setup."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The data collection and filtering pipeline is documented in detail. Section 3.1 describes submission criteria, Section 3.2 describes the multi-stage review (LLM difficulty check, two rounds of expert review), and Figure 4 provides a visual pipeline. Section B.2 describes post-release refinement including community feedback and audit processes."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 5 (Discussion) serves as a limitations section, discussing what HLE does and does not measure. Section B.3 discusses expert disagreement rates as a limitation. However, there is no dedicated 'Limitations' subsection."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section B.3 discusses specific threats: expert disagreement rate of 15.4%, higher disagreement in a targeted review of biology, chemistry, and health questions (~18%), the challenge of verifying research-experience-based questions, and the limitation that reviewers could not always verify solution rationales within 5 minutes. Section 4.2 notes that 'small inflections close to zero accuracy are not strongly indicative of progress.'"
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5 explicitly states: 'HLE tests structured academic problems rather than open-ended research or creative problem-solving abilities, making it a focused measure of technical knowledge and reasoning. HLE may be the last academic exam we need to give to models, but it is far from the last benchmark for AI.'"
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The 2,500 questions comprising the public set are released at https://lastexam.ai. The raw benchmark data is available for independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 3.1 describes the collection process in detail: questions from ~1,000 expert contributors across 500+ institutions and 50 countries, submission format requirements, $500,000 prize pool incentive structure, and question style requirements."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 3.1 describes contributor recruitment: 'questions from nearly 1000 subject expert contributors affiliated with over 500 institutions across 50 countries – comprised mostly of professors, researchers, and graduate degree holders.' The prize pool is described as an incentive mechanism. Co-authorship was offered as an additional incentive."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Figure 4 documents the full pipeline from submission through LLM difficulty check, expert review rounds 1 and 2, organizer approval, and post-release refinement. Section B.1 notes over 70,000 attempts resulted in approximately 13,000 questions forwarded to human review, culminating in 2,500 accepted questions."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or grant acknowledgment section is found in the paper. The $500,000 prize pool is mentioned but its source is not disclosed. The lead affiliations are Center for AI Safety and Scale AI, but corporate funding details are not provided."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: organizing team members are from Center for AI Safety and Scale AI. Section A provides detailed institutional affiliations for all contributors. Anthropic and OpenAI employees are among contributors."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "Scale AI (co-organizer) is an AI data company that has a financial interest in benchmark creation and AI evaluation. Center for AI Safety has a mission-aligned interest in demonstrating capability gaps. The funder of the $500,000 prize pool is not disclosed, making independence impossible to assess."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is provided. Scale AI is a commercial company, and some contributors are from companies whose products are evaluated (Anthropic, Google DeepMind, OpenAI), but no conflict-of-interest statement addresses this."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper evaluates frontier LLMs on the HLE benchmark but does not state training data cutoff dates for any of the models tested. This is relevant because questions may have been created and shared before model training concluded."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Section B.2 addresses contamination: searchable questions were audited using 'GPT-4o mini/GPT-4o search and Perplexity Sonar models' to identify and remove questions that could be found via web search. The design process itself (Section 3.1) requires questions to be 'resistant to simple internet lookup or database retrieval.'"
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "The benchmark is designed to resist contamination: questions must be original, non-searchable, and pass LLM difficulty checks. Section B.2 describes systematic removal of searchable questions. The private held-out set addresses gaming concerns. However, the paper does not discuss whether pre-release question submission data could have leaked into training data."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "This is a benchmark paper, not a human subjects study. The expert reviewers and contributors are part of the dataset creation process, not research participants."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human subjects study is conducted. Expert contributors voluntarily submit questions and reviewers evaluate them as part of a collaborative benchmark creation effort."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human subjects study is conducted. Contributors are characterized as 'professors, researchers, and graduate degree holders' from 500+ institutions across 50 countries, but this is dataset provenance, not participant demographics."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human subjects study is conducted. The paper describes question inclusion/exclusion criteria (Sections 3.1, 3.2) but these are for dataset items, not research participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human subjects experiment is conducted that would require randomization."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human subjects experiment is conducted that would require blinding."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human subjects study is conducted. The pipeline from 70,000 LLM attempts to 13,000 candidate questions to 2,500 accepted questions is documented, but this is data filtering, not participant attrition."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "While token counts are reported in Figure 5 and Section C.4, no monetary costs or wall-clock evaluation times are reported. The cost of running evaluations across all models is not disclosed."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total computational budget is stated. The paper does not disclose how much API spend was required for the 70,000+ LLM attempts during question submission, or for the final evaluations."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "State-of-the-art LLMs achieve very low accuracy on HLE, with the best model (o3-mini high) achieving only 13.4% accuracy.",
    286       "evidence": "Table 1 shows accuracy ranging from 2.7% (GPT-4o) to 13.4% (o3-mini high) across 8 frontier models on the full 2,500-question benchmark.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "All models exhibit poor calibration on HLE, with RMS calibration errors above 70%.",
    291       "evidence": "Table 1 reports RMS calibration errors ranging from 73% (DeepSeek-R1) to 89% (GPT-4o), indicating models are confidently incorrect.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "HLE is the first benchmark to combine expert-level difficulty with broad subject coverage across dozens of academic disciplines.",
    296       "evidence": "Section 3 and Figure 3 show coverage across 100+ subjects. The design process (Section 3.1) explicitly filters for expert difficulty. However, the claim of being 'the final closed-ended benchmark' is aspirational rather than empirically justified.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "HLE questions are resistant to internet retrieval and memorization.",
    301       "evidence": "Section 3.1 requires questions to be 'resistant to simple internet lookup or database retrieval.' Section B.2 describes systematic auditing of searchable questions using search-enabled models. However, the effectiveness of this filtering is only implicitly validated ('current frontier model performance on HLE after applying this procedure is similar to their performance on HLE before applying this procedure').",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "The expert disagreement rate on HLE is approximately 15.4% for the public set.",
    306       "evidence": "Section B.3 reports: 'This iterative process yielded a final estimated expert disagreement rate of 15.4% for the public set.' A targeted review on biology, chemistry, and health found approximately 18% disagreement.",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "Humanity's Last Exam (HLE) is a 2,500-question multi-modal benchmark spanning 100+ academic subjects, created by ~1,000 expert contributors from 500+ institutions. All frontier LLMs tested achieve very low accuracy (2.7% to 13.4%) and exhibit poor calibration (RMS errors 73-89%), showing a significant gap between current AI capabilities and expert-level academic knowledge. Reasoning models use substantially more tokens but only achieve moderate improvements. The benchmark includes both a public set and a private held-out test set to detect overfitting.",
    312   "red_flags": [
    313     {
    314       "flag": "No uncertainty quantification",
    315       "detail": "All accuracy and calibration results are reported as single-run point estimates with no confidence intervals, error bars, or variance across runs. Given the acknowledged 'inherent noise in model inference,' repeated runs could yield meaningfully different results, especially at low accuracy levels."
    316     },
    317     {
    318       "flag": "Conflict of interest",
    319       "detail": "Scale AI is a co-organizing institution that commercially benefits from AI evaluation and benchmarking. Several model providers (Anthropic, OpenAI, Google DeepMind) have employees among the contributors. No conflict-of-interest statement is provided."
    320     },
    321     {
    322       "flag": "15.4% expert disagreement rate",
    323       "detail": "A 15.4% expert disagreement rate on the public set (18% for biology/chemistry/health) means a non-trivial fraction of questions may have debatable answers, which complicates interpretation of model accuracy at these low levels. At ~10% accuracy, the signal-to-noise ratio between real model capability and label noise becomes concerning."
    324     },
    325     {
    326       "flag": "Selection bias in benchmark construction",
    327       "detail": "Questions are only included if LLMs fail to answer them correctly (Section 3.2). This adversarial filtering guarantees low initial accuracy by design, making it difficult to interpret whether low scores reflect genuine capability gaps or benchmark construction methodology. The paper acknowledges this partially in Section 4.2."
    328     },
    329     {
    330       "flag": "Prize pool source undisclosed",
    331       "detail": "A $500,000 USD prize pool was offered but the paper does not disclose who funded it, making it impossible to assess potential conflicts of interest in question selection and benchmark design."
    332     }
    333   ],
    334   "cited_papers": [
    335     {
    336       "title": "Measuring massive multitask language understanding",
    337       "authors": ["Dan Hendrycks", "Collin Burns", "Steven Basart", "Andy Zou", "Mantas Mazeika", "Dawn Song", "Jacob Steinhardt"],
    338       "year": 2021,
    339       "arxiv_id": "2009.03300",
    340       "relevance": "MMLU is the canonical multi-task benchmark that HLE aims to succeed, demonstrating benchmark saturation in LLM evaluation."
    341     },
    342     {
    343       "title": "GPQA: A graduate-level google-proof Q&A benchmark",
    344       "authors": ["David Rein", "Betty Li Hou", "Asa Cooper Stickland"],
    345       "year": 2023,
    346       "arxiv_id": "2311.12022",
    347       "relevance": "Prior expert-level benchmark using multi-stage review and expert-written questions, directly comparable to HLE's methodology."
    348     },
    349     {
    350       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    351       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"],
    352       "year": 2024,
    353       "arxiv_id": "2310.06770",
    354       "relevance": "Major code generation benchmark for evaluating LLM software engineering capabilities."
    355     },
    356     {
    357       "title": "FrontierMath: A benchmark for evaluating advanced mathematical reasoning in AI",
    358       "authors": ["Elliot Glazer", "Ege Erdil", "Tamay Besiroglu"],
    359       "year": 2024,
    360       "arxiv_id": "2411.04872",
    361       "relevance": "Contemporary expert-level math benchmark using multi-stage review, directly comparable in methodology to HLE."
    362     },
    363     {
    364       "title": "Evaluating large language models trained on code",
    365       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    366       "year": 2021,
    367       "arxiv_id": "2107.03374",
    368       "relevance": "HumanEval benchmark paper, foundational for LLM code generation evaluation and relevant to benchmark contamination discussions."
    369     },
    370     {
    371       "title": "MLE-bench: Evaluating machine learning agents on machine learning engineering",
    372       "authors": ["Jun Shern Chan", "Neil Chowdhury", "Oliver Jaffe"],
    373       "year": 2024,
    374       "arxiv_id": "2410.07095",
    375       "relevance": "Benchmark for evaluating LLM agents on ML engineering tasks, relevant to agentic AI evaluation methodology."
    376     },
    377     {
    378       "title": "RE-Bench: Evaluating frontier AI R&D capabilities of language model agents against human experts",
    379       "authors": ["Hjalmar Wijk", "Tao Lin", "Joel Becker"],
    380       "year": 2024,
    381       "arxiv_id": "2411.15114",
    382       "relevance": "Benchmark comparing AI agent R&D capabilities against human experts, complementary to HLE's assessment of closed-ended academic capabilities."
    383     },
    384     {
    385       "title": "CyBench: A framework for evaluating cybersecurity capabilities and risks of language models",
    386       "authors": ["Andy K. Zhang", "Neil Perry", "Riya Dulepet"],
    387       "year": 2024,
    388       "arxiv_id": "2408.08926",
    389       "relevance": "Benchmark evaluating LLM cybersecurity capabilities, relevant to AI safety and capability assessment."
    390     },
    391     {
    392       "title": "Measuring short-form factuality in large language models",
    393       "authors": ["Jason Wei", "Nguyen Karina", "Hyung Won Chung"],
    394       "year": 2024,
    395       "arxiv_id": "2411.04368",
    396       "relevance": "Provides the calibration measurement methodology used in HLE's evaluation setup."
    397     },
    398     {
    399       "title": "Evaluating frontier models for dangerous capabilities",
    400       "authors": ["Mary Phuong", "Matthew Aitchison", "Elliot Catt"],
    401       "year": 2024,
    402       "arxiv_id": "2403.13793",
    403       "relevance": "Evaluation of frontier model dangerous capabilities, relevant to AI safety benchmark methodology."
    404     },
    405     {
    406       "title": "The WMDP benchmark: Measuring and reducing malicious use with unlearning",
    407       "authors": ["Nathaniel Li", "Alexander Pan", "Anjali Gopal"],
    408       "year": 2024,
    409       "arxiv_id": "2403.03218",
    410       "relevance": "Benchmark measuring potential for malicious use of LLMs, relevant to AI safety evaluation methodology."
    411     }
    412   ]
    413 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs