scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27299B)
      1 {
      2   "paper": {
      3     "title": "AInstein: Assessing the Feasibility of AI-Generated Approaches to Research Problems",
      4     "authors": [
      5       "Shambhavi Mishra",
      6       "Gaurav Sahu",
      7       "Marco Pedersoli",
      8       "Laurent Charlin",
      9       "Jose Dolz",
     10       "Christopher Pal"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv preprint",
     14     "arxiv_id": "2510.05432"
     15   },
     16   "checklist": {
     17     "artifacts": {
     18       "code_released": {
     19         "applies": true,
     20         "answer": false,
     21         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper. There is no mention of code being available."
     22       },
     23       "data_released": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "The paper describes a curated dataset of 1,214 ICLR 2025 papers but does not provide a download link or release the curated set, problem statements, or generated solutions."
     27       },
     28       "environment_specified": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No environment setup, dependency list, or software versions are specified. The paper does not include a requirements.txt, Dockerfile, or equivalent."
     32       },
     33       "reproduction_instructions": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No step-by-step instructions for reproducing the experiments are provided. The methodology is described at a high level but without specific runnable commands or scripts."
     37       }
     38     },
     39     "statistical_methodology": {
     40       "confidence_intervals_or_error_bars": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "Tables 2 and 6 report mean ± std for various metrics (e.g., '8.80 ± 0.77' for Fidelity, '0.868 ± 0.035' for cosine similarity). Standard deviations serve as a measure of uncertainty."
     44       },
     45       "significance_tests": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "Appendix B (Table 7) reports two-sample t-tests and Mann-Whitney U tests with Bonferroni correction (α ≈ 0.0036) comparing GPT-OSS-120B and Mistral-24B performance, including p-values."
     49       },
     50       "effect_sizes_reported": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Table 7 in Appendix B reports Cohen's d for each comparison (e.g., d = 0.021, d = -0.033, d = 0.070), providing standardized effect size measures alongside significance tests."
     54       },
     55       "sample_size_justified": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The paper uses 1,214 papers with intentional oversampling of Oral and Spotlight tiers (Table 1) but provides no formal justification for why this sample size is sufficient or why this specific stratification ratio was chosen. No power analysis is presented."
     59       },
     60       "variance_reported": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Standard deviations are reported in Tables 2 and 6 (e.g., '8.80 ± 0.77'). These indicate variance across the experimental runs."
     64       }
     65     },
     66     "evaluation_design": {
     67       "baselines_included": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The paper compares three model families (GPT-OSS-120B, Qwen-235B, Mistral-24B) against each other and also includes 'Human Abstracts' as a gold-standard reference in the ELO tournament (Table 5)."
     71       },
     72       "baselines_contemporary": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "All three models tested (GPT-OSS-120B, Qwen-235B, Mistral-24B) are current-generation large language models. The human abstracts from ICLR 2025 serve as the primary comparison point."
     76       },
     77       "ablation_study": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "The systematic testing of all Internal (Mi) × External (Me) model pairings (9 configurations per problem source, across 3 problem sources = 27 configurations in Table 3) functions as an ablation that isolates the contribution of the internal model vs. external model vs. problem source."
     81       },
     82       "multiple_metrics": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The paper uses three primary metrics (Success Rate, Rediscovery, Novel & Valid), supplemented by semantic coherence (cosine similarity, Euclidean distance), readability scores (Flesch-Kincaid), and the Generalizer deficit score with four sub-criteria."
     86       },
     87       "human_evaluation": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Section 5 describes a 'Human-Verified Competitive Ranking' where 'the authors of this study served as human evaluators' in a head-to-head tournament with pairwise preferences, yielding ELO ratings (Table 5). Detailed case studies are in Appendix C."
     91       },
     92       "held_out_test_set": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "The entire evaluation dataset consists of ICLR 2025 papers with submission deadlines after the models' knowledge cutoffs, creating a natural temporal separation: 'all models used in our experiments have knowledge cutoffs that predate the ICLR 2025 submission deadline' (Section 4.1)."
     96       },
     97       "per_category_breakdown": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Figure 3 provides per-tier breakdowns (Oral, Spotlight, Poster) of Success Rate, Rediscovery Rate, and Novel & Valid Score. The qualitative analysis in Section 5 and Table 8 provides per-cluster breakdowns of solution archetypes."
    101       },
    102       "failure_cases_discussed": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper discusses cases where models fail at strict rediscovery (Rediscovery rates plummet from 75-84% at τ=4 to 15-20% at τ=5, Table 3). The ELO tournament includes cases where LLM solutions lose to human abstracts (Appendix C, Examples 2 and 7)."
    106       },
    107       "negative_results_reported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The paper reports that strict rediscovery is 'exceptionally rare' (15-20% at τ=5), that Mistral-24B substantially underperforms larger models, and acknowledges 'problem-solving ability remains fragile and highly sensitive to framing' (Abstract)."
    111       }
    112     },
    113     "claims_and_evidence": {
    114       "abstract_claims_supported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The abstract claims LLMs can 'rediscover feasible solutions and occasionally propose creative alternatives' — supported by Success Rate data in Table 3 (up to 74% strict). The claim that 'problem-solving ability remains fragile and highly sensitive to framing' is supported by the large gap between τ=4 and τ=5 thresholds and the dependence on problem source."
    118       },
    119       "causal_claims_justified": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The primary causal claim is that the internal model's capability determines solution quality. This is supported by the systematic combinatorial design (all Mi × Me pairings) that controls for the external model and problem source, effectively isolating the internal model variable. The ablation-style design is adequate for this claim."
    123       },
    124       "generalization_bounded": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "The title claims to assess 'AI-Generated Approaches to Research Problems' broadly, but the study tests only on AI/ML papers from ICLR 2025. The conclusion acknowledges 'Our work is primarily focused on the AI domain' but the title and abstract ('research problems') suggest broader generality. The paper also uses only 3 model families."
    128       },
    129       "alternative_explanations_discussed": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper does not substantively discuss alternative explanations for its results. For instance, the high Success Rates could reflect LLM-as-a-judge bias (the judge is one of the tested models — GPT-OSS-120B) rather than genuine problem-solving. The paper validates with a second judge (Qwen-235B) but does not discuss whether both judges share systematic biases, or whether 'successful' solutions are just plausible-sounding text."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper uses pseudonymized model names (GPT-OSS-120B, Qwen-235B, Mistral-24B) without specifying exact model versions, snapshot dates, or API version identifiers. No specific version strings (e.g., 'gpt-4-0613') are provided. The use of pseudonyms makes it impossible to determine what was actually tested."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "Appendix E provides the exact prompts for the Generalizer agent, Solver agent, Generalizer critic, and Solution critic, including system roles and user prompts with placeholder variables clearly marked."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "The paper reports MaxInternalAttempts=20 and MaxExternalAttempts=20 but does not specify LLM inference parameters such as temperature, top-p, max tokens, or other sampling settings for any of the three models used."
    150       },
    151       "scaffolding_described": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "The dual-loop iterative refinement scaffold (internal + external critique) is described in detail in Section 3.2 and Algorithm 1, including the roles of Mi and Me, accept/reject signals, and maximum iteration counts."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 4.1 and Table 1 describe the data curation: 1,214 papers from ICLR 2025 stratified by acceptance tier, with the stratification ratios compared against the full ICLR corpus. The oversampling of Oral and Spotlight papers is explained."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion (Section 6) mentions two limitations in a single sentence: 'Our work is primarily focused on the AI domain, and the LLM-as-a-judge paradigm carries inherent biases.' This does not constitute substantive discussion."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": false,
    171         "justification": "The paper's only acknowledgment of threats is the brief mention of domain limitation and judge bias in the conclusion. No specific threats are analyzed, such as the potential for the judge model (GPT-OSS-120B) to favor its own outputs, or the ELO tournament being evaluated by paper authors rather than independent evaluators."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Beyond stating the work is 'primarily focused on the AI domain,' the paper does not explicitly state what the results do NOT show. It does not bound claims about reasoning vs. recall, does not state that the results cannot speak to whether models truly 'understand' science, and does not qualify the ELO results given the small number of human comparisons (42 matches total)."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "Neither the curated dataset of 1,214 papers, the generated problem statements, the generated solutions, nor the LLM judge scores are made available for independent verification."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 4.1 describes data collection: papers curated from ICLR 2025 conference submissions, drawing from the ICLR Dataset (González-Márquez & Kobak, 2024), stratified by acceptance tier (Oral, Spotlight, Poster), with excluded tiers (Rejected, Withdrawn) noted."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "The human evaluation in the ELO tournament was conducted by 'the authors of this study.' No description of how evaluators were selected (beyond being the authors), their qualifications, or potential biases from self-evaluation is provided."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The pipeline is documented in Sections 3 and 4: abstract → problem extraction (Generalizer) → solution generation (Solver) → LLM-as-a-judge evaluation. Algorithm 1 formalizes the process. Table 1 shows input/output counts."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "No funding sources, grants, or sponsors are acknowledged anywhere in the paper. The authors are affiliated with ServiceNow Research and academic institutions, but no funding disclosure is made."
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Author affiliations are listed on the first page, including LIVIA/ÉTS Montréal, Mila, HEC Montréal, ServiceNow Research, Canada CIFAR AI Chair, Université de Montréal, and Polytechnique Montréal."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding is disclosed, so independence cannot be assessed. Two authors are affiliated with ServiceNow Research, which has commercial interests in AI capabilities, yet this potential conflict is not addressed."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests or financial interests statement is included in the paper. Authors from ServiceNow Research may have commercial interests related to the findings, but this is not disclosed."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "The paper states 'all models used in our experiments have knowledge cutoffs that predate the ICLR 2025 submission deadline' (Section 4.1) but does not specify the actual training cutoff dates for any of the three models. The exact cutoff dates are needed to assess contamination risk."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": true,
    231         "answer": true,
    232         "justification": "Section 4.1 explicitly discusses this: 'To prevent data leakage and ensure our evaluation tests reasoning rather than retrieval, all models used in our experiments have knowledge cutoffs that predate the ICLR 2025 submission deadline.' The use of post-cutoff papers is the contamination mitigation strategy."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": true,
    236         "answer": true,
    237         "justification": "The paper's experimental design specifically addresses contamination by using ICLR 2025 submissions that postdate model training cutoffs. Additionally, the problem extraction phase strips solution details from abstracts, creating a second layer of contamination protection."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "The human evaluation component is a small expert annotation task by the paper authors (ELO tournament), not a formal human subjects study requiring pre-registration."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "The human evaluation involved only the paper's authors as evaluators, not external human subjects. No IRB review would be required."
    250       },
    251       "demographics_reported": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "The evaluators were the paper's authors, not recruited study participants. Demographics are not applicable."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No external participants were recruited; the paper's authors served as evaluators."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No formal human subjects experiment was conducted. The ELO tournament used randomized and anonymized presentation of solutions, but this is not a human subjects study."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No formal human subjects study. The ELO tournament mentions solutions were 'presented in a randomized and anonymized fashion,' but this is part of the annotation protocol, not a human subjects blinding design."
    270       },
    271       "attrition_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No external human participants were recruited, so attrition is not applicable."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": false,
    281         "justification": "No API costs, token counts, or per-example costs are reported. The framework uses up to 20 internal × 20 external iterations per paper across 1,214 papers and 27 configurations, representing potentially massive API costs, but these are not quantified."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "No total compute budget, GPU hours, API spend, or wall-clock time is reported. Given the scale (1,214 papers × 27 configurations × up to 400 iterations each), the total cost could be substantial but is not disclosed."
    287       }
    288     }
    289   },
    290   "claims": [
    291     {
    292       "claim": "The internal model's capability is the single most predictive factor of success in scientific problem-solving.",
    293       "evidence": "Table 3 shows GPT-OSS-120B as internal agent achieves 74.05% strict Success Rate vs. 43.82% for Qwen-235B and 34.60% for Mistral-24B. This pattern holds across all 27 configurations. Table 5 ELO ratings confirm GPT-OSS-120B self-play (1119) far exceeds other configurations (939, 927, 828).",
    294       "supported": "strong"
    295     },
    296     {
    297       "claim": "LLMs rarely achieve perfect rediscovery of human solutions but often find novel, valid alternatives.",
    298       "evidence": "Table 3 shows Rediscovery rates drop from 75-84% (τ=4) to 15-20% (τ=5), while Novel & Valid remains relatively stable (e.g., 59.39% at strict threshold for top configuration). Section 5 states 'perfect rediscovery is exceptionally rare.'",
    299       "supported": "strong"
    300     },
    301     {
    302       "claim": "LLM problem-solving ability is not significantly impacted by the perceived quality or difficulty of the research paper.",
    303       "evidence": "Figure 3 shows Success Rates of 69.0% (Oral), 77.8% (Spotlight), 72.5% (Poster) for the top agent, described as a 'counter-intuitive result' (Section 5). However, the differences across tiers are modest and not tested for significance.",
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "The findings are robust to evaluator choice.",
    308       "evidence": "Table 4 replicates results using Qwen-235B as judge instead of GPT-OSS-120B, showing the same patterns (Rediscovery drop from τ=4 to τ=5, GPT-OSS-120B dominance). However, both judges are LLMs and may share systematic biases.",
    309       "supported": "moderate"
    310     },
    311     {
    312       "claim": "These findings provide the first large-scale evidence on whether LLMs can act as autonomous scientific problem-solvers.",
    313       "evidence": "The study covers 1,214 ICLR papers, which is larger than prior work cited. However, 'first large-scale' is hard to verify without exhaustive literature review, and the scope is limited to AI/ML papers.",
    314       "supported": "moderate"
    315     }
    316   ],
    317   "methodology_tags": [
    318     "benchmark-eval"
    319   ],
    320   "key_findings": "AInstein evaluates LLM scientific problem-solving on 1,214 ICLR 2025 papers using iterative critique loops for problem extraction and solution generation. The key finding is that the internal model's capability is the dominant factor in solution quality, with GPT-OSS-120B achieving 74% strict Success Rate vs. 44% for Qwen-235B. Perfect rediscovery of human solutions is rare (15-20% at strict threshold), but models frequently generate novel, valid alternatives (59% for the best configuration). Performance is surprisingly stable across paper quality tiers (Oral, Spotlight, Poster).",
    321   "red_flags": [
    322     {
    323       "flag": "LLM-as-a-judge uses same model being evaluated",
    324       "detail": "GPT-OSS-120B serves as both the primary LLM judge and one of the models being evaluated. This creates a potential self-preferencing bias — the judge may systematically favor outputs with stylistic or structural properties it tends to produce. While validation with Qwen-235B as an alternative judge partially addresses this, both are LLMs and may share systematic biases toward plausible-sounding but incorrect solutions."
    325     },
    326     {
    327       "flag": "Pseudonymized model names obscure reproducibility",
    328       "detail": "The paper uses pseudonyms (GPT-OSS-120B, Qwen-235B, Mistral-24B) without revealing the actual model identifiers or versions. This makes the experiments impossible to replicate and prevents readers from assessing whether the models' training data truly predates ICLR 2025 submissions."
    329     },
    330     {
    331       "flag": "Authors served as own human evaluators",
    332       "detail": "The ELO tournament was conducted by the paper's authors, who may have biases toward their own framework's outputs or toward the larger model. The evaluation was done in 'randomized and anonymized fashion,' but the small number of matches (42 total, per Table 5 win/loss counts) and lack of inter-annotator agreement metrics limit the strength of this validation."
    333     },
    334     {
    335       "flag": "No cost reporting despite massive scale",
    336       "detail": "The experiment tests 27 model configurations across 1,214 papers with up to 400 LLM calls per paper (20 internal × 20 external iterations), plus LLM-as-a-judge scoring. The total API cost could be very substantial but is never reported, making it impossible to assess the method's practicality."
    337     },
    338     {
    339       "flag": "Minimal limitations discussion",
    340       "detail": "A study making claims about 'genuine reasoning' vs. 'sophisticated recall' in LLMs devotes only a single sentence in the conclusion to limitations, naming two (the AI-domain focus and LLM-as-a-judge bias). The paper does not discuss whether 'successful' solutions are truly novel reasoning or whether LLMs recombine memorized techniques in plausible-sounding ways — which is the central question the paper claims to address."
    341     },
    342     {
    343       "flag": "No artifacts released",
    344       "detail": "Despite proposing a reusable evaluation framework, the paper releases no code, data, problem statements, generated solutions, or judge scores. This prevents any independent verification of the results."
    345     }
    346   ],
    347   "cited_papers": [
    348     {
    349       "title": "Self-Refine: Iterative Refinement with Self-Feedback",
    350       "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"],
    351       "year": 2023,
    352       "relevance": "Foundational work on iterative LLM self-refinement, directly used in AInstein's critique loop design."
    353     },
    354     {
    355       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    356       "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath"],
    357       "year": 2023,
    358       "relevance": "Agentic LLM framework using verbal feedback for iterative improvement, closely related to AInstein's agent architecture."
    359     },
    360     {
    361       "title": "G-Eval: NLG Evaluation Using GPT-4 with Better Human Alignment",
    362       "authors": ["Yang Liu", "Dan Iter", "Yichong Xu"],
    363       "year": 2023,
    364       "relevance": "LLM-as-a-judge methodology used as the basis for AInstein's evaluation paradigm."
    365     },
    366     {
    367       "title": "Curie: Evaluating LLMs on Multitask Scientific Long Context Understanding and Reasoning",
    368       "authors": ["Hao Cui", "Zahra Shamsi"],
    369       "year": 2025,
    370       "arxiv_id": "2503.13517",
    371       "relevance": "Benchmark for evaluating LLM scientific reasoning capabilities, directly relevant to the survey scope."
    372     },
    373     {
    374       "title": "SciBench: Evaluating College-Level Scientific Problem-Solving Abilities of Large Language Models",
    375       "authors": ["Xiaoxuan Wang", "Ziniu Hu", "Pan Lu"],
    376       "year": 2024,
    377       "arxiv_id": "2307.10635",
    378       "relevance": "Benchmark evaluation of LLM scientific problem-solving, closely related to AInstein's goals."
    379     },
    380     {
    381       "title": "Towards Large Reasoning Models: A Survey of Reinforced Reasoning with Large Language Models",
    382       "authors": ["Fengli Xu", "Qianyue Hao"],
    383       "year": 2025,
    384       "arxiv_id": "2501.09686",
    385       "relevance": "Survey of LLM reasoning capabilities, directly relevant to the survey's coverage of LLM capability evaluation."
    386     },
    387     {
    388       "title": "Knowledge Augmented Complex Problem Solving with Large Language Models: A Survey",
    389       "authors": ["Da Zheng", "Lun Du", "Junwei Su"],
    390       "year": 2025,
    391       "arxiv_id": "2505.03418",
    392       "relevance": "Survey on LLM problem-solving with knowledge augmentation, relevant to understanding LLM capability boundaries."
    393     },
    394     {
    395       "title": "Are Emergent Abilities of Large Language Models a Mirage?",
    396       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    397       "year": 2023,
    398       "relevance": "Critical analysis of LLM emergent abilities and measurement artifacts, relevant to methodology quality in AI evaluations."
    399     },
    400     {
    401       "title": "Mathematical Discoveries from Program Search with Large Language Models",
    402       "authors": ["Bernardino Romera-Paredes"],
    403       "year": 2024,
    404       "relevance": "Demonstrates LLMs making novel mathematical discoveries, directly relevant to the AI scientific reasoning literature."
    405     },
    406     {
    407       "title": "Language Models are Few-Shot Learners",
    408       "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
    409       "year": 2020,
    410       "relevance": "Foundational work on in-context learning capabilities of LLMs, relevant to the survey's scope on LLM capabilities."
    411     },
    412     {
    413       "title": "Advancing the Scientific Method with Large Language Models: From Hypothesis to Discovery",
    414       "authors": ["Yanbo Zhang", "Sumeer A. Khan"],
    415       "year": 2025,
    416       "arxiv_id": "2505.16477",
    417       "relevance": "Directly relevant to AI-driven scientific discovery, assessing LLM capability for hypothesis generation."
    418     },
    419     {
    420       "title": "SciMON: Scientific Inspiration Machines Optimized for Novelty",
    421       "authors": ["Qingyun Wang", "Doug Downey", "Heng Ji", "Tom Hope"],
    422       "year": 2024,
    423       "relevance": "System for generating novel scientific ideas, directly comparable to AInstein's goal of LLM-generated research approaches."
    424     }
    425   ]
    426 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs