scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25827B)
      1 {
      2   "paper": {
      3     "title": "The Ramón Llull's Thinking Machine for Automated Ideation",
      4     "authors": [
      5       "Xinran Zhao",
      6       "Boyuan Zheng",
      7       "Chenglei Si",
      8       "Haofei Yu",
      9       "Ken Ziyu Liu",
     10       "Runlong Zhou",
     11       "Ruochen Li",
     12       "Tong Chen",
     13       "Xiang Li",
     14       "Yiming Zhang",
     15       "Tongshuang Wu"
     16     ],
     17     "year": 2025,
     18     "venue": "COLM 2025 (workshop paper)",
     19     "arxiv_id": "2508.19200",
     20     "doi": "10.48550/arXiv.2508.19200"
     21   },
     22   "scan_version": 2,
     23   "active_modules": ["experimental_rigor", "data_leakage"],
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper says 'We will open-source our code, data, and generated research ideas' (Section 1) — future tense constitutes a promise, not an actual release. The GitHub URL is provided but 'will' indicates it was not released at time of writing."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The paper uses papers collected from Paper Copilot (publicly available) from ICLR 24, COLM 24, COLT 24, ACL 24/23/22, which are public conference proceedings. The element extraction data is promised for release but the source data is public."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No environment specifications, requirements files, or dependency details are provided. Only 'Scikit-Learn' is mentioned for TF-IDF/t-SNE (Appendix A.6)."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No step-by-step reproduction instructions are provided. The pipeline is described conceptually but there are no runnable commands or scripts documented."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Table 4 reports only point estimates for diversity, similarity, and relevance metrics with no confidence intervals or error bars."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper compares four ideation methods (Table 4) and claims differences in diversity/similarity/relevance but uses no statistical tests to support these comparisons."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Raw metric values are reported but no effect sizes, percentage improvements with baseline context, or magnitude-of-difference measures are provided."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "100 ideas per method are used for comparison (Table 4), and 2000 papers are sampled from ICLR 24, but no justification is given for why these sample sizes are adequate."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No variance, standard deviation, or spread measures are reported for any experimental results."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper compares against Si et al. (2024) and Yu et al. (2024) as baselines for ideation quality (Table 4, Section 4.2)."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Both baselines — Si et al. (2024) and Yu et al. (2024) — are recent and represent current state-of-the-art in LLM-based ideation."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Two variants of the thinking machine are compared: Ramón Llull (Top) using most-visited elements and Ramón Llull (Random) using random sampling, which tests the effect of element selection strategy (Table 4)."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Three metrics are used: diversity (distinct-1), similarity (Jaccard), and relevance (BLEU) (Section 4.2, Table 4)."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No human evaluation of generated ideas is performed. The paper explicitly acknowledges this gap: 'Rigorous idea quality evaluation may involve extensive expert annotation' (footnote 7) and discusses it in Limitations (A.1)."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "ACL 2025 main paper titles (released after June 2025) are used as the reference set for similarity and relevance, which post-dates the models used (Gemini 1.5 Pro released Feb 2024), providing a temporal hold-out (Section 4.2)."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Results are broken down per conference (Table 2, Table 5, Figure 2) and per element type (Theme, Domain, Method), with qualitative comparisons in Table 3."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 4.4 discusses what is NOT covered by the A+B+C framework (perturbation, 4th axis, negation), and the bijective coverage analysis (Table 5) shows only 16.4% reconstructibility, which is discussed as a limitation."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The low reconstructibility rate (16.4%, Table 5) is reported and discussed honestly. The paper also reports that Ramón Llull (Random) has much lower relevance (0.05) compared to prior methods (0.28, 0.18)."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The abstract claims ideas are 'diverse, relevant, and grounded in current literature.' Table 4 shows diversity and relevance metrics; the element mining from conference papers provides grounding. The claims are hedged appropriately."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper implicitly claims that conditioning on explicit concept combinations 'helps' diversity (Section 1) and that random sampling 'leads to' higher diversity (Section 4.2). These are causal claims from observational comparisons without controlling for confounds (different generation processes, different models for rewriting)."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper bounds its claims to ML/NLP conferences and notes the framework's limitations. Footnote 2 explicitly states diversity and coverage 'do not necessarily suggest the novelty and utility of the ideas.' Section 4.4 discusses what isn't covered."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper does not discuss alternative explanations for its metric results. For instance, the higher diversity of Ramón Llull (Random) could be due to random word combinations rather than genuinely diverse ideas. No confounds are discussed."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper is careful to distinguish its metrics (diversity, similarity, relevance) from actual idea quality. Footnote 7 states 'The results from our metrics do not suggest the superior quality of any method.' Appendix A.1 discusses the gap between lexical diversity and scientific merit."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper uses 'Gemini 2.0 Flash', 'Gemini-1.5 Pro', and 'Claude 3.7' without specific version IDs or snapshot dates. These are marketing names only."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Full prompts are provided in Appendix A.6 for element extraction, element merging, idea rewriting, and in A.8 for decomposition and reconstruction evaluation. These are complete, usable prompts."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Appendix A.6 states: 'max output token to be 8192, temperature to be 0.7, top p to be 0.7, and top k to be 50.'"
    163       },
    164       "scaffolding_described": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "No agentic scaffolding is used. The pipeline consists of direct LLM API calls for extraction, merging, and rewriting."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "The element mining pipeline is described: papers collected from Paper Copilot, processed by Gemini for element extraction, then merged based on semantic similarity. Table 2 provides counts at each stage. Section 3.2 describes the pipeline."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Appendix A.1 contains a dedicated limitations section with two substantive subsections: 'Evaluating idea quality and novelty' and 'Organizing the elements.'"
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The limitations discuss specific issues: lexically unique ideas may be conceptually trivial, high relevance may indicate incrementality rather than novelty, LLM-as-judge may favor well-phrased but shallow ideas, and flat element lists lose hierarchical structure (Appendix A.1)."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 4.4 explicitly identifies dimensions NOT covered by the framework (perturbation, 4th axis, negation). Footnote 2 states what the metrics do NOT measure. The paper states 'the proposed Ramón Llull thinking machine is NOT intended to' with specific exclusions (Section 5)."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "The extracted elements, templates, and generated ideas are promised for future release but not available at time of writing. Only aggregated statistics (Table 2) are shown."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Papers were collected from Paper Copilot for six specific conferences. 2000 papers randomly sampled from ICLR 24. Processing used Gemini 2.0 Flash with documented prompts (Section 3.2, Appendix A.6)."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": false,
    205         "answer": false,
    206         "justification": "No human participants are studied. The PhD student annotators in the pilot (Section 3.2) are contributors, not study subjects. Data comes from public conference proceedings."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "The pipeline from paper collection → element extraction → merging → template combination → LLM rewriting is documented with prompts (A.6), statistics (Table 2), and processing details."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Acknowledgments section lists ONR Award N000142312840, OpenAI Research Credit program, Amazon AI Research Gift Fund, and Gemma Academic Program GCP Credit Award."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "All author affiliations are listed: CMU, OSU, Stanford, UIUC, UT Dallas, UW."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The work is funded in part by the OpenAI Research Credit program and Google's Gemma Academic Program GCP Credit Award, and the pipeline uses Gemini models. Both OpenAI and Google have financial interests in LLM ideation capabilities. The paper does not acknowledge this potential conflict."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests or financial interests statement is present in the paper."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The paper notes Gemini 1.5 Pro was released Feb 2024 to justify using ACL 2025 as reference, but does not state training cutoff dates for Gemini 2.0 Flash or Claude 3.7, which are used for element extraction and idea generation."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": true,
    245         "justification": "Section 4.2 explicitly addresses this: 'the accepted papers are released after June 2025. To reduce the chance of LLMs seeing the paper titles in their training data (Gemini 1.5 Pro was released Feb 2024).' Temporal separation is used as a mitigation."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "While temporal separation is used for the ACL 2025 reference set, the element extraction from ICLR/ACL/COLT/COLM papers (2022-2024) is done with models that likely trained on these papers. This contamination of the element mining step is not discussed."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are studied."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are studied."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are studied."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants are studied."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants are studied."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants are studied."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants are studied."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "The pipeline processes 7,483 papers through multiple LLM calls (extraction, merging, rewriting, evaluation) but no API costs, token counts, or wall-clock times are reported."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "No information about total API spend, compute resources, or processing time is provided despite extensive LLM usage across the pipeline."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No seed sensitivity analysis is reported. LLM generation with temperature 0.7 is inherently stochastic but no multiple-run analysis is performed."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "It is unclear how many times the LLM generation was run. Results appear to be from single runs with no repetition stated."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Temperature 0.7, top_p 0.7, top_k 50 are used but no justification or search process for these settings is described."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The choice of top-20 elements per disk, the basic A+B+C template for visualization, and the sampling of 100 ideas for comparison are not justified."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The authors compare their own system against baselines from Si et al. (2024) and Yu et al. (2024) without acknowledging potential bias in how the comparison is set up (e.g., choice of metrics, reference set)."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "Different methods likely have very different compute costs (e.g., Ramón Llull with simple template fills vs. multi-agent simulation in Yu et al.) but no compute-performance comparison is made."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": true,
    341         "justification": "The paper discusses construct validity extensively: footnote 7 notes metrics don't measure quality, Appendix A.1 discusses how diversity/relevance can be misleading, and Section 4.4 discusses what the framework cannot capture."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "No agentic scaffolding is involved in the evaluation."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": true,
    353         "justification": "The paper uses ACL 2025 titles (released after June 2025) as the reference set and explicitly notes that Gemini 1.5 Pro was released in Feb 2024, establishing temporal separation between the model and the reference data (Section 4.2)."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "The element extraction uses LLMs that likely have the source conference papers in their training data. When Gemini extracts elements from ICLR 2024 papers, it may leverage memorized content rather than extracting from the provided text. This is not discussed."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "Elements are mined from the same conferences used in evaluation (e.g., elements from ACL 2024 compared against ACL 2025). Papers within conferences share topics, authors, and citation networks, creating non-independence that is not addressed."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "No concrete leakage detection method is applied. Only temporal separation (using a reference set published after the model's release) is used, which is a prevention strategy for the reference set but not for the element mining step."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Near-universal decomposability (99.5%): almost all research papers can be decomposed into the A+B+C (Theme, Domain, Method) framework.",
    375       "evidence": "Table 5 shows 99.5% decomposition rate across 7,483 papers from six conferences (Section 4.3).",
    376       "supported": "weak"
    377     },
    378     {
    379       "claim": "Reconstructibility is limited (16.4%): only a fraction of papers can be faithfully reconstructed from their elements.",
    380       "evidence": "Table 5 shows 16.4% reconstruction rate using ≥30% Jaccard similarity threshold (Section 4.3).",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Ramón Llull (Random) achieves the highest diversity (0.41 distinct-1) among compared methods.",
    385       "evidence": "Table 4 compares four methods: Si et al. (0.29), Yu et al. (0.29), Ramón Llull Top (0.21), Ramón Llull Random (0.41).",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "There is a trade-off between diversity and similarity/relevance in the ideation methods.",
    390       "evidence": "Table 4 shows Ramón Llull (Random) has highest diversity (0.41) but lowest relevance (0.05), while Ramón Llull (Top) has highest similarity (0.26) but lowest diversity (0.21).",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Different conferences occupy different regions of the idea space, with COLT being most standalone and COLM lying at the intersection.",
    395       "evidence": "Figure 2 shows t-SNE density heatmaps of generated ideas by conference, with qualitative analysis in Section 4.1.",
    396       "supported": "weak"
    397     }
    398   ],
    399   "methodology_tags": ["benchmark-eval", "qualitative"],
    400   "key_findings": "The paper proposes a combinatorial ideation framework based on Ramón Llull's thinking machine, using Theme/Domain/Method elements mined from 7,483 conference papers. The framework achieves 99.5% decomposability but only 16.4% reconstructibility, suggesting research ideation is partially but not fully combinatorial. Random element sampling produces more diverse but less relevant ideas than top-element enumeration, revealing a diversity-relevance trade-off. The approach serves as a lightweight baseline for LLM ideation research.",
    401   "red_flags": [
    402     {
    403       "flag": "Decomposability metric is trivially high",
    404       "detail": "99.5% decomposability is evaluated by asking Gemini whether it can assign theme/domain/method labels to paper titles. Given these are very broad categories, nearly anything can be tagged with some theme/domain/method. This near-100% rate reveals more about the breadth of the categories than about the framework's validity."
    405     },
    406     {
    407       "flag": "No statistical rigor in comparisons",
    408       "detail": "Table 4 compares four ideation methods on three metrics with no error bars, no significance tests, and no repeated runs. With stochastic LLM generation (temperature 0.7), single-run comparisons are unreliable."
    409     },
    410     {
    411       "flag": "Evaluation metrics may not measure idea quality",
    412       "detail": "The paper acknowledges this (footnote 7, Appendix A.1), but the entire experimental section is built on these metrics. Distinct-1 diversity, Jaccard similarity, and BLEU relevance are lexical metrics that say nothing about scientific merit, feasibility, or novelty of generated ideas."
    413     },
    414     {
    415       "flag": "Unfair baseline comparison",
    416       "detail": "Si et al. (2024) ideas were carefully filtered for quality (93 ideas on 7 topics), while Yu et al. (2024) used multi-agent discussion. The Ramón Llull pipeline uses simple template filling. Comparing these very different generation processes on lexical diversity/similarity metrics without controlling for generation effort or filtering is misleading."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "Can LLMs Generate Novel Research Ideas? A Large-Scale Human Study with 100+ NLP Researchers",
    422       "authors": ["Chenglei Si", "Diyi Yang", "Tatsunori Hashimoto"],
    423       "year": 2024,
    424       "relevance": "Directly relevant as a major baseline for LLM-based research ideation with human evaluation."
    425     },
    426     {
    427       "title": "The Ideation-Execution Gap: Execution Outcomes of LLM-Generated versus Human Research Ideas",
    428       "authors": ["Chenglei Si", "Tatsunori Hashimoto", "Diyi Yang"],
    429       "year": 2025,
    430       "arxiv_id": "2506.20803",
    431       "relevance": "Studies execution outcomes of LLM-generated research ideas, directly relevant to evaluating AI-assisted ideation."
    432     },
    433     {
    434       "title": "ResearchTown: Simulator of Human Research Community",
    435       "authors": ["Haofei Yu"],
    436       "year": 2024,
    437       "relevance": "Multi-agent community simulation for research ideation, a baseline compared in this paper."
    438     },
    439     {
    440       "title": "ResearchAgent: Iterative Research Idea Generation over Scientific Literature with Large Language Models",
    441       "authors": ["Jinheon Baek", "Sujay Kumar Jauhar", "Silviu Cucerzan", "Sung Ju Hwang"],
    442       "year": 2025,
    443       "relevance": "Iterative LLM-based ideation with knowledge augmentation, part of the automatic ideation landscape."
    444     },
    445     {
    446       "title": "CodeScientist: End-to-End Semi-Automated Scientific Discovery with Code-Based Experimentation",
    447       "authors": ["Peter Jansen"],
    448       "year": 2025,
    449       "arxiv_id": "2503.22708",
    450       "relevance": "End-to-end automated scientific discovery system, related to AI-assisted research workflows."
    451     },
    452     {
    453       "title": "Scideator: Human-LLM Scientific Idea Generation Grounded in Research-Paper Facet Recombination",
    454       "authors": ["Marissa Radensky"],
    455       "year": 2024,
    456       "relevance": "Concurrent work on facet-based recombination for ideation, closely related approach to this paper."
    457     },
    458     {
    459       "title": "The Impact of Large Language Models on Scientific Discovery: A Preliminary Study Using GPT-4",
    460       "authors": ["Microsoft Research AI4Science", "Microsoft Quantum"],
    461       "year": 2023,
    462       "relevance": "Early study of LLM impact on scientific discovery, foundational to the AI-for-science direction."
    463     },
    464     {
    465       "title": "AI2 Scholar QA: Organized Literature Synthesis with Attribution",
    466       "authors": ["Amanpreet Singh"],
    467       "year": 2025,
    468       "arxiv_id": "2504.10861",
    469       "relevance": "AI-powered literature synthesis tool, relevant to automated research workflows."
    470     }
    471   ]
    472 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs