scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (18661B)
      1 {
      2   "paper": {
      3     "title": "ChatGPT is not all you need. A State of the Art Review of large Generative AI models",
      4     "authors": [
      5       "Roberto Gozalo-Brizuela",
      6       "Eduardo C. Garrido-Merchán"
      7     ],
      8     "year": 2023,
      9     "venue": "arXiv",
     10     "arxiv_id": "2301.04655"
     11   },
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": false,
     17         "justification": "No code, analysis scripts, or repository URL is provided for the survey itself. The paper references APIs/GitHub links for the individual models reviewed, but releases no artifacts of its own."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "No dataset or structured data from the survey is released. The paper does not provide a downloadable corpus of reviewed models or any structured extraction."
     23       },
     24       "environment_specified": {
     25         "applies": false,
     26         "answer": false,
     27         "justification": "This is a narrative survey/review paper with no computational experiments, so environment specifications are structurally inapplicable."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No instructions are provided for reproducing the survey methodology — there is no description of search strategy, inclusion/exclusion criteria, or how models were selected for review."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": false,
     38         "answer": false,
     39         "justification": "This is a narrative survey that does not run experiments or perform statistical aggregation of results."
     40       },
     41       "significance_tests": {
     42         "applies": false,
     43         "answer": false,
     44         "justification": "No experiments are conducted and no comparative statistical claims are made by the authors themselves."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": false,
     48         "answer": false,
     49         "justification": "No experiments are conducted; the paper only describes models qualitatively."
     50       },
     51       "sample_size_justified": {
     52         "applies": false,
     53         "answer": false,
     54         "justification": "No experiments or data collection is performed by the authors."
     55       },
     56       "variance_reported": {
     57         "applies": false,
     58         "answer": false,
     59         "justification": "No experiments are run; the paper is a descriptive survey."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper does not compare against prior surveys or reviews of generative AI models. It presents its taxonomy without reference to alternative organizational schemes."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "The paper compares itself against no prior surveys, so there are no baselines whose recency could be assessed."
     72       },
     73       "ablation_study": {
     74         "applies": false,
     75         "answer": false,
     76         "justification": "This is a survey paper with no system components to ablate."
     77       },
     78       "multiple_metrics": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "No evaluation metrics are used; the paper is a descriptive taxonomy."
     82       },
     83       "human_evaluation": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No evaluation of the survey's own outputs is conducted, and no claims about output quality require human evaluation."
     87       },
     88       "held_out_test_set": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "No experiments requiring train/test splits are conducted."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "The paper organizes models into 9 categories (text-to-image, text-to-3D, image-to-text, text-to-video, text-to-audio, text-to-text, text-to-code, text-to-science, other) and discusses each category separately in Section 3."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 4 (Conclusions and further work) discusses limitations of generative AI models including dataset scarcity, computational cost, bias, accuracy issues, and ethical concerns like deepfakes."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "No negative results from the authors' own analysis are reported. The survey does not identify models that failed to deliver on claims or areas where the taxonomy breaks down."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The abstract claims the paper provides a taxonomy and describes the main models and sectors affected by generative AI. The paper delivers on this descriptive promise through Figures 1-3 and the detailed section-by-section review."
    114       },
    115       "causal_claims_justified": {
    116         "applies": false,
    117         "answer": false,
    118         "justification": "The paper makes no causal claims. It is a descriptive survey of existing models."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The paper makes broad claims like 'generative AI models will not replace humans but enhance our content' (Section 1) and 'the implications that these generative models have in the industry and society are enormous' without bounding these to specific contexts or evidence."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": false,
    127         "answer": false,
    128         "justification": "This is a descriptive survey/taxonomy with no empirical results of its own, so alternative explanations are not applicable."
    129       }
    130     },
    131     "setup_transparency": {
    132       "model_versions_specified": {
    133         "applies": false,
    134         "answer": false,
    135         "justification": "The paper does not use any models in experiments. It describes models in a review capacity."
    136       },
    137       "prompts_provided": {
    138         "applies": false,
    139         "answer": false,
    140         "justification": "No prompting is used in the paper's own methodology."
    141       },
    142       "hyperparameters_reported": {
    143         "applies": false,
    144         "answer": false,
    145         "justification": "No experiments are conducted that require hyperparameter reporting."
    146       },
    147       "scaffolding_described": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "No agentic scaffolding is used."
    151       },
    152       "data_preprocessing_documented": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "The paper does not describe any search strategy, database queries, inclusion/exclusion criteria, or paper selection pipeline. There is no mention of how the models were identified or selected for the taxonomy."
    156       }
    157     },
    158     "limitations_and_scope": {
    159       "limitations_section_present": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 4 (Conclusions and further work) contains substantial discussion of limitations of generative AI models, covering dataset issues, computational cost, accuracy, bias, and ethical concerns."
    163       },
    164       "threats_to_validity_specific": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "The limitations discussed in Section 4 are about the reviewed models, not about the survey methodology itself. There is no discussion of threats to the validity of the survey's own analysis, such as selection bias in which models were included."
    168       },
    169       "scope_boundaries_stated": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "The paper does not clearly delimit its scope. Its only explicit exclusion is the remark 'We do not study the technical aspects of every model'; it does not explain why certain models were included or excluded, or which domains are out of scope."
    173       }
    174     },
    175     "data_integrity": {
    176       "raw_data_available": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No raw data, model lists, or structured survey data is made available for verification."
    180       },
    181       "data_collection_described": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The paper does not describe how the models were identified or selected for inclusion. No search methodology, database queries, or systematic selection process is described."
    185       },
    186       "recruitment_methods_described": {
    187         "applies": false,
    188         "answer": false,
    189         "justification": "No human participants are involved; this is a literature survey."
    190       },
    191       "data_pipeline_documented": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "No data pipeline is documented. The paper does not explain how it went from the universe of generative AI models to the specific set reviewed."
    195       }
    196     },
    197     "conflicts_of_interest": {
    198       "funding_disclosed": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No funding source or acknowledgments section is present in the paper."
    202       },
    203       "affiliations_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Author affiliations are clearly stated: Quantitative Methods Department, Universidad Pontificia Comillas, Madrid, Spain."
    207       },
    208       "funder_independent_of_outcome": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "No funding is disclosed, so independence of the funder cannot be assessed. The absence of a funding disclosure is itself a transparency gap."
    212       },
    213       "financial_interests_declared": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No competing interests or financial interests statement is provided anywhere in the paper."
    217       }
    218     },
    219     "contamination": {
    220       "training_cutoff_stated": {
    221         "applies": false,
    222         "answer": false,
    223         "justification": "The paper does not evaluate any pre-trained model on a benchmark. It is a descriptive survey."
    224       },
    225       "train_test_overlap_discussed": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "No benchmark evaluation is conducted by the authors."
    229       },
    230       "benchmark_contamination_addressed": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "No benchmark evaluation is conducted by the authors."
    234       }
    235     },
    236     "human_studies": {
    237       "pre_registered": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No human participants are involved in this survey."
    241       },
    242       "irb_or_ethics_approval": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants are involved."
    246       },
    247       "demographics_reported": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants are involved."
    251       },
    252       "inclusion_exclusion_criteria": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants are involved."
    256       },
    257       "randomization_described": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants or experimental conditions."
    261       },
    262       "blinding_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants or experimental conditions."
    266       },
    267       "attrition_reported": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants are involved."
    271       }
    272     },
    273     "cost_and_practicality": {
    274       "inference_cost_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "This is a survey paper with no method of its own whose cost would need reporting."
    278       },
    279       "compute_budget_stated": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "This is a survey paper with no computational experiments."
    283       }
    284     }
    285   },
    286   "claims": [
    287     {
    288       "claim": "Generative AI models are revolutionizing several industries including art and education.",
    289       "evidence": "Stated in the introduction (Section 1) with citations to Anantrasirichai & Bull (2021) on creative industries and Kandlhofer et al. (2016) on education, but no original evidence is presented.",
    290       "supported": "weak"
    291     },
    292     {
    293       "claim": "Only six organizations are behind the deployment of the major generative AI models.",
    294       "evidence": "Illustrated in Figure 3, which maps models to organizations (Google/DeepMind, OpenAI, Meta AI, NVIDIA, Runway, and university collaborations).",
    295       "supported": "moderate"
    296     },
    297     {
    298       "claim": "Generative AI models will not replace humans but enhance our content.",
    299       "evidence": "Stated in Section 1 as an assertion without supporting evidence. No empirical basis is provided for this claim.",
    300       "supported": "unsupported"
    301     },
    302     {
    303       "claim": "The taxonomy covers 9 categories of generative AI models organized by input/output modality.",
    304       "evidence": "Figure 1 presents the taxonomy; Sections 3.1-3.9 describe each category in detail.",
    305       "supported": "strong"
    306     }
    307   ],
    308   "methodology_tags": [
    309     "qualitative"
    310   ],
    311   "key_findings": "This paper provides a narrative taxonomy of generative AI models organized by input-output modality, with 9 categories: text-to-image, text-to-3D, text-to-video, text-to-audio, text-to-text, text-to-code, text-to-science, image-to-text, and other. It identifies six organizations behind the major models and describes approximately 20 models published primarily in 2022. The paper notes limitations including dataset scarcity, computational cost, bias, accuracy concerns, and ethical risks like deepfakes.",
    312   "red_flags": [
    313     {
    314       "flag": "No systematic review methodology",
    315       "detail": "The paper presents itself as a 'state of the art review' but has no documented search strategy, inclusion/exclusion criteria, or systematic process for model selection. The selection of models appears ad-hoc with no justification for why these specific models were chosen over alternatives."
    316     },
    317     {
    318       "flag": "Uncritical quality assessment",
    319       "detail": "The paper describes each model's claims and architecture without any critical evaluation of the evidence supporting those claims. It reproduces marketing descriptions and self-reported results without independent verification."
    320     },
    321     {
    322       "flag": "Unbounded generalizations",
    323       "detail": "Claims like 'generative AI models will not replace humans but enhance our content' and 'the implications that these generative models have in the industry and society are enormous' are made without empirical support or scope boundaries."
    324     },
    325     {
    326       "flag": "Shallow technical depth",
    327       "detail": "The paper explicitly states it does not study technical aspects, resulting in descriptions that largely paraphrase the original papers' abstracts and introductions without deeper analysis."
    328     },
    329     {
    330       "flag": "Survey laundering risk",
    331       "detail": "By summarizing model claims without quality assessment, the survey risks laundering unverified or overstated claims from the original papers into an apparently authoritative review."
    332     }
    333   ],
    334   "cited_papers": [
    335     {
    336       "title": "Evaluating large language models trained on code",
    337       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    338       "year": 2021,
    339       "arxiv_id": "2107.03374",
    340       "relevance": "Foundational paper on Codex and LLM code generation evaluation, directly relevant to the survey's code generation scope."
    341     },
    342     {
    343       "title": "Flamingo: a visual language model for few-shot learning",
    344       "authors": ["Jean-Baptiste Alayrac", "Jeff Donahue", "Pauline Luc"],
    345       "year": 2022,
    346       "arxiv_id": "2204.14198",
    347       "relevance": "Major multimodal model combining vision and language, relevant to LLM capability evaluation."
    348     },
    349     {
    350       "title": "Competition-level code generation with AlphaCode",
    351       "authors": ["Yujia Li", "David Choi", "Junyoung Chung"],
    352       "year": 2022,
    353       "relevance": "Evaluates code generation on competition problems requiring deeper reasoning, relevant to LLM coding benchmarks."
    354     },
    355     {
    356       "title": "LaMDA: Language Models for Dialog Applications",
    357       "authors": ["Romal Thoppilan", "Daniel De Freitas", "Jamie Hall"],
    358       "year": 2022,
    359       "arxiv_id": "2201.08239",
    360       "relevance": "Major language model for dialog with safety and factual grounding components, relevant to LLM safety and capability."
    361     },
    362     {
    363       "title": "A generalist agent",
    364       "authors": ["Scott Reed", "Konrad Zolna", "Emilio Parisotto"],
    365       "year": 2022,
    366       "arxiv_id": "2205.06175",
    367       "relevance": "GATO multimodal multi-task agent, relevant to agentic AI and generalist model evaluation."
    368     },
    369     {
    370       "title": "Robust speech recognition via large-scale weak supervision",
    371       "authors": ["Alec Radford", "Jong Wook Kim", "Tao Xu"],
    372       "year": 2022,
    373       "arxiv_id": "2212.04356",
    374       "relevance": "Whisper model for speech recognition, relevant to LLM multimodal capability evaluation."
    375     },
    376     {
    377       "title": "Learning transferable visual models from natural language supervision",
    378       "authors": ["Alec Radford", "Jong Wook Kim", "Chris Hallacy"],
    379       "year": 2021,
    380       "relevance": "CLIP model enabling text-image alignment, foundational to many generative AI systems reviewed."
    381     },
    382     {
    383       "title": "Discovering faster matrix multiplication algorithms with reinforcement learning",
    384       "authors": ["Alhussein Fawzi", "Matej Balog", "Aja Huang"],
    385       "year": 2022,
    386       "relevance": "AlphaTensor using RL for algorithm discovery, relevant to AI capability for automated reasoning."
    387     },
    388     {
    389       "title": "ChatGPT: The end of online exam integrity?",
    390       "authors": ["Teo Susnjak"],
    391       "year": 2022,
    392       "arxiv_id": "2212.09292",
    393       "relevance": "Early analysis of ChatGPT's impact on education, relevant to LLM societal impact assessment."
    394     },
    395     {
    396       "title": "PEER: A Collaborative Language Model",
    397       "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Zhengbao Jiang"],
    398       "year": 2022,
    399       "arxiv_id": "2208.11663",
    400       "relevance": "Collaborative writing model trained on edit histories, relevant to LLM-assisted content generation."
    401     }
    402   ]
    403 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs