scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22616B)
      1 {
      2   "paper": {
      3     "title": "Advancements in Generative AI: A Comprehensive Review of GANs, GPT, Autoencoders, Diffusion Model, and Transformers",
      4     "authors": [
      5       "Staphord Bengesi",
      6       "Hoda El-Sayed",
      7       "Md Kamruzzaman Sarker",
      8       "Yao Houkpati",
      9       "John Irungu",
     10       "Timothy Oladunni"
     11     ],
     12     "year": 2023,
     13     "venue": "IEEE Access"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "No code repository or analysis scripts are linked or mentioned. The paper provides no URL or archive for any code."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No dataset is released. The informal comparisons (e.g., ChatGPT vs. Bard screenshots) involve no released data. The survey itself does not release its corpus."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No environment specifications, library versions, or setup instructions are provided anywhere in the paper."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No reproduction instructions are provided. The paper contains no README, scripts, or step-by-step directions to replicate any of its informal experiments."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": false,
     41         "answer": false,
     42         "justification": "This is a descriptive survey paper; it does not run statistical experiments or report quantitative results that would require confidence intervals."
     43       },
     44       "significance_tests": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "No comparative statistical claims are made via formal experiments; significance tests are not applicable to a descriptive survey."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": false,
     51         "answer": false,
     52         "justification": "No formal experiments are run; effect sizes are not applicable to a descriptive overview paper."
     53       },
     54       "sample_size_justified": {
     55         "applies": false,
     56         "answer": false,
     57         "justification": "No experimental sample is used; the paper is a descriptive survey and does not run controlled experiments."
     58       },
     59       "variance_reported": {
     60         "applies": false,
     61         "answer": false,
     62         "justification": "No repeated experiments are conducted; variance reporting is not applicable to this descriptive survey."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The paper does not compare against any prior survey of generative AI. Its informal experiments (comparing ChatGPT vs. Bard on 3 questions, or Firefly vs. Stable Diffusion on one image prompt) lack controlled baselines and proper methodology."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No baselines are included at all, so they cannot be contemporary. The survey could have compared against prior surveys such as the Zhang et al. 2023 survey it cites, but it does not. The criterion applies because a survey paper can and should include contemporary baselines."
     75       },
     76       "ablation_study": {
     77         "applies": false,
     78         "answer": false,
     79         "justification": "No system is proposed or evaluated; this is a survey paper with no novel system to ablate."
     80       },
     81       "multiple_metrics": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "No formal evaluation with metrics is conducted; the paper contains only informal visual/anecdotal comparisons."
     85       },
     86       "human_evaluation": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "No formal human evaluation of system outputs is performed; the informal comparisons involve the authors' own subjective impressions, which do not constitute a structured human evaluation."
     90       },
     91       "held_out_test_set": {
     92         "applies": false,
     93         "answer": false,
     94         "justification": "No formal train/test split or held-out test set is relevant to this descriptive survey."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The paper is organized by category (GANs, GPT, Autoencoders, Diffusion Models, Transformers; and by task type: text, image, video, code generation), providing breakdowns across these categories."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper discusses failure modes of GANs (mode collapse, non-convergence, instability) in Section II.C.2 and discusses broader challenges of generative AI in Section V."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper describes known technical limitations and failure modes (e.g., GAN mode collapse, gradient vanishing) and social harms (misinformation, privacy), which constitute acknowledgment of negative results from the broader literature."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "The abstract's benchmarking claim (worded in the conclusion as having 'benchmarked state-of-the-art tools against Generative AI') is backed only by informal screenshot comparisons. The claim that 'ChatGPT outperformed Bard' (Section III.A) is based on three unstructured questions with no methodology, which does not support the implicit claim of systematic benchmarking."
    117       },
    118       "causal_claims_justified": {
    119         "applies": false,
    120         "answer": false,
    121         "justification": "The paper makes no causal claims; it is a descriptive survey and overview without causal inference language directed at its own experimental findings."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper makes broad claims about 'Generative AI' across industries based on informal experiments with a handful of tools and screenshots. The title and abstract claim a 'comprehensive review', but the coverage is selective and the claims are not bounded to the specific tools or settings tested."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "No alternative explanations for any of the paper's informal empirical comparisons are considered. The conclusion that 'ChatGPT outperformed Bard' based on 3 queries is presented without discussing alternative interpretations or confounds."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper refers to 'ChatGPT', 'Bard', 'Stable Diffusion', and 'Firefly' in its informal experiments without specifying exact model versions, API versions, or snapshot dates."
    139       },
    140       "prompts_provided": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "For the text generation comparison, the three prompts used are explicitly stated: 'Provide a brief description of what Bard is in one paragraph', 'Provide a brief description of what ChatGPT is in one paragraph', and 'Habari za saa hizi' (Section III.A). For image generation, the prompt 'College Student Programming' is provided (Section III.B)."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "No hyperparameters (temperature, top-p, etc.) are reported for any of the model comparisons. No API settings are described."
    149       },
    150       "scaffolding_described": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "No agentic scaffolding is used. The paper directly uses commercial tools as black boxes, so this criterion does not apply."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "No systematic literature search methodology is described. There are no filtering stages, search criteria, database queries, or inclusion/exclusion criteria. The paper does not document how papers were selected for review."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion and Section V discuss challenges of generative AI generally but not methodological limitations of this paper itself."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "No specific threats to validity of the paper's own claims or informal experiments are discussed. There is no acknowledgment that comparing ChatGPT and Bard on 3 questions is insufficient to support performance claims."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "The paper does not explicitly state what it does NOT cover or what its scope boundaries are. It presents itself as a 'comprehensive review' without bounding its claims to specific models, time periods, or domains."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No raw data (e.g., model outputs, screenshots in raw form, or the paper corpus for the survey) is made available for independent verification."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "No systematic data collection procedure is described. The paper does not explain how papers or tools were identified and selected for inclusion in the review."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": false,
    191         "answer": false,
    192         "justification": "No human participants are involved; this criterion does not apply to a survey paper that collects no human subjects data."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "There is no documented pipeline from paper collection to final analysis. No search queries, databases, or filtering steps are described."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section mentioning grants or sponsors."
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Author affiliations are clearly listed on the first page: Bowie State University, Morgan State University, and University of the District of Columbia. None are affiliated with the companies whose tools are evaluated."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": false,
    213         "answer": false,
    214         "justification": "No funding is disclosed, making it unclear if there is any funder; this criterion cannot be assessed."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "There is no competing interests statement or financial disclosure in the paper. Absence of declaration is not the same as absence of conflict."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "This is a survey paper that does not evaluate a pre-trained model's capability on a benchmark. The informal comparisons are not benchmark evaluations, so contamination criteria do not apply."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "No benchmark evaluation of a pre-trained model is conducted; contamination analysis is not applicable."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "No benchmark is used to evaluate model capability; contamination is not applicable to this survey paper."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants are involved in this survey paper."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants are involved in this survey paper."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are involved in this survey paper."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are involved in this survey paper."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants are involved in this survey paper."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants are involved in this survey paper."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants are involved in this survey paper."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "This is a survey paper and does not propose a method requiring cost evaluation. Per the schema, cost_and_practicality items are NA for survey papers."
    281       },
    282       "compute_budget_stated": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "This is a survey paper and does not propose a method requiring computational budget reporting."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "ChatGPT outperformed Bard in delivering more precise answers to questions including self-descriptions and a Swahili question.",
    292       "evidence": "Section III.A presents Figure 9 showing screenshots of ChatGPT and Bard outputs for three prompts. The authors state 'results unmistakably indicate that ChatGPT outperformed Bard in delivering more precise answers.'",
    293       "supported": "weak"
    294     },
    295     {
    296       "claim": "Adobe Firefly excelled in precision while Stable Diffusion exhibited superior image resolution for text-to-image generation.",
    297       "evidence": "Section III.B shows Figure 10 with images generated by Firefly and Stable Diffusion using the prompt 'College Student Programming'. No quantitative metrics or human rater data are provided.",
    298       "supported": "weak"
    299     },
    300     {
    301       "claim": "Generative AI will generate $137 billion in 2023 and is expected to surge to $1.3 trillion by 2030.",
    302       "evidence": "Section IV.D cites Bloomberg Intelligence [153] for this economic projection, which is third-party market analysis rather than the authors' own research.",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "GPT-4 outperformed all other state-of-the-art models on Measuring Massive Multitask Language Understanding (MMLU) covering 57 tasks.",
    307       "evidence": "Section II.B.4 states GPT-4 'outperformed them all' on MMLU, citing the GPT-4 technical report [90]. This is consistent with OpenAI's published benchmarks, but the paper relies entirely on the model developer's self-reported results.",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "84% of the U.S. workforce occupies positions with potential to leverage Generative AI for automating repetitive tasks.",
    312       "evidence": "Section V.B cites McKinsey's analysis [160] and two other sources [161][162]. This is third-party research cited without critical evaluation.",
    313       "supported": "moderate"
    314     }
    315   ],
    316   "methodology_tags": [
    317     "qualitative",
    318     "case-study"
    319   ],
    320   "key_findings": "This paper provides a descriptive overview of major generative AI model families (GANs, VAEs, Transformers/GPT, Diffusion models) along with a catalog of 73 generative AI tools and their applications across seven task categories. The authors conducted informal comparisons of ChatGPT vs. Bard on three prompts and Stable Diffusion vs. Adobe Firefly on one image prompt, concluding that ChatGPT gave more precise answers than Bard and that Firefly excelled in precision while Stable Diffusion produced higher-resolution images, though without rigorous methodology. The paper also surveys industrial applications in media, education, healthcare, and business, and discusses future prospects and concerns including privacy, security, and job displacement.",
    321   "red_flags": [
    322     {
    323       "flag": "Informal experiments presented as benchmarking",
    324       "detail": "Section III presents screenshots of ChatGPT vs. Bard on 3 unstructured questions and of Firefly vs. Stable Diffusion on one image generation prompt as 'performance assessments.' These lack methodology, metrics, controlled conditions, or any statistical basis. The conclusion that 'ChatGPT outperformed Bard' is unsupported by this evidence."
    325     },
    326     {
    327       "flag": "No systematic literature review methodology",
    328       "detail": "The paper is presented as a 'comprehensive review' but provides no search queries, database sources, inclusion/exclusion criteria, or PRISMA-style flow diagram. It is a narrative overview rather than a systematic review, making coverage and selection criteria opaque."
    329     },
    330     {
    331       "flag": "Laundering unreviewed claims",
    332       "detail": "The paper presents capabilities of various tools (GPT-4, Bard, Stable Diffusion, etc.) based on developer reports and non-peer-reviewed sources without critical evaluation. Claims from OpenAI, Google, and Meta technical reports are accepted at face value."
    333     },
    334     {
    335       "flag": "No funding or competing interests disclosure",
    336       "detail": "The paper has no acknowledgments section and makes no disclosure of funding sources or competing interests, preventing assessment of potential conflicts."
    337     },
    338     {
    339       "flag": "Overly broad title and scope claims",
    340       "detail": "The title claims 'Comprehensive Review' but the paper is selective, lacks a systematic methodology, and relies primarily on informal demonstrations and secondary sources. Claims about 'comprehensive' coverage of the field are not supported by the paper's methodology."
    341     }
    342   ],
    343   "cited_papers": [
    344     {
    345       "title": "Attention is All you Need",
    346       "authors": [
    347         "Vaswani, A.",
    348         "et al."
    349       ],
    350       "year": 2017,
    351       "relevance": "Foundational transformer architecture paper underlying most modern LLMs and generative AI systems evaluated in this survey."
    352     },
    353     {
    354       "title": "Language Models are Few-Shot Learners",
    355       "authors": [
    356         "Brown, T.",
    357         "et al."
    358       ],
    359       "year": 2020,
    360       "relevance": "Introduces GPT-3, a foundational LLM whose capabilities and code generation performance are surveyed in this paper."
    361     },
    362     {
    363       "title": "GPT-4 Technical Report",
    364       "authors": [
    365         "OpenAI"
    366       ],
    367       "year": 2023,
    368       "relevance": "Describes GPT-4 capabilities including benchmark performance that the survey paper cites for claims about LLM state-of-the-art."
    369     },
    370     {
    371       "title": "Evaluating Large Language Models Trained on Code",
    372       "authors": [
    373         "Chen, M.",
    374         "et al."
    375       ],
    376       "year": 2021,
    377       "relevance": "Introduces Codex and HumanEval benchmark for code generation, directly relevant to the survey's coverage of code generation tools."
    378     },
    379     {
    380       "title": "A Complete Survey on Generative AI (AIGC): Is ChatGPT from GPT-4 to GPT-5 All You Need?",
    381       "authors": [
    382         "Zhang, C.",
    383         "et al."
    384       ],
    385       "year": 2023,
    386       "relevance": "Competing survey of generative AI that the authors cite multiple times; directly in scope as another review paper in the same space."
    387     },
    388     {
    389       "title": "Generative Adversarial Networks",
    390       "authors": [
    391         "Goodfellow, I. J.",
    392         "et al."
    393       ],
    394       "year": 2014,
    395       "relevance": "Original GAN paper whose architecture and training methodology is central to the survey's coverage of generative models."
    396     },
    397     {
    398       "title": "Improving Language Understanding by Generative Pre-Training",
    399       "authors": [
    400         "Radford, A.",
    401         "Narasimhan, K.",
    402         "Salimans, T.",
    403         "Sutskever, I."
    404       ],
    405       "year": 2018,
    406       "relevance": "Original GPT paper introducing the pretraining-finetuning paradigm for language models."
    407     },
    408     {
    409       "title": "Make-A-Video: Text-to-Video Generation without Text-Video Data",
    410       "authors": [
    411         "Singer, U.",
    412         "et al."
    413       ],
    414       "year": 2022,
    415       "relevance": "Describes a text-to-video generation system surveyed in the paper's coverage of video generation tools."
    416     },
    417     {
    418       "title": "Competition-level code generation with AlphaCode",
    419       "authors": [
    420         "Li, Y.",
    421         "et al."
    422       ],
    423       "year": 2022,
    424       "relevance": "Evaluates code generation at competition level, relevant to the paper's coverage of AI code generation capabilities."
    425     },
    426     {
    427       "title": "Capabilities of GPT-4 on Medical Challenge Problems",
    428       "authors": [
    429         "Nori, H.",
    430         "King, N.",
    431         "McKinney, S. M.",
    432         "Carignan, D.",
    433         "Horvitz, E."
    434       ],
    435       "year": 2023,
    436       "relevance": "Evaluates GPT-4 on medical benchmarks, cited in the survey's discussion of LLM performance in professional domains."
    437     },
    438     {
    439       "title": "ChatGPT: A Meta-Analysis after 2.5 Months",
    440       "authors": [
    441         "Leiter, C.",
    442         "et al."
    443       ],
    444       "year": 2023,
    445       "relevance": "Survey/meta-analysis of ChatGPT capabilities and limitations, directly in scope for this research survey."
    446     },
    447     {
    448       "title": "Hierarchical Text-Conditional Image Generation with CLIP Latents",
    449       "authors": [
    450         "Ramesh, A.",
    451         "Dhariwal, P.",
    452         "Nichol, A.",
    453         "Chu, C.",
    454         "Chen, M."
    455       ],
    456       "year": 2022,
    457       "relevance": "Describes DALL-E 2 text-to-image generation system surveyed in the paper's coverage of image generation tools."
    458     }
    459   ]
    460 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs