scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23068B)
      1 {
      2   "paper": {
      3     "title": "Peeping at creAItivity through a keyhole: creative self-perceptions, potential, and enhancement of GenAI chatbots",
      4     "authors": ["Dimitris Grammenos", "Todd Lubart"],
      5     "year": 2025,
      6     "venue": "Artificial Intelligence Review",
      7     "doi": "10.1007/s10462-025-11288-6"
      8   },
      9   "scan_version": 2,
     10   "active_modules": [],
     11   "checklist": {
     12     "artifacts": {
     13       "code_released": {
     14         "applies": true,
     15         "answer": false,
     16         "justification": "No code or scripts are released. The paper provides data on OSF but no analysis code."
     17       },
     18       "data_released": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "All data including the 600 stories are available on OSF: https://osf.io/kmfsq/."
     22       },
     23       "environment_specified": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "The paper mentions using Jamovi (2.3.28) for statistical analysis but provides no environment specification for reproducing the analysis."
     27       },
     28       "reproduction_instructions": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No step-by-step reproduction instructions are provided. The method section describes the procedure but not in a reproducible script or README format."
     32       }
     33     },
     34     "statistical_methodology": {
     35       "confidence_intervals_or_error_bars": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Standard deviations are reported in tables but no confidence intervals or error bars are provided for the main comparisons."
     39       },
     40       "significance_tests": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "Kruskal-Wallis tests and Dwass-Steel-Critchlow-Fligner post hoc pairwise comparisons are used. E.g., χ2(2) = 405.11, p < .001 for CSC scores."
     44       },
     45       "effect_sizes_reported": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Statistical tests report chi-squared values and p-values but no standardized effect sizes (Cohen's d, eta-squared, etc.) are reported."
     49       },
     50       "sample_size_justified": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "200 stories per chatbot (100 per prompting approach) were collected. The limitations section notes that 5-10x more would be better, but no power analysis or formal justification is given."
     54       },
     55       "variance_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Standard deviations are consistently reported alongside means in Tables 2, 4, 5, 6, 7, and 15."
     59       }
     60     },
     61     "evaluation_design": {
     62       "baselines_included": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Nine benchmark stories (ST1-ST9) serve as baselines/controls, including dictionary definitions, lorem ipsum, gibberish, and a Kafka story. Human expert scores serve as the baseline for evaluating chatbot scoring ability."
     66       },
     67       "baselines_contemporary": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The three chatbots tested (GPT-4o, Claude 3.5 Sonnet, Gemini 1.5 Flash) were contemporary models at the time of data collection (Oct-Nov 2024)."
     71       },
     72       "ablation_study": {
     73         "applies": false,
     74         "answer": false,
     75         "justification": "This is an exploratory study of chatbot creativity, not a system with components to ablate."
     76       },
     77       "multiple_metrics": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Multiple metrics are used: SSCS questionnaire scores (CSE, CPI, CSC), EPoC creativity scores (7-point scale), inter-rater reliability (Cronbach's alpha, McDonald's omega), story length, correlation analyses, and qualitative content analysis."
     81       },
     82       "human_evaluation": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Two PhD creativity researchers scored 10% of stories (inter-rater α = 0.87), one expert scored the remainder, and three professional writers scored selected stories (inter-rater α = 0.91)."
     86       },
     87       "held_out_test_set": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "This is not a machine learning evaluation with train/test splits."
     91       },
     92       "per_category_breakdown": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Results are broken down per chatbot, per prompting approach (A1 vs A2), per phase, and with detailed content analysis of story elements (plot, character, setting, keyhole location, etc.)."
     96       },
     97       "failure_cases_discussed": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Section 5.3.7 discusses logical errors, paradoxes and nonsense in narratives (ChatGPT: 44, Claude: 13, Gemini: 13). Section 5.2.2 discusses the chatbots' failure to detect scrambled/nonsense benchmark stories."
    101       },
    102       "negative_results_reported": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Several negative results are reported: Gemini's dramatic performance drop in PH-A2, chatbots' inability to detect scrambled text (ST6-ST8), ChatGPT's and Gemini's failure to recognize nonsense when hinted, and the general finding that chatbot stories are not particularly original despite high self-scores."
    106       }
    107     },
    108     "claims_and_evidence": {
    109       "abstract_claims_supported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The abstract's claims about creative self-perceptions, creative potential, self-assessment ability, and improvement via DA and humor are all supported by results in the corresponding phases."
    113       },
    114       "causal_claims_justified": {
    115         "applies": true,
    116         "answer": false,
    117         "justification": "The paper makes causal claims about Dynamic Assessment improving creative outcomes (Observation 18) but the intervention was applied only once per chatbot on the worst-scoring story, without controls or repeated measures. The design is inadequate for causal inference."
    118       },
    119       "generalization_bounded": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The paper explicitly frames findings as exploratory (Section 3) and the limitations section (Section 6) carefully bounds generalization, noting the small number of chatbots, single account/country, single test instrument, and limited DA/humor trials."
    123       },
    124       "alternative_explanations_discussed": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The discussion considers alternative explanations: that emergent creative personalities may be 'nothing more than a mirage' (citing Schaeffer et al. 2023), that the SSCS questionnaire presupposes a 'self', that training data overlap could explain results, and that Gemini's poor performance may be due to using the free version."
    128       },
    129       "proxy_outcome_distinction": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper extensively discusses the distinction between measuring creative products/potential versus actual creativity, citing Runco's Standard Definition update and Aru's argument that internal processes differ fundamentally. The discussion section explicitly states that high EPoC scores may not reflect genuine creativity."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Specific model versions are stated: ChatGPT (GPT-4o – paid), Claude (3.5 Sonnet – paid), Gemini (1.5 Flash – free). These are marketing names with specific version identifiers."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The prompts used in all phases are provided in the paper: the 3-step instruction set for PH-A, the scoring prompts for PH-B, and the full Dynamic Assessment prompts (Steps 1-3) in Section 4.4."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "No temperature, top-p, or other generation parameters are reported for any of the chatbot interactions."
    150       },
    151       "scaffolding_described": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No agentic scaffolding is used; chatbots are prompted directly through their standard interfaces."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "The data collection procedure is well-documented: prompting in random order, new sessions each time, batches of 5-10, various time periods over different days, training opt-out settings described."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 6 'Limitations' provides a dedicated, substantive discussion of 9 specific limitations."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "The 9 limitations are specific to this study: single account/country bias, use of only one self-report questionnaire, 90% of stories scored by one researcher, DA applied only once per chatbot, minimal Turing test sample, etc."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The paper explicitly states what it does not show: only commercial closed-source LLMs tested, no human comparison responses included, single creativity test used, single language (English), DA/humor results not generalizable from minimal samples."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "All 600 stories and analysis data are available on OSF: https://osf.io/kmfsq/."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 4 describes data collection in detail: dates (Oct 22 - Nov 5, 2024), random order, new sessions, batches of 5-10, training opt-out, exact instruments used (SSCS, EPoC)."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "The paper mentions 2 PhD creativity researchers and 3 story writing experts but does not describe how they were recruited or selected, nor their relationship to the authors."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The four-phase pipeline is well-documented with Figure 1 providing an overview. Each phase describes inputs, outputs, and the process flow from data collection through analysis."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The paper states: 'No funding was received for conducting this study or to assist with the preparation of this manuscript.'"
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Author affiliations are listed: FORTH (Greece) and Université Paris Cité / Univ Gustave Eiffel (France). Neither is affiliated with the chatbot companies being evaluated."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": false,
    214         "answer": false,
    215         "justification": "The study is unfunded, so funder independence is not applicable."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "The paper states: 'The authors declare no competing interests.'"
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": false,
    226         "answer": false,
    227         "justification": "This study tests chatbots on a creative writing task, not on a benchmark that could be in training data. The task requires generating original narratives, not recalling known answers."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": false,
    231         "answer": false,
    232         "justification": "Not a benchmark evaluation; the chatbots generate original creative content rather than being tested on pre-existing tasks with known answers."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "Not a benchmark evaluation. The EPoC task requires generating a novel story, not answering questions with known correct answers."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No pre-registration is mentioned despite involving human expert evaluators."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "No IRB or ethics approval is mentioned, despite involving human expert evaluators and story writing experts."
    250       },
    251       "demographics_reported": {
    252         "applies": true,
    253         "answer": false,
    254         "justification": "Minimal demographics: 2 PhD creativity researchers (both male), 3 story writing experts (1 female, 2 male) who have published books and teach creative writing. No age, experience level, or other demographics."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": true,
    258         "answer": false,
    259         "justification": "No inclusion/exclusion criteria stated for the human evaluators. They are described only as 'familiar with creativity rating' or having 'published several books.'"
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "Not an experimental study with conditions for human participants; the humans serve as evaluators, not experimental subjects."
    265       },
    266       "blinding_described": {
    267         "applies": true,
    268         "answer": true,
    269         "justification": "In Phase B, chatbots scored stories 'blindly' (not knowing which chatbot generated them). The writing experts were 'not informed that they were written by AI' until after scoring, then asked to guess human vs. AI origin."
    270       },
    271       "attrition_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "The human evaluators completed fixed tasks; there is no attrition to report for expert raters scoring assigned stories."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": false,
    281         "justification": "No API costs, token counts, or inference times are reported despite using paid subscriptions for ChatGPT and Claude and generating 600+ stories plus extensive evaluations."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "No total compute budget or API spend is stated."
    287       }
    288     }
    289   },
    290   "claims": [
    291     {
    292       "claim": "ChatGPT and Claude demonstrate high creative self-efficacy and creative personal identity, while Gemini scores significantly lower on the SSCS questionnaire.",
    293       "evidence": "Table 2 shows CSC scores: ChatGPT 47.6-49.4, Claude 47.3-50.7, Gemini 37.7-39.3. Kruskal-Wallis test: χ2(2) = 405.11, p < .001. Post hoc DSCF tests show significant pairwise differences (Section 5.1.1).",
    294       "supported": "strong"
    295     },
    296     {
    297       "claim": "Claude's stories were unanimously judged as the overall best by chatbots and human experts.",
    298       "evidence": "Table 6, R6: Claude mean 6.03 (SD 0.43), ChatGPT 5.76 (SD 0.35), Gemini 4.25 (SD 1.56). Human expert (R9): Claude 6.13, ChatGPT 5.81, Gemini 4.11. Inter-rater reliability between chatbots α = 0.97 (Section 5.2.1).",
    299       "supported": "strong"
    300     },
    301     {
    302       "claim": "Chatbots cannot distinguish semantically coherent text from word-scrambled nonsense if the 'right words' are present.",
    303       "evidence": "Section 5.2.2: Stories ST6-ST8 (randomly reordered words) received scores of 6.33, 4.56, and 2.78 from chatbots. Only Claude detected the issue when given a vague hint; ChatGPT refused to consider it a problem; Gemini never understood.",
    304       "supported": "strong"
    305     },
    306     {
    307       "claim": "The Dynamic Assessment approach can considerably improve chatbots' creative outcomes.",
    308       "evidence": "Table 15: ChatGPT's revised story went from mean 4.70 to 7.00 (+49%), Claude from 4.80 to 6.50 (+35%), Gemini from 1.50 to 5.40 (+260%). However, this was tested only once per chatbot on the single worst-scoring story (Section 5.4).",
    309       "supported": "weak"
    310     },
    311     {
    312       "claim": "Each chatbot has a distinctive, highly consistent 'writing identity' that can be reliably identified.",
    313       "evidence": "Section 5.3.12: Claude identified all 12 story authors correctly (100%), ChatGPT missed 1 (92%), Gemini correctly classified 7/12 (58%). Extensive content analysis in Section 5.3 documents distinctive patterns in names, settings, openings, etc.",
    314       "supported": "moderate"
    315     },
    316     {
    317       "claim": "The two prompting approaches (all-in-one vs step-by-step) did not significantly affect ChatGPT but positively affected Claude and negatively affected Gemini.",
    318       "evidence": "Section 5.1.2/Table 4: Kruskal-Wallis test found no significant difference between approaches (χ2(2) = 0.00, p = .997 for CSC; χ2(2) = 0.08, p = .781 for word count). Claude's mean story length increased 33% and score improved 4% in A2; Gemini's score dropped 37% and length 60%.",
    319       "supported": "moderate"
    320     }
    321   ],
    322   "methodology_tags": ["qualitative", "observational"],
    323   "key_findings": "ChatGPT and Claude demonstrate high creative self-perceptions and produce stories rated highly by both AI and human evaluators, while Gemini scores lower on all measures. All chatbots have distinctive, consistent 'writing identities' but recycle a limited repertoire of story elements, producing homogeneous outputs within each chatbot. Critically, chatbots cannot distinguish semantically coherent text from word-scrambled nonsense containing the 'right words.' Dynamic Assessment and humor prompting can improve creative output quality, but this finding is based on minimal samples (one trial per chatbot).",
    324   "red_flags": [
    325     {
    326       "flag": "Minimal sample for DA and humor claims",
    327       "detail": "The Dynamic Assessment intervention was applied only once per chatbot on the single worst-scoring story. The humor experiment generated one story per chatbot. The paper acknowledges this but still frames these as key findings (Observations 18-20) despite the sample being far too small for any generalizable conclusion."
    328     },
    329     {
    330       "flag": "Paid vs free model comparison confound",
    331       "detail": "The paper concludes that paid models outperform free ones (Section 8), but the comparison is between GPT-4o (paid), Claude 3.5 Sonnet (paid), and Gemini 1.5 Flash (free). Flash is a smaller, weaker model regardless of pricing — the comparison conflates model capability with subscription tier."
    332     },
    333     {
    334       "flag": "No hyperparameter reporting",
    335       "detail": "Temperature and sampling parameters are not reported for any chatbot. Since creativity tasks are highly sensitive to temperature settings, this is a significant omission that undermines reproducibility."
    336     },
    337     {
    338       "flag": "Single-rater bias for 90% of stories",
    339       "detail": "Only 10% of stories were independently scored by two researchers. The remaining 540 stories were scored by a single evaluator. The paper acknowledges this in limitations but the inter-rater reliability estimate may not generalize to the full corpus."
    340     }
    341   ],
    342   "cited_papers": [
    343     {
    344       "title": "Homogenization effects of large language models on human creative ideation",
    345       "authors": ["B. R. Anderson", "J. H. Shah", "M. Kreminski"],
    346       "year": 2024,
    347       "doi": "10.1145/3635636.3656204",
    348       "relevance": "Studies how LLMs create homogeneity in creative ideation, relevant to understanding AI's impact on creative output diversity."
    349     },
    350     {
    351       "title": "GenAI creativity in narrative tasks: exploring new forms of creativity",
    352       "authors": ["F. Vinchon", "V. Gironnay", "T. I. Lubart"],
    353       "year": 2024,
    354       "doi": "10.3390/jintelligence12120125",
    355       "relevance": "Direct predecessor using EPoC to benchmark ChatGPT's creative potential, providing comparison data for this study."
    356     },
    357     {
    358       "title": "The current state of artificial intelligence generative language models is more creative than humans on divergent thinking tasks",
    359       "authors": ["K. F. Hubert", "N. Kim", "D. L. Zabelina"],
    360       "year": 2024,
    361       "doi": "10.1038/s41598-024-53303-w",
    362       "relevance": "Benchmark comparison of GPT-4 vs humans on divergent thinking tests (AUT, CT, DAT)."
    363     },
    364     {
    365       "title": "We're different, we're the same: creative homogeneity across LLMs",
    366       "authors": ["E. Wenger", "Y. N. Kenett"],
    367       "year": 2025,
    368       "relevance": "Studies creative homogeneity across 22 LLMs using divergent thinking tests, finding LLMs have less response variability than humans."
    369     },
    370     {
    371       "title": "Are emergent abilities of large language models a mirage?",
    372       "authors": ["R. Schaeffer", "B. Miranda", "O. Koyejo"],
    373       "year": 2023,
    374       "relevance": "Questions whether emergent abilities in LLMs are genuine or measurement artifacts, cited to contextualize claims about emergent creative personalities."
    375     },
    376     {
    377       "title": "Best humans still outperform artificial intelligence in a creative divergent thinking task",
    378       "authors": ["M. Koivisto", "S. Grassini"],
    379       "year": 2023,
    380       "doi": "10.1038/s41598-023-40858-3",
    381       "relevance": "Compares ChatGPT to humans on AUT, finding AI outperforms on average but best humans still exceed AI."
    382     },
    383     {
    384       "title": "Artificial muses: generative artificial intelligence chatbots have risen to human-level creativity",
    385       "authors": ["J. Haase", "P. H. P. Hanel"],
    386       "year": 2023,
    387       "doi": "10.1016/j.yjoc.2023.100066",
    388       "relevance": "Compares creativity of six GenAI chatbots to humans using AUT, finding no qualitative difference."
    389     },
    390     {
    391       "title": "The originality of machines: AI takes the Torrance test",
    392       "authors": ["E. E. Guzik", "C. Byrge", "C. Gilde"],
    393       "year": 2023,
    394       "doi": "10.1016/j.yjoc.2023.100065",
    395       "relevance": "Evaluates GPT-4 on TTCT verbal test, finding top 1% originality and fluency scores."
    396     },
    397     {
    398       "title": "The crowdless future? How generative AI is shaping the future of human crowdsourcing",
    399       "authors": ["L. Boussioux", "J. N. Lane", "M. Zhang", "V. Jacimovic", "K. R. Lakhani"],
    400       "year": 2023,
    401       "relevance": "Studies human-AI collaboration for business idea generation, finding AI collaboration improves strategic viability."
    402     }
    403   ]
    404 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs