scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25275B)
      1 {
      2   "paper": {
      3     "title": "Artificial Intelligence for Health Message Generation: Theory, Method, and an Empirical Study Using Prompt Engineering",
      4     "authors": ["Sue Lim", "Ralf Schmälzle"],
      5     "year": 2022,
      6     "venue": "Department of Communication, Michigan State University",
      7     "doi": ""
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL, GitHub link, or code archive is provided in the paper. The authors mention using Python, Google Colab, and various packages but do not release any code."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The scraped tweets, AI-generated messages, and survey responses are not released. No dataset download link or data repository is provided."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions using Python, Google Colab, and packages like spacy, textacy, vader, sentence-transformers, snscrape, and the transformers package, but does not provide a requirements.txt, Dockerfile, or detailed version information for these dependencies."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "While the paper describes the general methodology (prompt engineering steps, computational analyses, survey design), it does not provide step-by-step reproduction instructions, scripts, or a README that would allow a researcher to replicate the experiments without significant guesswork."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Tables 1 and 2 report means and standard deviations but do not include confidence intervals or error bars. Figure 3 shows dot plots but no error bars or CI notation."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "T-tests are used to compare AI-generated vs. human-generated messages for reading ease, sentiment (Table 1), clarity, and quality (Table 2), with p-values reported (e.g., t_clarity = 4.32, p < .01; t_quality = 5.39, p < .01)."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The power analysis specifies effect size d = .3. The results tables provide means and standard deviations with baseline context (e.g., AI clarity 3.77 vs. human 3.22 on a 1-5 scale), providing enough information to compute and understand the magnitude of differences."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports a power analysis: 'We also conducted a power analysis using the pwr package in R for a one-sample and one-sided t-test, with effect size d = .3 and significance level α = .05. Sample size of 100 was enough to detect significance at the power level of .9.' They also cite Kim & Cappella (2019) for sample sufficiency."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Standard deviations are reported alongside means in Tables 1 and 2 (e.g., Clarity: AI 3.77 (0.55), Human 3.22 (0.43); Flesch: AI 63.4 (17.9), Human 68.4 (20.3))."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The study compares AI-generated messages to retweeted human-generated messages as a baseline, which serves as the comparison standard throughout both computational and human evaluation analyses."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The only comparison is against human-generated tweets. There is no comparison against other AI text generation systems (e.g., GPT-3 which was available at the time and mentioned in the related work). The prior study by Schmälzle and Wilcox (2022) using GPT-2 with fine-tuning is discussed but not directly compared."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The paper uses six different prompts but does not systematically ablate components (e.g., comparing prompting strategies, temperature settings, or model sizes). The prompt variations are described qualitatively but not evaluated in a controlled ablation framework."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The study uses multiple metrics: Flesch Reading Ease, VADER sentiment scores, n-gram analysis, topic modeling, semantic similarity (cosine similarity of sentence embeddings), and human ratings of clarity and quality."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "A human evaluation study with N=120 participants rated AI-generated and human-generated messages on quality and clarity using 5-point Likert scales (Table 2 and Figure 3)."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is not a machine learning benchmark study with train/test splits. The evaluation is a direct comparison of generated messages vs. human messages, so held-out test sets are not structurally applicable."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by gender (male vs. female) in Figure 3, and by prompt type in the qualitative analysis. The semantic similarity analysis shows within-group and between-group comparisons."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses failure cases during prompt engineering: 'too many of the same messages without much informative content, too many names of official organizations or countries, or clearly false information.' Messages with exclusion criteria (false information, non-US references, unverifiable sources, recipes, non-English characters, repetitive phrases) are described."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that AI-generated messages were more similar to each other than human messages (m_similarity(AI) = .59 vs. m_similarity(human) = .43, p < .001), which is presented as a limitation of the AI approach. The non-significant results for reading ease and sentiment are also reported (Table 1)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims that: (1) AI-generated messages were on par with human ones in sentiment, reading ease, and semantic content — supported by Table 1 and Figure 2; (2) human evaluation showed AI messages ranked higher in quality and clarity — supported by Table 2 with significant t-test results."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal-adjacent claims such as 'the Bloom message engine has the potential to generate informative awareness messages that surpass the retweeted messages' and discusses prompt engineering as a way to 'strategically control qualities and tone.' However, the study design does not adequately control for confounds — e.g., AI messages were curated and cleaned while human messages were scraped from Twitter, introducing different production processes. The comparison is observational despite the causal framing."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "While limitations mention testing only one topic (folic acid) and one message type (awareness), the title and discussion make broad claims about 'AI for Health Message Generation' generally. The abstract does not bound claims to folic acid or the Bloom model. The discussion states 'we anticipate that this work will find many applications in health communication in the near future' without qualifying these claims."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not substantively discuss alternative explanations for why AI messages were rated higher. For example, the AI messages were curated through a multi-step filtering process while human messages were simply the top retweeted tweets — the curation process itself (not AI quality) could explain the difference. The higher internal semantic similarity of AI messages is noted but not explored as a potential confound."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper specifies 'Bloom 7B1' with '7 billion neural network parameters' and references Bigscience (2022). The specific model variant (bloom-7b1) is identified, and the Huggingface link is provided in the references."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "All six prompts are explicitly listed: 'It's National Folic Acid Awareness week', 'Every woman needs #folicacid every day,', 'Did you know, #Folicacid', 'Consuming folic acid is important because', 'Consuming folic acid:', and 'Folic acid during pregnancy'."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Hyperparameters are reported: maximum result length = 60 tokens, do_sample=True, temperature = 0.7, top_k=40, top_p=0.9. These are described in Step 3 of the Message Generation Protocol."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The system is a straightforward prompt-to-generation pipeline with no tool use, retry logic, or feedback mechanisms."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The data preprocessing pipeline is documented with counts: 42,646 raw tweets scraped, top 50 selected by retweet count, filtered for duplicates/non-English/promotional content yielding 28, plus 2 more for 30 total. AI message selection: 600 generated, 10 randomly selected per prompt (60 total), exclusion criteria applied, final 30 selected."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "There is a section titled 'Ethical Considerations and Limitations', approximately two paragraphs long, that discusses multiple limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The limitations section discusses specific threats: testing only one topic (folic acid), focusing only on awareness messaging (not attitude/behavior change), generating only short messages, limited computational metrics, and the narrow participant sample (college students) with acknowledgment that it overlaps with but does not fully represent the target population."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper explicitly bounds scope: 'this empirical research paper focuses only on the proximal goal of raising awareness' (footnote 1), tested only folic acid, only short social-media-style messages, and notes 'it will be necessary to expand these findings to other domains.' The footnotes repeatedly clarify what the paper does NOT claim."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Raw data (scraped tweets, generated messages, survey responses) are not available for independent verification. No data repository or supplementary data files are provided."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Data collection is described in detail: tweets scraped using snscrape with hashtags #folicacid and #folate from Twitter (no date constraints), yielding 42,646 raw messages. AI messages generated via Bloom with specified parameters. Survey conducted via Qualtrics with course credit compensation."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The paper states 'Participants were recruited from a study pool and received course credits as compensation for the study.' The sample is characterized: N=120, 70% female (n=84), college students. The connection to the target population (potential parents, women of childbearing age) is discussed."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The full pipeline is documented across Steps 1-4 of the Method section, with counts at each stage: 42,646 tweets → top 50 → 28 after filtering → 30 final human messages. For AI: 600 generated → 60 randomly selected → exclusion criteria applied → 30 final. Survey: responses from unrealistically fast completions (<4 min) or incomplete surveys discarded, yielding N=120."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding source or acknowledgments section is present in the paper. There is no mention of grants, sponsors, or funding agencies."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly stated: 'Department of Communication, Michigan State University, East Lansing, USA' for both authors. Since they are evaluating an open-source model (Bloom) and not their own product, there is no inherent vendor conflict."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of funding disclosure does not confirm unfunded status."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper notes Bloom was trained on '1.5 TB of pre-processed text from 45 natural and 12 programming languages' but does not state the training data cutoff date. This is relevant because the folic acid tweets used for comparison could have been in Bloom's training data."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether the retweeted folic acid tweets or similar health messages could have appeared in Bloom's training data. Since Bloom was trained on internet text, the comparison tweets may have been in the training corpus, which would affect the validity of the comparison."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper does not address whether the health messages or related folic acid content were part of Bloom's training data. Since Bloom was trained on Wikipedia and other web text, folic acid information was almost certainly in the training data, which could explain the model's ability to generate accurate health content."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No pre-registration link (OSF, AsPredicted, or similar) is mentioned. The study involves N=120 human participants evaluating messages."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": true,
    241         "answer": true,
    242         "justification": "The paper states the study 'was approved by the local review board' in the Participants section."
    243       },
    244       "demographics_reported": {
    245         "applies": true,
    246         "answer": true,
    247         "justification": "Basic demographics are reported: N=120, 70% female (n=84), recruited from a university study pool (college students). Gender breakdown of results is shown in Figure 3."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": true,
    251         "answer": true,
    252         "justification": "Exclusion criteria are stated: 'Responses from participants who completed the survey at an unrealistically fast speed (<4 minutes) or failed to complete the survey were discarded.'"
    253       },
    254       "randomization_described": {
    255         "applies": true,
    256         "answer": true,
    257         "justification": "Randomization is described: 'Message order was randomized within each block and approximately half of the sample started with the first question, the other half started with the second block.'"
    258       },
    259       "blinding_described": {
    260         "applies": true,
    261         "answer": true,
    262         "justification": "The paper states 'participants were not told in advance which of the messages were AI-generated or human generated; they were only told that the purpose of the study was to evaluate health messages related to folic acid.'"
    263       },
    264       "attrition_reported": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "The paper mentions that fast or incomplete responses were discarded to yield N=120, but does not report how many participants started the study, how many were excluded, or specific dropout numbers. The initial N before exclusions is not stated."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "The paper reports practical time costs: 'Loading the Bloom 7B1 model into Google Colab and generating 600 messages took a little over an hour.' The feasibility section discusses time requirements for the overall pipeline."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "While Google Colab is mentioned as the platform and generation time is reported (~1 hour for 600 messages), the specific GPU type, GPU hours, or total computational budget is not stated."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "The Bloom message engine is easy to use and can generate messages via prompting without fine-tuning.",
    286       "evidence": "RQ1 results in the Feasibility section: the system required only Python coding with available online resources, and generating 600 messages took about an hour on Google Colab.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "AI-generated messages are on par with human-generated messages in terms of sentiment, reading ease, and semantic content.",
    291       "evidence": "Table 1: No significant differences in Flesch Reading Ease (AI 63.4 vs. Human 68.4, p=.32) or VADER sentiment (AI .25 vs. Human .23, p=.87). Semantic similarity analysis and topic modeling in Figure 2.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "AI-generated messages were rated significantly higher than human-generated messages in clarity and quality by human evaluators.",
    296       "evidence": "Table 2: t-test results for clarity (t=4.32, p<.01) and quality (t=5.39, p<.01) with N=120 participants. AI clarity 3.77 vs. human 3.22; AI quality 3.65 vs. human 3.12.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Prompt engineering can be used to strategically control the qualities and tone of AI-generated messages.",
    301       "evidence": "Qualitative observations in the RQ2 results: prompts with #folicacid generated tweet-like content with hashtags, while prompts without hashtags generated more formal, factual content.",
    302       "supported": "weak"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval", "case-study"],
    306   "key_findings": "Using the Bloom 7B1 language model with prompt engineering, AI-generated folic acid awareness messages were comparable to retweeted human messages on computational metrics (reading ease, sentiment, semantic similarity) and were rated significantly higher by 120 human evaluators on clarity and quality. The system generated 600 messages in about an hour without requiring model fine-tuning. AI-generated messages showed higher internal similarity than human messages, suggesting less diversity in AI output.",
    307   "red_flags": [
    308     {
    309       "flag": "Unfair comparison standard",
    310       "detail": "AI-generated messages underwent a multi-step curation process (generating 600, randomly selecting 60, applying exclusion criteria, cleaning) while human messages were simply the most-retweeted tweets (top 50 by retweet count, reduced to a final 30 with basic filtering). The AI messages were effectively cherry-picked from a larger pool while human messages were not similarly curated. This asymmetric curation could explain the quality difference rather than AI superiority."
    311     },
    312     {
    313       "flag": "No contamination analysis",
    314       "detail": "Bloom was trained on Wikipedia, Semantic Scholar, and other web text which almost certainly included folic acid health information. The model's ability to generate accurate health messages may reflect memorized training data rather than genuine generation capability, but this is never discussed."
    315     },
    316     {
    317       "flag": "Small and narrow message sample",
    318       "detail": "Only 30 AI messages and 30 human messages were compared. With such a small sample of messages (not participants), individual message quality could drive the results. The single health topic (folic acid) limits generalizability despite broad claims about 'AI for Health Message Generation.'"
    319     },
    320     {
    321       "flag": "No alternative explanations for quality difference",
    322       "detail": "The paper does not consider that the curation process, the formal/informative style of Bloom outputs vs. casual Twitter style, or the removal of false/problematic content from AI messages could explain the higher human ratings, rather than intrinsic AI quality."
    323     }
    324   ],
    325   "cited_papers": [
    326     {
    327       "title": "Pre-train, prompt, and predict: A systematic survey of prompting methods in natural language processing",
    328       "authors": ["P. Liu", "W. Yuan", "J. Fu", "Z. Jiang", "H. Hayashi", "G. Neubig"],
    329       "year": 2021,
    330       "relevance": "Systematic survey of prompting methods for LLMs, directly relevant to understanding prompt engineering techniques used in AI systems."
    331     },
    332     {
    333       "title": "All the news that's fit to fabricate: AI-generated text as a tool of media misinformation",
    334       "authors": ["S. E. Kreps", "M. McCain", "M. Brundage"],
    335       "year": 2020,
    336       "relevance": "Examines AI-generated text for misinformation, relevant to safety and adversarial use of LLM text generation."
    337     },
    338     {
    339       "title": "Release strategies and the social impacts of language models",
    340       "authors": ["I. Solaiman", "M. Brundage", "J. Clark"],
    341       "year": 2019,
    342       "relevance": "Discusses social impacts and release strategies for language models, relevant to AI safety and responsible deployment."
    343     },
    344     {
    345       "title": "CTRL: A conditional transformer language model for controllable generation",
    346       "authors": ["N. S. Keskar", "B. McCann", "L. R. Varshney", "C. Xiong", "R. Socher"],
    347       "year": 2019,
    348       "relevance": "Presents controllable text generation with transformer models, relevant to understanding AI code/text generation capabilities."
    349     },
    350     {
    351       "title": "Transformers: State-of-the-Art natural language processing",
    352       "authors": ["T. Wolf", "L. Debut", "V. Sanh"],
    353       "year": 2020,
    354       "relevance": "The HuggingFace Transformers library used as infrastructure for LLM-based research and applications."
    355     },
    356     {
    357       "title": "Climbing towards NLU: On meaning, form, and understanding in the age of data",
    358       "authors": ["E. M. Bender", "A. Koller"],
    359       "year": 2020,
    360       "relevance": "Critical examination of what LLMs actually learn about language, relevant to understanding LLM capabilities and limitations."
    361     },
    362     {
    363       "title": "The curious case of neural text degeneration",
    364       "authors": ["A. Holtzman", "J. Buys", "L. Du", "M. Forbes", "Y. Choi"],
    365       "year": 2019,
    366       "relevance": "Addresses text degeneration in neural language models and proposes nucleus sampling, relevant to LLM output quality."
    367     },
    368     {
    369       "title": "Behavioral use licensing for responsible AI",
    370       "authors": ["D. Contractor", "D. McDuff", "J. Haines"],
    371       "year": 2020,
    372       "relevance": "Proposes responsible AI licensing framework (RAIL), relevant to AI safety and governance of LLM deployment."
    373     }
    374   ]
    375 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs