scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22137B)
      1 {
      2   "paper": {
      3     "title": "Can AI Serve as a Substitute for Human Subjects in Software Engineering Research?",
      4     "authors": [
      5       "Marco Gerosa",
      6       "Bianca Trinkenreich",
      7       "Igor Steinmacher",
      8       "Anita Sarma"
      9     ],
     10     "year": 2023,
     11     "venue": "arXiv preprint",
     12     "arxiv_id": "2311.11081"
     13   },
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No source code or repository is released. The paper provides links to shared ChatGPT conversations (chat.openai.com/share/...) as examples, but no analysis code or scripts."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No dataset is released. The paper includes illustrative ChatGPT conversation links and a comparison table (Table 1) with survey data, but the underlying data from the referenced survey [14] is not released as part of this paper."
     25       },
     26       "environment_specified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "This is a vision/position paper with no computational experiments requiring environment specification."
     30       },
     31       "reproduction_instructions": {
     32         "applies": false,
     33         "answer": false,
     34         "justification": "This is a vision paper proposing conceptual approaches. There are no experiments to reproduce beyond the illustrative ChatGPT conversations, for which shared links are provided."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "This is a vision paper. The only quantitative comparison (Table 1) shows absolute differences between LLM-generated and real survey responses, but no statistical analysis is performed on these."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No statistical experiments are conducted. The Table 1 comparison is illustrative, not a rigorous statistical evaluation."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No statistical experiments are conducted in this vision paper."
     52       },
     53       "sample_size_justified": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No empirical study with a sample is conducted. The paper is a conceptual/vision paper."
     57       },
     58       "variance_reported": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "No experimental runs are conducted. This is a vision paper with illustrative examples."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "This is a vision paper that does not propose a system or method to be evaluated against baselines. The Table 1 comparison is illustrative, not a formal evaluation."
     69       },
     70       "baselines_contemporary": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "No formal evaluation is conducted, so baselines are not applicable."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No system with components to ablate. This is a conceptual paper."
     79       },
     80       "multiple_metrics": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No formal evaluation is conducted in this vision paper."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No system outputs are produced that require human evaluation. The paper is a conceptual discussion."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "No experiments with training/test splits are conducted."
     94       },
     95       "per_category_breakdown": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "No formal evaluation is conducted that would require per-category breakdowns."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 3 (Open Problems and Research Opportunities) extensively discusses limitations and failure modes: ethical considerations, bias in AI-generated data, feedback loops in AI training, complexity in persona differentiation, and challenges with demographic representation."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper discusses negative aspects and limitations throughout: Section 3.1 on bias and fairness concerns, Section 3.2 on validity challenges, and the related work section cites Dominguez-Olmedo et al. [34] and Lee et al. [35] who found LLMs fail to represent certain subpopulations."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract makes appropriately hedged claims: 'we explore the potential,' 'could offer scalable and efficient means,' 'AI could augment aspects of data gathering.' These vision-level claims are supported by the conceptual discussion and illustrative examples in the paper."
    116       },
    117       "causal_claims_justified": {
    118         "applies": false,
    119         "answer": false,
    120         "justification": "The paper makes no causal claims. It is a vision/position paper discussing potential future applications of LLMs in qualitative research."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper is appropriately scoped to software engineering research and explicitly states it is a 'vision paper' exploring conceptual possibilities. The conclusion explicitly states: 'This paper does not propose to replace human subjects, but rather to explore the boundaries of AI's capabilities.'"
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper discusses multiple alternative perspectives: Section 4 (Related Work) cites work both supporting and questioning LLM fidelity as human proxies (e.g., Dominguez-Olmedo et al. questioning LLM survey responses, Lee et al. showing LLMs fail for some subpopulations). Section 3 discusses bias, feedback loops, and the risk that LLM outputs may reflect training data rather than genuine human perspectives."
    131       }
    132     },
    133     "setup_transparency": {
    134       "model_versions_specified": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper states the illustrative examples were 'generated using GPT-4' but provides no version or snapshot date (e.g., gpt-4-0314 vs gpt-4-0613). The acknowledgments mention 'ChatGPT v4 was used for copy editing.'"
    138       },
    139       "prompts_provided": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The actual prompts used for the illustrative examples are provided in the appendices (Appendix A for focus groups, Appendix B for surveys), and shared ChatGPT conversation links are provided for each example (Figures 1-4)."
    143       },
    144       "hyperparameters_reported": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "No hyperparameters (temperature, top-p, etc.) are reported for the GPT-4 interactions. Section 3.2 discusses 'setting the right level of randomness' as an open problem but does not report what settings were used for the illustrative examples."
    148       },
    149       "scaffolding_described": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "No agentic scaffolding is used. The illustrative examples are simple prompt-response interactions with ChatGPT."
    153       },
    154       "data_preprocessing_documented": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "This is a vision paper with no data preprocessing pipeline. The illustrative examples use raw ChatGPT outputs."
    158       }
    159     },
    160     "limitations_and_scope": {
    161       "limitations_section_present": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 3 (Open Problems and Research Opportunities) serves as an extensive limitations discussion, covering ethical considerations, bias, validity concerns, feedback loops, and persona differentiation challenges across 4 pages."
    165       },
    166       "threats_to_validity_specific": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 3 discusses specific threats: embedded biases and stereotypes in LLM training data (Section 3.1), AI 'echo chambers' from feedback loops (Section 3.2), the caveat that prior research may be in training data invalidating replication studies (Section 3.2), and difficulty of precise demographic representation (Section 3.3)."
    170       },
    171       "scope_boundaries_stated": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The paper explicitly states scope boundaries: it is a 'vision paper' (stated multiple times), 'we neither believe nor desire for AI to completely replace human subjects,' and it is scoped to 'qualitative data collection in software engineering research.' The conclusion clarifies: 'This paper does not propose to replace human subjects.'"
    175       }
    176     },
    177     "data_integrity": {
    178       "raw_data_available": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "The raw survey data from [14] used for the Table 1 comparison is not made available. The ChatGPT conversation links may expire and are not archived."
    182       },
    183       "data_collection_described": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "For the illustrative examples, the data collection is described: prompts were given to GPT-4 via ChatGPT, and the survey comparison uses demographic data from a prior study [14] of 242 OSS contributors. The prompts and contexts are shown in the paper and appendices."
    187       },
    188       "recruitment_methods_described": {
    189         "applies": false,
    190         "answer": false,
    191         "justification": "No human participants are recruited in this study. The paper discusses using AI as a substitute for human participants conceptually."
    192       },
    193       "data_pipeline_documented": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No data pipeline exists. The illustrative examples are direct ChatGPT interactions with no transformation steps."
    197       }
    198     },
    199     "conflicts_of_interest": {
    200       "funding_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Section 6 (Acknowledgments) states: 'The National Science Foundation partially supports this work under grant numbers 2236198, 2235601, 2247929, 2303043, and 2303042.'"
    204       },
    205       "affiliations_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Author affiliations are clearly listed: Northern Arizona University (Gerosa, Steinmacher) and Oregon State University (Trinkenreich, Sarma). No conflict with evaluated products."
    209       },
    210       "funder_independent_of_outcome": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The National Science Foundation is an independent government funding agency with no financial stake in whether LLMs can substitute for human subjects in SE research."
    214       },
    215       "financial_interests_declared": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No competing interests or financial interests statement is present in the paper."
    219       }
    220     },
    221     "contamination": {
    222       "training_cutoff_stated": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. It uses GPT-4 for illustrative examples of persona-based prompting, not for benchmark evaluation."
    226       },
    227       "train_test_overlap_discussed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No benchmark evaluation is conducted. However, the paper does acknowledge the contamination concern in Section 3.2: 'the caveat that prior research could have been incorporated into the models' training datasets.'"
    231       },
    232       "benchmark_contamination_addressed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No benchmark evaluation is conducted in this vision paper."
    236       }
    237     },
    238     "human_studies": {
    239       "pre_registered": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved in this study."
    243       },
    244       "irb_or_ethics_approval": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this study."
    248       },
    249       "demographics_reported": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this study."
    253       },
    254       "inclusion_exclusion_criteria": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this study."
    258       },
    259       "randomization_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this study."
    263       },
    264       "blinding_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this study."
    268       },
    269       "attrition_reported": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants are involved in this study."
    273       }
    274     },
    275     "cost_and_practicality": {
    276       "inference_cost_reported": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "This is a vision paper with no method whose cost would be meaningful to report."
    280       },
    281       "compute_budget_stated": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "This is a vision paper with no significant computation performed."
    285       }
    286     }
    287   },
    288   "claims": [
    289     {
    290       "claim": "LLM-generated survey responses exhibit an average deviation of just 4% from actual survey data across five questions about OSS contribution motivations.",
    291       "evidence": "Table 1 shows absolute differences between real survey data from [14] (242 OSS contributors) and GPT-4 generated responses across 5 Likert-scale questions with 6 response options. Deviations range from 1% to 14% per response option.",
    292       "supported": "weak"
    293     },
    294     {
    295       "claim": "Persona-based prompting can generate responses that are coherent with findings from previous research on gender differences in OSS contributions.",
    296       "evidence": "Section 2.1 compares two ChatGPT persona interviews (Figures 1-2): a male persona emphasized 'technical hurdles and complex problem-solving' while a female persona emphasized 'community collaboration, learning experiences.' Authors cite consistency with their prior work [14] showing men are more motivated by fun/technical challenges and women by reciprocity/kinship.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "AI could augment aspects of data gathering in software engineering research but cannot replace the nuanced, empathetic understanding inherent in human subjects.",
    301       "evidence": "This is the paper's central thesis, discussed throughout and in the conclusion (Section 5). It is a position statement supported by the conceptual discussion in Sections 2-3 and related work in Section 4.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Multi-persona prompting can simulate focus group dynamics with distinct voices building upon each other's contributions.",
    306       "evidence": "Section 2.2 and Appendix A present a 4-round focus group simulation generated by GPT-4 with 5 personas designing a web portal. The conversation shows personas referencing and building on each other's suggestions.",
    307       "supported": "weak"
    308     }
    309   ],
    310   "methodology_tags": [
    311     "theoretical",
    312     "qualitative"
    313   ],
    314   "key_findings": "This vision paper explores whether LLMs can stand in for human subjects in qualitative SE research, illustrating three prompting strategies: persona-based prompting for interviews, multi-persona dialogue for focus groups, and mega-persona responses for surveys. An illustrative comparison of GPT-4 survey responses with actual data from 242 OSS contributors showed an average 4% deviation across 5 questions. The paper identifies numerous open problems including bias propagation, feedback loops in AI training, demographic representation precision, and ethical concerns, concluding that an integrated approach combining AI and human data will yield the best outcomes.",
    315   "red_flags": [
    316     {
    317       "flag": "Illustrative examples presented as evidence",
    318       "detail": "The paper presents a handful of ChatGPT conversation examples and a single 5-question survey comparison as evidence for the viability of AI-generated qualitative data, but acknowledges this is not an 'exhaustive validation.' The 4% average deviation claim is based on only 5 questions from a single survey with no statistical testing of the difference."
    319     },
    320     {
    321       "flag": "Confirmation bias in example selection",
    322       "detail": "The persona-based interview examples (Figures 1-2) are selected to show consistency with the authors' own prior research [14] on gender differences in OSS. No examples of cases where the LLM produced responses inconsistent with known findings are shown."
    323     },
    324     {
    325       "flag": "Training data contamination in validation",
    326       "detail": "The survey comparison uses data from the authors' own 2021 ICSE paper [14], which GPT-4 may have seen during training. The paper acknowledges this concern in Section 3.2 ('the caveat that prior research could have been incorporated into the models' training datasets') but still presents the comparison as supportive evidence."
    327     },
    328     {
    329       "flag": "No rigorous empirical validation",
    330       "detail": "As a self-described vision paper, the work provides no controlled experiments, statistical tests, or systematic evaluation of the proposed approaches. The claims about LLM fidelity rest entirely on illustrative examples and references to other researchers' findings."
    331     }
    332   ],
    333   "cited_papers": [
    334     {
    335       "title": "Out of one, many: Using language models to simulate human samples",
    336       "authors": ["Lisa P. Argyle", "Ethan C. Busby", "Nancy Fulda", "Joshua R. Gubler", "Christopher Rytting", "David Wingate"],
    337       "year": 2023,
    338       "relevance": "Directly relevant to using LLMs as proxies for human populations in research, introducing the concept of 'algorithmic fidelity.'"
    339     },
    340     {
    341       "title": "Can AI language models replace human participants?",
    342       "authors": ["Danica Dillion", "Niket Tandon", "Yuling Gu", "Kurt Gray"],
    343       "year": 2023,
    344       "doi": "10.1016/j.tics.2023.04.008",
    345       "relevance": "Discusses conditions under which LLMs might replace human participants in psychological science research."
    346     },
    347     {
    348       "title": "Using large language models in psychology",
    349       "authors": ["Dorottya Demszky", "Diyi Yang", "David S. Yeager"],
    350       "year": 2023,
    351       "relevance": "Reviews applications of LLMs in psychology research including as substitutes for human participants."
    352     },
    353     {
    354       "title": "PersonaLLM: Investigating the ability of GPT-3.5 to express personality traits and gender differences",
    355       "authors": ["Hang Jiang", "Xiajie Zhang", "Xubo Cao", "Jad Kabbara", "Deb Roy"],
    356       "year": 2023,
    357       "arxiv_id": "2305.02547",
    358       "relevance": "Evaluates LLMs' ability to embody assigned personality traits in user personas, directly relevant to persona-based prompting approaches."
    359     },
    360     {
    361       "title": "Evaluating large language models in generating synthetic HCI research data: A case study",
    362       "authors": ["Perttu Hämäläinen", "Mikke Tavast", "Anton Kunnari"],
    363       "year": 2023,
    364       "doi": "10.1145/3544548.3580688",
    365       "relevance": "Evaluates LLMs for generating synthetic qualitative data in HCI research, finding they can produce believable accounts."
    366     },
    367     {
    368       "title": "Using large language models to simulate multiple humans and replicate human subject studies",
    369       "authors": ["Gati V. Aher", "Rosa I. Arriaga", "Adam Tauman Kalai"],
    370       "year": 2023,
    371       "relevance": "Presents the Turing Experiment test for evaluating how AI can simulate groups of participants, demonstrating replication of classic experiments."
    372     },
    373     {
    374       "title": "Large language models as subpopulation representative models: A review",
    375       "authors": ["Gabriel Simmons", "Christopher Hare"],
    376       "year": 2023,
    377       "arxiv_id": "2310.17888",
    378       "relevance": "Reviews the use of LLMs as representative models for subpopulations, directly relevant to the mega-persona approach."
    379     },
    380     {
    381       "title": "A prompt pattern catalog to enhance prompt engineering with ChatGPT",
    382       "authors": ["Jules White", "Quchen Fu", "Sam Hays"],
    383       "year": 2023,
    384       "arxiv_id": "2302.11382",
    385       "relevance": "Catalogs prompt engineering patterns including persona-based prompting, foundational to the approaches discussed in this paper."
    386     },
    387     {
    388       "title": "Supporting qualitative analysis with large language models: Combining codebook with GPT-3 for deductive coding",
    389       "authors": ["Ziang Xiao", "Xingdi Yuan", "Q. Vera Liao"],
    390       "year": 2023,
    391       "relevance": "Explores LLMs for qualitative data analysis, a complementary use to the data generation approach proposed in this paper."
    392     },
    393     {
    394       "title": "Guinea pigbots",
    395       "authors": ["Matthew Hutson", "Andrew Mastin"],
    396       "year": 2023,
    397       "relevance": "Science journal article discussing current and potential uses of generative AI in qualitative research, including as substitutes for human subjects."
    398     }
    399   ]
    400 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs