scan.json (27362B)
1 { 2 "paper": { 3 "title": "LLM Agents in Interaction: Measuring Personality Consistency and Linguistic Alignment in Interacting Populations of Large Language Models", 4 "authors": ["Ivar Frisch", "Mario Giulianelli"], 5 "year": 2024, 6 "venue": "PERSONALIZE", 7 "arxiv_id": "2402.02896", 8 "doi": "10.48550/arXiv.2402.02896" 9 }, 10 "scan_version": 3, 11 "active_modules": [], 12 "methodology_tags": ["observational"], 13 "key_findings": "Personality-conditioned GPT-3.5 agents can be distinguished by both BFI questionnaire responses and LIWC-analyzed language use (98.5% classification accuracy), but consistency varies by profile: creative agents maintain stable BFI scores while analytical agents drift significantly after a writing task. After cross-group interaction, agents exhibit linguistic alignment toward their conversational partner—creative agents adapt more than analytical ones, reducing LIWC-based classification accuracy to 66.15%. The alignment is asymmetric and persona-dependent rather than a simple convergence effect.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "Code is released at https://github.com/ivarfresh/Interaction_LLMs, stated in footnote 1 of Section 2: 'Code for experiments and analyses available at https://github.com/ivarfresh/Interaction_LLMs'." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "The paper does not mention releasing the generated stories, BFI response data, or LIWC vectors. Only the code repository is referenced." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper mentions 'gpt-3.5-turbo-0613' and 'the LangChain library' (footnote 2) but provides no requirements.txt, Dockerfile, or detailed environment specifications with library versions." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided in the paper. The code repository is linked but no README with commands or a 'Reproducing Results' section is described." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Box plots in Figure 1 show distributions (quartiles, median) but no formal confidence intervals are reported. ANOVA tables (Tables 1-6) report F-statistics and p-values but no CIs." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": true, 46 "justification": "ANOVA tests are used throughout with F-statistics and p-values reported in Tables 1-6 (e.g., Table 1 reports F=8645, p<0.001 for Extraversion). Point-biserial and Spearman correlations are also reported." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Cohen's d values are reported in Tables 2, 3, 5, and 6 (e.g., Table 2 shows Cohen's d=2.61 for Agreeableness in the analytic group). Point-biserial correlation coefficients also serve as effect size measures (Figure 2c, 2d)." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper never explicitly states the number of agents per group, let alone justifies the sample size or provides a power analysis. The scatter plots suggest roughly 50-100 per group, but this is never stated in the text." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Box plots in Figure 1 show distributional spread visually, but the results tables (Tables 2, 3, 5, 6) report only means—no standard deviations or other spread measures are included alongside the numerical results." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The non-interactive condition (Experiment 1) serves as the baseline/control against which the interactive condition (Experiment 2) is compared. Pre-writing BFI scores also serve as baselines for post-writing scores." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": false, 73 "justification": "The paper does not compare its personality conditioning approach against alternative methods (e.g., other prompting strategies, fine-tuning, or prior work's conditioning techniques). Only within-study comparisons are made." 74 }, 75 "ablation_study": { 76 "applies": false, 77 "answer": false, 78 "justification": "The system has essentially one component (personality conditioning via prompting). There are no modular components to ablate." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Multiple assessment methods are used: BFI questionnaire scores (explicit personality assessment), LIWC category counts (implicit linguistic assessment), logistic regression classification accuracy, PCA visualization, and Spearman correlations between BFI scores and LIWC categories." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "All evaluation is automated. BFI tests are self-administered by LLM agents, LIWC analysis is computational, and classification is via logistic regression. No human evaluation of the generated stories or agent behavior is conducted." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "The logistic regression classifier uses 10-fold cross-validation ('trained and tested in a 10-fold cross-validation setup', Section 3.1.2), ensuring results are reported on held-out data." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down by all five BFI personality traits (Extraversion, Agreeableness, Conscientiousness, Neuroticism, Openness), by persona group (creative vs analytical), and by experimental condition (interactive vs non-interactive)." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "The Limitations section discusses that 'stories written by GPT-3.5 were not always of good quality' and that 'generations often contain mentions to the agent's own personality traits' despite instructions to avoid this." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Several negative results are reported: analytical agents show significant personality drift after writing (Table 2), neuroticism scores overlap between groups (Table 1, p=0.005 vs p<0.001 for others), and the interactive condition does not produce stronger alignment than the non-interactive condition for the analytical group." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims are hedged ('exploratory work', 'different degrees of personality consistency') and supported by the results: BFI score differences (Figure 1, Table 1), LIWC separation (Figure 2, 98.5% accuracy), and post-interaction convergence (66.15% accuracy). No overclaiming." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper makes causal claims about interaction affecting behavior ('changes as a result of interaction'). The study design manipulates the interaction variable while controlling personality conditioning, comparing interactive vs non-interactive conditions (Experiment 1 vs 2) with pre/post measurements, which is adequate for these claims." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title says 'Large Language Models' (plural) but only GPT-3.5-turbo is tested. Only two extreme personality profiles are used. While the paper frames results as 'exploratory,' the title and abstract suggest broader applicability than the single-model, two-persona design supports." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper carefully distinguishes interaction effects from writing-task effects by comparing post-interactive BFI scores against post-non-interactive scores (not just pre-writing scores): 'To discern changes in BFI responses that result from interaction from those induced by the writing task itself' (Section 3.2.1)." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper distinguishes between explicit personality assessment (BFI questionnaires) and implicit personality assessment (LIWC-based language analysis) and discusses the relationship between these proxies and actual personality consistency. They note that LIWC-BFI correlations weaken after interaction (Table 7 vs Table 4), showing awareness of what the measurements capture." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "The exact model version 'gpt-3.5-turbo-0613' is specified in footnote 2 of Section 2.1." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Full prompt text is provided in Appendix A: personality prompts (A.1, A.2), writing task prompts (A.3), BFI test prompt (A.4), and all 44 BFI statements (A.5). The actual text used is given, not just descriptions." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Footnote 2 states: 'All parameters at their OpenAI default settings, except for temperature' with temperature=0.7 specified in Section 2.1. Context window size (4,096 tokens) is also noted." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The experiments directly prompt GPT-3.5-turbo with personality conditioning and task instructions." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "The word count filtering is documented: 'we only keep stories with a word count between 500 and 900' (footnote 5). LIWC processing using the 2007 dictionary is described (Section 2.4). BFI scoring procedure is detailed in Appendix A.6." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "A dedicated 'Limitations' section is present with substantive discussion spanning multiple paragraphs, covering interaction design, personality measures, story quality, and prompt engineering limitations." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Specific threats are discussed: single-turn interaction only ('we only studied interactions consisting of one turn of one-sided dialogue'), GPT-3.5 story quality ('generations often contain mentions to the agent's own personality traits'), and limited prompting strategies ('extensive prompt engineering was beyond the scope of this study')." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "The paper explicitly bounds its scope: 'exploratory work' (Section 1), only GPT-3.5-turbo was tested, only two personality profiles, single-turn interaction, and the Ethical Considerations section notes the personas 'do not reflect real-life personality categorisations of human subjects.'" 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "The generated stories, BFI responses, and LIWC vectors are not released. Only the code repository is provided, but not the actual experimental outputs." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "The data generation procedure is well-described: temperature sampling (0.7) from GPT-3.5-turbo-0613, personality prompts (Appendix A.1-A.2), writing task prompts (A.3), BFI administration (A.4-A.5), and LIWC analysis using the 2007 dictionary." 192 }, 193 "recruitment_methods_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "The agent population bootstrapping method is clearly described in Section 2.1: temperature sampling from a single LLM to create a population, with each response considered as a different agent. The two-group structure via personality prompts is detailed in Section 2.2." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "The paper does not report how many stories were generated in total, how many were discarded for falling outside the 500-900 word range, or the exact number of agents per group. There are unexplained gaps between data generation and final analysis." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding source is mentioned anywhere in the paper. No acknowledgments section listing grants or sponsors is present." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly listed: Ivar Frisch at Utrecht University and Mario Giulianelli at ETH Zürich. They are not evaluating their own product." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": false, 217 "answer": false, 218 "justification": "No funding is mentioned; appears to be unfunded academic work (student thesis at Utrecht University)." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial disclosure statement is included in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. It studies behavioral properties (personality consistency and linguistic alignment) of LLM agents, not model knowledge or task performance." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "Not applicable — the paper studies LLM agent behavior under personality conditioning, not model performance on a benchmark that could be contaminated." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "Not applicable — no benchmark evaluation is performed. The BFI questionnaire is used as a personality assessment instrument, not a capability benchmark." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants. All subjects are LLM agents." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. The study uses LLM agents only." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants. Agent 'demographics' (personality profiles) are experimentally assigned." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants to include or exclude." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants. Agent creation via temperature sampling is described but this is not human subject randomization." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants. Blinding is not applicable to LLM agent experiments." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants. Agent dropout is not applicable, though story filtering is mentioned without counts." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No API costs, token counts, or wall-clock times are reported despite using GPT-3.5-turbo for all experiments (generating stories, administering BFI tests across multiple conditions)." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total computational budget (total API spend, number of API calls, or processing time) is stated." 290 } 291 } 292 }, 293 "claims": [ 294 { 295 "claim": "Personality-conditioned LLM agents show BFI scores consistent with their assigned profiles, with significant differences across 4 of 5 traits before writing.", 296 "evidence": "ANOVA results in Table 1 show significant differences for Extraversion (F=8645, p<0.001), Agreeableness (F=13384, p<0.001), Conscientiousness (F=1439, p<0.001), and Openness (F=5012, p<0.001). Neuroticism shows overlap (F=23, p=0.005). See Figure 1a.", 297 "supported": "strong" 298 }, 299 { 300 "claim": "Creative agents maintain stable BFI scores after writing while analytical agents show significant personality drift.", 301 "evidence": "Table 3 shows no significant BFI changes for creative agents (all p>0.4). Table 2 shows analytical agents' scores increase significantly on all five traits after writing (all p<0.03, Cohen's d up to 2.71 for Conscientiousness).", 302 "supported": "strong" 303 }, 304 { 305 "claim": "Agents from different personality groups can be clearly distinguished based on LIWC language use with 98.5% accuracy in the non-interactive condition.", 306 "evidence": "Section 3.1.2: 'A simple logistic regression classifier trained and tested in a 10-fold cross-validation setup on count vectors of LIWC categories obtains an almost perfect average accuracy of 98.5%.' PCA visualization (Figure 2a) shows clear separation.", 307 "supported": "strong" 308 }, 309 { 310 "claim": "Agents exhibit linguistic alignment toward their conversational partner after cross-group interaction, reducing classification accuracy from 98.5% to 66.15%.", 311 "evidence": "Section 3.2.2: classification accuracy drops from 98.5% (non-interactive) to 66.15% (interactive) in 10-fold cross-validation. Figures 2a vs 2b show increased overlap in PCA space. Point-biserial correlations weaken (Figure 2c vs 2d).", 312 "supported": "strong" 313 }, 314 { 315 "claim": "Linguistic alignment is asymmetric: the creative persona adapts more toward the analytical one than vice versa.", 316 "evidence": "Section 3.2.2: 'creative agents use more words expressing negative emotions, sadness and discrepancy than before interaction. These categories are specific to analytical agents in the non-interactive condition.' Spearman correlations weaken more for creative traits (Table 7 vs Table 4). However, the evidence is based on direction of LIWC shift without a direct statistical test of asymmetry.", 317 "supported": "moderate" 318 }, 319 { 320 "claim": "Post-interaction changes in analytical agents' BFI scores reflect inconsistency rather than alignment to the creative persona.", 321 "evidence": "Section 3.2.1: Analytical agents' post-interaction traits 'move towards those of the creative group—but less so than after the non-interactive writing task.' Tables 5-6 compare conditions. The drift is smaller with interaction than without, suggesting it's not interaction-driven alignment.", 322 "supported": "moderate" 323 } 324 ], 325 "red_flags": [ 326 { 327 "flag": "Undisclosed sample size", 328 "detail": "The paper never explicitly states the number of agents per group. The scatter plots suggest roughly 50-100 per group, but this critical methodological detail is absent from the text, making it impossible to fully evaluate the statistical claims." 329 }, 330 { 331 "flag": "Missing attrition counts", 332 "detail": "Stories are filtered to 500-900 words, but the paper does not report how many stories were generated, how many were discarded, or whether the filtering was differential across groups—which could introduce selection bias." 333 }, 334 { 335 "flag": "Single model only", 336 "detail": "Only GPT-3.5-turbo-0613 is tested, but the title and framing suggest broader applicability to 'Large Language Models.' The findings may be specific to this model's training and RLHF alignment." 337 }, 338 { 339 "flag": "Extreme unrealistic personas", 340 "detail": "The two personality profiles set all BFI traits to extremes (all high or all low), which does not reflect realistic human personality distributions. As the authors acknowledge, this limits ecological validity." 341 }, 342 { 343 "flag": "No variance in tabular results", 344 "detail": "Tables 2, 3, 5, and 6 report means for BFI scores but no standard deviations, making it difficult to assess the reliability and spread of the measurements despite Cohen's d being reported." 345 } 346 ], 347 "cited_papers": [ 348 { 349 "title": "Generative agents: Interactive simulacra of human behavior", 350 "authors": ["Joon Sung Park", "Joseph O'Brien", "Carrie Jun Cai", "Meredith Ringel Morris", "Percy Liang", "Michael S. Bernstein"], 351 "year": 2023, 352 "relevance": "Seminal work on LLM agent populations simulating human behavior, directly relevant to multi-agent AI systems and agentic workflows." 353 }, 354 { 355 "title": "PersonaLLM: Investigating the ability of GPT-3.5 to express personality traits and gender differences", 356 "authors": ["Hang Jiang", "Xiajie Zhang", "Xubo Cao", "Jad Kabbara", "Deb Roy"], 357 "year": 2023, 358 "arxiv_id": "2305.02547", 359 "relevance": "Key prior work on GPT-3.5 personality conditioning whose prompts and methodology are directly reused in this study." 360 }, 361 { 362 "title": "Personality traits in large language models", 363 "authors": ["Greg Serapio-García", "Mustafa Safdari", "Clément Crepy", "Stephen Fitz", "Peter Romero", "Luning Sun", "Marwa Abdulhai", "Aleksandra Faust", "Maja Matarić"], 364 "year": 2023, 365 "arxiv_id": "2307.00184", 366 "relevance": "Evaluates personality trait expression in LLMs using psychological instruments, relevant to understanding LLM behavioral conditioning." 367 }, 368 { 369 "title": "On the effectiveness of creating conversational agent personalities through prompting", 370 "authors": ["Heng Gu", "Chadha Degachi", "Uğur Genç", "Senthil Chandrasegaran", "Himanshu Verma"], 371 "year": 2023, 372 "arxiv_id": "2310.11182", 373 "relevance": "Studies effectiveness of prompt-based personality conditioning for conversational agents, directly relevant to persona-conditioned LLM behavior." 374 }, 375 { 376 "title": "Identifying and manipulating the personality traits of language models", 377 "authors": ["Graham Caron", "Shashank Srivastava"], 378 "year": 2022, 379 "arxiv_id": "2212.10276", 380 "relevance": "Foundational work on measuring and modifying LLM personality traits, relevant to AI safety and alignment through behavioral control." 381 }, 382 { 383 "title": "HuggingGPT: Solving AI tasks with ChatGPT and its friends in HuggingFace", 384 "authors": ["Yongliang Shen", "Kaitao Song", "Xu Tan", "Dongsheng Li", "Weiming Lu", "Yueting Zhuang"], 385 "year": 2023, 386 "arxiv_id": "2303.17580", 387 "relevance": "Multi-LLM orchestration system where models collaborate on tasks, relevant to agentic AI workflows and multi-agent interaction." 388 }, 389 { 390 "title": "MM-ReAct: Prompting ChatGPT for multimodal reasoning and action", 391 "authors": ["Zhengyuan Yang", "Linjie Li", "Jianfeng Wang", "Kevin Lin", "Ehsan Azarnasab", "Faisal Ahmed", "Zicheng Liu", "Ce Liu", "Michael Zeng", "Lijuan Wang"], 392 "year": 2023, 393 "arxiv_id": "2303.11381", 394 "relevance": "Multimodal agentic framework extending LLM capabilities through tool use and action, relevant to agentic AI systems." 395 }, 396 { 397 "title": "Mindstorms in natural language-based societies of mind", 398 "authors": ["Mingchen Zhuge", "Haozhe Liu", "Francesco Faccio", "Dylan R. Ashley", "Róbert Csordás", "Anand Gopalakrishnan", "Abdullah Hamdi"], 399 "year": 2023, 400 "arxiv_id": "2305.17066", 401 "relevance": "Studies emergent behavior in LLM populations organized as societies of mind, relevant to multi-agent AI interaction and collective intelligence." 402 }, 403 { 404 "title": "Does GPT-3 demonstrate psychopathy? Evaluating large language models from a psychological perspective", 405 "authors": ["Xingxuan Li", "Yutong Li", "Shafiq Joty", "Linlin Liu", "Fei Huang", "Lin Qiu", "Lidong Bing"], 406 "year": 2022, 407 "arxiv_id": "2212.10529", 408 "relevance": "Evaluates LLM behavior from a psychological perspective using standardized instruments, relevant to AI safety and behavioral assessment." 409 }, 410 { 411 "title": "Examining GPT-4: Capabilities, implications and future directions", 412 "authors": ["Edward Y. Chang"], 413 "year": 2023, 414 "relevance": "Examines GPT-4 capabilities and implications, relevant to understanding LLM capability assessment and evaluation methodology." 415 } 416 ], 417 "engagement_factors": { 418 "practical_relevance": { 419 "score": 1, 420 "justification": "Somewhat relevant for practitioners building persona-conditioned chatbots or multi-agent systems, but no directly usable tool or technique is provided." 421 }, 422 "surprise_contrarian": { 423 "score": 1, 424 "justification": "The asymmetric alignment finding (creative adapts more than analytical) is mildly interesting but the overall message that LLM personas are fragile in interaction is not very surprising." 425 }, 426 "fear_safety": { 427 "score": 1, 428 "justification": "The ethical considerations section raises concerns about targeting individuals via persona-conditioned LLMs, but the paper itself does not demonstrate a novel attack or safety threat." 429 }, 430 "drama_conflict": { 431 "score": 0, 432 "justification": "No controversy, no criticism of specific labs or products, and no dramatic claims." 433 }, 434 "demo_ability": { 435 "score": 1, 436 "justification": "Code is released on GitHub, but it is experimental scripts, not a tool or demo someone could immediately try." 437 }, 438 "brand_recognition": { 439 "score": 1, 440 "justification": "Uses GPT-3.5 (a well-known product) but authors are from academic institutions (Utrecht University, ETH Zürich), not a famous AI lab." 441 } 442 } 443 }