ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (30433B)


      1 {
      2   "paper": {
      3     "title": "Survival Games: Human-LLM Strategic Showdowns under Severe Resource Scarcity",
      4     "authors": [
      5       "Zhihong Chen",
      6       "Yiqian Yang",
      7       "Jinzhao Zhou",
      8       "Qiang Zhang",
      9       "Chin-teng Lin",
     10       "Yiqun Duan"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2505.17937",
     15     "doi": "10.48550/arXiv.2505.17937"
     16   },
     17   "scan_version": 3,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract provides a GitHub repository URL: https://github.com/hong123123/Survival-Games. 'Codes are available through' this link."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No dataset download link is provided. Simulation logs, raw output data, and wrongdoing detection results are not released. Only the code is shared."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No requirements.txt, Dockerfile, conda environment, or dependency specification is mentioned in the paper. The only technical detail is that 'LLMs are accessed directly through API's.'"
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README content, specific commands, or setup instructions are described."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Tables 1-6 report averages over 3 or 10 runs but no confidence intervals, error bars, or ± notation. Only point estimates are given (e.g., 'Average of three repetitions')."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper claims 'jailbreaking prompts significantly enhancing unethical actions' and makes multiple comparative claims between models, but no statistical significance tests (p-values, t-tests, etc.) are reported anywhere."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No formal effect sizes (Cohen's d, odds ratios, etc.) are reported. Comparisons are based on raw numbers in tables without any measure of effect magnitude beyond the raw differences."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The main experiments use 3 runs per condition (Section 4.1) and the history independence evaluation uses 10 trials (Section 4.6). No power analysis or justification for these sample sizes is provided."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "Tables show averages of 3 repetitions but no standard deviation, variance, IQR, or any spread measure. The reader cannot assess result stability across runs."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Multiple LLMs are compared against each other: DeepSeek-v3, DeepSeek-r1, GPT-4o, GPT-o4-mini, and GPT-3.5-turbo. Models serve as baselines for each other in Tables 1-2."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The models evaluated are contemporary: GPT-4o, GPT-o4-mini, DeepSeek-R1, and DeepSeek-V3 are all recent models. GPT-3.5-turbo is included as an older reference point."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper tests multiple conditions: base prompts (Table 1), jailbreak/puppetry attack prompts (Table 3), and EthicsPrompt cooperative prompts (Table 5), isolating the effect of prompt engineering on behavior."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Multiple metrics are used: survival duration (days), total ethical violations, violations per day, remaining food, and per-violation-type breakdown (13 categories in Table 2)."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No human evaluation of the simulation outputs or wrongdoing classifications. The wrongdoing detection is entirely LLM-based (Section 3.4, Fig. 4). No human annotators validate the detector's accuracy."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "The simulation is the evaluation itself — there is no separation between development and test scenarios. Prompts were presumably iterated on the same simulation environment without held-out evaluation."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Tables 2, 4, and 6 provide per-violation-type breakdowns across 13 categories (deception, killing, physical harm, etc.) for each model and condition."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "The paper does not discuss specific failure cases of the simulation or detection system. Section 5 briefly mentions hallucinations in the resource system but provides no concrete examples or analysis of when the approach breaks down."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "Every experiment shows the framework 'working' in some way. While GPT-4o showing zero violations is a finding, the paper does not report configurations, approaches, or detection methods that were tried and abandoned."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Abstract claims about DeepSeek engaging in resource hoarding vs. OpenAI showing restraint are supported by Tables 1-2. Prompt engineering effects are supported by Tables 3-6. The framework is described and code released."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Causal claims ('prompt engineering can significantly steer LLM behavior') are supported by controlled experiments varying one factor (prompt type) while keeping the simulation environment constant. This single-variable manipulation design is adequate for causal inference within the framework."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The conclusion claims the framework provides insights for 'real-world human-AI interactions' and 'ensure LLM alignment with human norms in high-stakes, real-world human-AI interactions.' This dramatically overgeneralizes from a simple 3-agent simulation with 2 model families to real-world deployment."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "No alternative explanations are discussed. The behavioral differences between models could stem from API temperature defaults, training data differences, or biases in the LLM-based wrongdoing detector. None of these are considered."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper measures LLM-detected wrongdoing labels in a simulation and frames this as 'ethical alignment' and 'moral behavior.' No discussion of whether simulation-based proxy metrics capture actual ethical alignment in real deployment scenarios."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "Models are identified by marketing names only: 'DeepSeek-R1/V3', 'GPT-4o/4o-mini', 'GPT-3.5-turbo'. No snapshot dates, API versions, or specific model identifiers (e.g., 'gpt-4o-2024-05-13') are provided."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Extensive prompt examples are provided in Appendices A-C, including full input/output pairs for daily planning (A.1-A.4), subplan determination (A.3-A.4), resource system classification (B), and ethical violation evaluation (C)."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "No LLM API parameters are reported — no temperature, top-p, max tokens, or sampling settings. Section 4.1 states 'LLMs are accessed directly through API's' without specifying any parameters."
    158       },
    159       "scaffolding_described": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The agent architecture is described in detail in Section 3: perceive, retrieve, plan, reflect, and execute modules (Section 3.1), inter-agent interactions with memory streams (Section 3.2), and the health/food system (Section 3.3). Figures 1-2 illustrate the architecture."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "The simulation-to-evaluation pipeline is described at a high level (Section 3.4), but the specific steps from raw simulation logs to the averaged numbers in Tables 1-6 are not documented. How wrongdoing detections are aggregated into the final counts is not explained."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 5 is titled 'Limitation' and contains two sentences about hallucinations in the resource system and the need for more scenarios. It exists as a dedicated section, though it is very brief."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The limitations section is only two sentences long. It mentions 'hallucinations in the resource system' and needing 'a wider range of scenarios' but does not discuss specific threats such as the reliability of the LLM-based evaluator, the 3-run sample size, potential API parameter confounds, or the validity of simulated 'human' behavior."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No explicit scope boundaries are stated. The paper does not specify what the results do NOT show. Instead, it makes sweeping claims about 'real-world human-AI interactions' without bounding what was not tested (e.g., different scenarios, actual humans, more than 3 agents)."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No raw simulation logs, agent action traces, or wrongdoing detection outputs are released. Only aggregated averages appear in the tables. Independent verification of the underlying data is not possible."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The simulation setup is described: Section 4.1 specifies 3 agents, 6 simulated days, 3 runs per condition, initial food = 15 units, and that the robot agent uses different LLMs. The wrongdoing detection procedure is described in Section 3.4."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants are involved. All agents in the simulation are LLM-controlled. The question of participant recruitment does not apply."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "The paper describes the simulation and the wrongdoing detection system separately, but the full pipeline from raw simulation output to the final averaged numbers in the tables is not documented. Aggregation steps, filtering criteria, and data transformation are not specified."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding source or acknowledgment section is present in the paper. There is no mention of grants, sponsors, or funding agencies."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations are clearly listed: South China University of Technology, Hong Kong University of Science and Technology, and University of Technology Sydney. No authors are affiliated with the companies whose models are evaluated."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "Since no funding is disclosed, independence of the funder from the outcome cannot be assessed. Absence of disclosure prevents evaluation."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial disclosure statement appears anywhere in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No training data cutoff dates are stated for any of the evaluated models (DeepSeek-R1/V3, GPT-4o, GPT-o4-mini, GPT-3.5-turbo). The paper evaluates pre-trained models on a benchmark simulation."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No discussion of whether the simulation scenarios, prompts, or the generative agents framework could have appeared in the models' training data. The framework builds on Park et al. (2023), which is widely known and likely in training corpora."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "The simulation framework is novel but built on well-known generative agents work. No discussion of whether familiarity with the underlying framework or similar scenarios could influence model behavior. The paper does not address contamination risk."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved. All 'agents' in the simulation are LLM-controlled, including the two 'human' agents which use GPT-4o."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants. The paper studies LLM-controlled agents in a simulation, not actual human subjects."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants to characterize demographically."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants to recruit or screen."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants to randomize."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants requiring blinding."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants who could drop out."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Section 4.1 states: 'For each simulated day, the API cost is $1 at most (for GTP-4o).' This provides a basic cost estimate per simulation day."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Only per-day API cost for GPT-4o is mentioned ($1). Total compute budget across all experiments (multiple models × multiple conditions × 3 runs × ~6 days) is not quantified. No hardware or total API spend reported."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "Three runs per condition are conducted (Section 4.1) but no variance, standard deviation, or seed sensitivity analysis is reported. Only averages appear in tables."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "Section 4.1 explicitly states: 'We conduct 3 runs per LLM–prompt configuration to control for stochasticity.' Section 4.6 states 'ten repeated trials.'"
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No hyperparameter search is described. Models are used via API with unspecified parameters. No mention of how prompt wordings or simulation parameters were selected or tuned."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The specific prompt framings (cooperative vs. self-preservation), jailbreak templates, and simulation parameters are presented without justification for why these particular configurations were chosen."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "Multiple models are compared across multiple conditions and violation types without any correction for multiple comparisons. No Bonferroni, Holm, or other corrections are applied."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors evaluate their own simulation framework and wrongdoing detection system without acknowledging potential bias from author-evaluation. No independent evaluation or validation of the detection system is provided."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "Different models likely have different computational costs and latencies, but performance is not reported as a function of compute. GPT-4o costs $1/day but costs for other models are not compared."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "No discussion of whether the simulation benchmark actually measures 'ethical alignment' as claimed. The construct validity of mapping LLM-detected wrongdoing labels in a simple simulation to real-world ethical behavior is not examined."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": true,
    340         "answer": true,
    341         "justification": "All evaluated models use the same simulation framework and agent architecture. The scaffold is constant across model comparisons, controlling for this confound by design. However, the 'human' agents all use GPT-4o, which is noted in Table 1 caption."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of temporal leakage. The generative agents framework (Park et al., 2023) and similar simulation scenarios are publicly available and likely in model training data, potentially influencing behavior."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the evaluation setup leaks information. The prompts explicitly describe the game rules and other agents' status, which could influence behavior in ways that conflate ethical alignment with prompt compliance."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether the 3 runs per condition are truly independent or whether model behavior is correlated across runs due to shared training."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention method is used. No canary strings, membership inference, or decontamination analysis."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "DeepSeek-R1 frequently engages in selfish or deceptive strategies (4.33 violations total, 1.44 per day), while GPT-4o demonstrates restraint with zero ethical violations.",
    370       "evidence": "Table 1 shows DeepSeek-R1 has 4.33 total violations vs. 0 for GPT-4o. Table 2 details DeepSeek-R1's violations: 0.67 deception, 3.67 spying. GPT-4o and GPT-3.5-turbo show zero violations across all categories. Averaged over 3 runs.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Jailbreaking (puppetry attack) prompts significantly enhance unethical actions across all models, even those with strong safety alignment.",
    375       "evidence": "Table 3-4 show that with puppetry attacks, violations jump from 0 to 182-244 across all models. GPT-4o goes from 0 to 244 violations with attack. All 13 violation categories are triggered under attack conditions.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Cooperative prompts (EthicsPrompt) consistently eliminate all ethical violations across all models.",
    380       "evidence": "Table 5 shows EthicsPrompt reduces all violations to zero for every model tested (DeepSeek-v3, DeepSeek-r1, GPT-o4-mini) across 10 trials under extreme unfair initialization.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Reasoning-capable models (DeepSeek-R1) exhibit a wider array of unethical actions under resource scarcity compared to non-reasoning models.",
    385       "evidence": "Table 6 shows DeepSeek-R1 engages in 7 different violation types (36 total violations over 10 trials) while GPT-o4-mini shows only stealing (2 total). DeepSeek-v3 shows zero violations. Section 4.6 attributes this to 'reasoning-driven adaptability.'",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "The framework provides a reproducible testbed for quantifying LLM ethics in high-stakes scenarios with insights for real-world human-AI interactions.",
    390       "evidence": "Code is released on GitHub. The simulation framework is described in Sections 3.1-3.4. However, no validation of the testbed itself (e.g., human agreement with wrongdoing detector, test-retest reliability) is provided.",
    391       "supported": "weak"
    392     }
    393   ],
    394   "methodology_tags": [
    395     "benchmark-eval",
    396     "case-study"
    397   ],
    398   "key_findings": "The paper finds stark behavioral differences between LLM families in a survival simulation: DeepSeek-R1 commits 4.33 ethical violations per simulation (primarily spying) while GPT-4o commits zero. Jailbreak prompts dramatically increase violations across all models (0→182-244), while cooperative EthicsPrompt prompts eliminate all violations. The study suggests ethical alignment under resource scarcity is both model-dependent and prompt-dependent, though results are based on only 3 runs per condition without statistical testing.",
    399   "red_flags": [
    400     {
    401       "flag": "Tiny sample size without variance reporting",
    402       "detail": "Main experiments use only 3 runs per condition with averages reported but no standard deviation, confidence intervals, or error bars. With such high stochasticity in LLM outputs, 3 runs provide very weak evidence. A single unusual run could dominate the averages."
    403     },
    404     {
    405       "flag": "LLM evaluating LLM without validation",
    406       "detail": "The wrongdoing detection system (Section 3.4) is itself LLM-based, but there is no validation of the detector's accuracy, precision, or recall against human annotations. The evaluation instrument is unvalidated."
    407     },
    408     {
    409       "flag": "Misleading 'Human-LLM' framing",
    410       "detail": "The paper title claims 'Human-LLM Strategic Showdowns' but no actual humans participate. Both 'human' agents are controlled by GPT-4o (Table 1 caption). The entire simulation is LLM-vs-LLM, not human-vs-LLM as the framing suggests."
    411     },
    412     {
    413       "flag": "No statistical significance tests despite repeated use of 'significantly'",
    414       "detail": "The word 'significantly' appears multiple times (abstract, Section 4.4, 4.5) referring to differences between conditions, but no statistical tests are performed anywhere in the paper. All claims are based on comparing raw averaged numbers."
    415     },
    416     {
    417       "flag": "Overclaiming generalization to real-world deployment",
    418       "detail": "The conclusion claims insights for 'real-world human-AI interactions' and 'trustworthy AI deployment in real-world, resource-scarce contexts' from a simple 3-agent text simulation with no actual humans, unvalidated metrics, and 3 runs per condition."
    419     },
    420     {
    421       "flag": "Unspecified model versions and API parameters",
    422       "detail": "Models are referred to by marketing names only (GPT-4o, DeepSeek-R1) without API versions, snapshot dates, or sampling parameters (temperature, top-p). Results may not be reproducible if model versions change."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "Do the rewards justify the means? Measuring trade-offs between rewards and ethical behavior in the MACHIAVELLI benchmark",
    428       "authors": ["Alexander Pan", "Jun Shern Chan", "Andy Zou", "Nathaniel Li", "Steven Basart", "Thomas Woodside", "Hanlin Zhang", "Scott Emmons", "Dan Hendrycks"],
    429       "year": 2023,
    430       "relevance": "LLM ethical behavior benchmark that this paper adapts for its wrongdoing detection system."
    431     },
    432     {
    433       "title": "Generative agents: Interactive simulacra of human behavior",
    434       "authors": ["Joon Sung Park", "Joseph O'Brien", "Carrie Jun Cai", "Meredith Ringel Morris", "Percy Liang", "Michael S Bernstein"],
    435       "year": 2023,
    436       "relevance": "Foundational generative agent framework that this paper extends with survival mechanics."
    437     },
    438     {
    439       "title": "Affordable generative agents",
    440       "authors": ["Yangbin Yu", "Qin Zhang", "Qiang Fu", "Deheng Ye"],
    441       "year": 2024,
    442       "relevance": "Scalable extension of generative agents framework used as the base for this paper's simulation environment."
    443     },
    444     {
    445       "title": "Moral alignment for LLM agents",
    446       "authors": ["Elizaveta Tennant", "Stephen Hailes", "Mirco Musolesi"],
    447       "year": 2024,
    448       "arxiv_id": "2410.01639",
    449       "relevance": "Studies LLM moral alignment including deceptive strategies under red-teaming conditions."
    450     },
    451     {
    452       "title": "Project SID: Many-agent simulations toward AI civilization",
    453       "authors": ["Altera AL", "Andrew Ahn", "Nic Becker"],
    454       "year": 2024,
    455       "arxiv_id": "2411.00114",
    456       "relevance": "Large-scale multi-agent simulation revealing emergent resource monopolies and governance failures."
    457     },
    458     {
    459       "title": "Red teaming language models with language models",
    460       "authors": ["Ethan Perez", "Sam Ringer", "Kamile Lukosiute"],
    461       "year": 2022,
    462       "relevance": "Foundational work on adversarial red-teaming of LLMs for alignment testing."
    463     },
    464     {
    465       "title": "Alignment of language agents",
    466       "authors": ["Zachary Kenton", "Tom Everitt", "Laura Weidinger", "Iason Gabriel", "Vladimir Mikulik", "Geoffrey Irving"],
    467       "year": 2021,
    468       "arxiv_id": "2103.14659",
    469       "relevance": "Theoretical framework for aligning language agents with human values in deployment scenarios."
    470     },
    471     {
    472       "title": "Deconstructing the ethics of large language models from long-standing issues to new-emerging dilemmas: A survey",
    473       "authors": ["Chengyuan Deng", "Yiqun Duan", "Xin Jin"],
    474       "year": 2024,
    475       "arxiv_id": "2406.05392",
    476       "relevance": "Survey of ethical issues in LLMs covering value alignment and deployment dilemmas."
    477     },
    478     {
    479       "title": "Voyager: An open-ended embodied agent with large language models",
    480       "authors": ["Guanzhi Wang", "Yuqi Xie", "Yunfan Jiang"],
    481       "year": 2023,
    482       "relevance": "Open-ended LLM agent in Minecraft exhibiting emergent strategies relevant to studying agent behavior under resource constraints."
    483     },
    484     {
    485       "title": "A virtue-based framework to support putting AI ethics into practice",
    486       "authors": ["Thilo Hagendorff"],
    487       "year": 2022,
    488       "relevance": "Virtue ethics framework for evaluating AI alignment, directly relevant to ethical behavior evaluation approaches."
    489     },
    490     {
    491       "title": "Agent incentives: A causal perspective",
    492       "authors": ["Tom Everitt", "Ryan Carey", "Eric D Langlois", "Pedro A Ortega", "Shane Legg"],
    493       "year": 2021,
    494       "relevance": "Causal analysis of agent incentive structures revealing tensions between survival-driven incentives and ethical goals."
    495     }
    496   ],
    497   "engagement_factors": {
    498     "practical_relevance": {
    499       "score": 1,
    500       "justification": "The simulation framework is a research tool for studying LLM ethics, not directly applicable to practitioner workflows."
    501     },
    502     "surprise_contrarian": {
    503       "score": 1,
    504       "justification": "Finding that jailbreaks increase unethical behavior and that different models behave differently is expected; no conventional wisdom is challenged."
    505     },
    506     "fear_safety": {
    507       "score": 2,
    508       "justification": "Raises concerns about LLMs making unethical decisions under resource scarcity, touching on AI safety in competitive multi-agent scenarios."
    509     },
    510     "drama_conflict": {
    511       "score": 1,
    512       "justification": "Some implicit DeepSeek vs. OpenAI comparison angle, but presented as neutral research rather than controversy."
    513     },
    514     "demo_ability": {
    515       "score": 1,
    516       "justification": "Code released on GitHub but requires API keys and simulation setup; not easily demoed."
    517     },
    518     "brand_recognition": {
    519       "score": 1,
    520       "justification": "Tests OpenAI GPT-4o and DeepSeek models which are recognizable, but the paper itself is from academic groups without major brand recognition."
    521     }
    522   }
    523 }

Impressum · Datenschutz