scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27084B)
      1 {
      2   "paper": {
      3     "title": "Annotation alignment: Comparing LLM and human annotations of conversational safety",
      4     "authors": ["Rajiv Movva", "Pang Wei Koh", "Emma Pierson"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2406.06369"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No GitHub link, Zenodo archive, or any repository URL is provided in the paper. No mention of code release."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The study uses the publicly available DICES dataset (Aroyo et al., 2023). The DICES-350 dataset is referenced as a public resource. However, the authors' own LLM annotations are not explicitly released."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or environment specification is provided. The paper mentions API access (OpenAI API, Google Vertex AI API) but does not specify library versions or environment details."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The methodology is described in text but without sufficient detail for exact replication (e.g., exact API call parameters beyond temperature)."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The paper reports confidence intervals: e.g., 'r = 0.95 (95% CI: [0.93, 0.96])' for split-half reliability, and 99% CIs for null distributions in RQ2 (e.g., 'r ∈ [0.44, 0.64] for the Latinx female group'). Bootstrap-based intervals are used throughout."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Bootstrap resampling is used for significance testing: 'we check if r1 > r2 in at least 95% of 1000 bootstrap resamples' (Section 3, RQ1). Permutation tests with 5000 permutations are used for RQ2 (Section 3). Bonferroni correction is applied for multiple comparisons (footnote 3)."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Pearson correlations are reported as effect sizes throughout (e.g., r = 0.59 for GPT-4 vs. r = 0.51 for median annotator). Table 1 provides comprehensive correlation values. The paper contextualizes GPT-4's correlation as falling at the 81st percentile relative to individual annotators."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper explicitly addresses statistical power: RQ2 shows 'the dataset is underpowered to detect demographic differences in annotator-LLM alignment' (Section 3). The authors demonstrate that 350 conversations are insufficient for subgroup comparisons by showing wide CIs. This is a power analysis by demonstration."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Standard deviations of individual annotator correlations with GPT-4 are reported: 'across all annotators, the standard deviation of an individual annotator's correlation with GPT-4 is 0.107; for the raters within a group, the SD in correlation with GPT-4 ranges from 0.075 (Latinx female) to 0.132 (Asian male)' (Section 3, RQ2). Reliability checks in Appendix C report r >= 0.94 across repeated trials."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Multiple baselines are included: five LLMs are compared (GPT-3.5, GPT-4, GPT-4o, Gemini 1.5 Pro, Llama 3.1 405B), and the median human annotator's correlation (r = 0.51) and the average of 3 random humans (r = 0.72) serve as human baselines."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "All models tested are contemporary as of 2024: GPT-4, GPT-4o, Gemini 1.5 Pro, and Llama 3.1 405B Instruct. These are leading models on recent leaderboards (Chiang et al., 2024)."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper ablates prompt design choices: rating-only vs. analyze-rate prompts (Table 1), Likert vs. binary ratings (Appendix C), single vs. per-criterion ratings (Appendix C), and zero-shot vs. few-shot (Appendix C). Each design choice is evaluated on a held-out validation set."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper uses Pearson correlation on both Likert ratings and binarized ratings (Table 1), compares to the median annotator and percentile rank, and examines within-group standard deviation as additional measures."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Human evaluation is central to this paper. The entire study compares LLM annotations against 112 human annotators. The qualitative analysis of disagreements (Section 3, RQ1) involves manual categorization of 27+21 disagreement cases by the authors."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper explicitly uses DICES-990 as a validation set for prompt engineering and DICES-350 as a held-out test set. Footnote 7 states: 'prompt engineering is a form of hyperparameter tuning... here we engineered our prompt by validating on DICES-990, so that DICES-350 is a fully held-out test set.'"
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by model (5 models), by prompt type (rating-only vs. analyze-rate), by rating type (Likert vs. binary) in Table 1, and by demographic group (10 race-gender subgroups) in Figure 2."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Extensive qualitative analysis of failure cases is provided in Section 3 (RQ1) and Tables S2-S3: 27 cases where GPT-4 rates safe but humans rate unsafe, and 21 cases in the opposite direction, with manual categorization of error patterns."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Several negative results are prominently reported: RQ2 shows the dataset is underpowered for demographic alignment analysis; RQ3 shows GPT-4 cannot predict group disagreements; Appendix D describes three discarded approaches that did not work; Appendix C reports few-shot learning led to reduced correlation."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims GPT-4 achieves r = 0.59 (supported by Table 1), that larger datasets are needed to resolve demographic disparities (supported by RQ2 analysis showing wide CIs), that there is substantial idiosyncratic variation (supported by within-group SD analysis), and that GPT-4 cannot predict group disagreements (supported by RQ3 null results). All claims are directly supported."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper is careful to make correlational rather than causal claims. Statements like 'GPT-4 achieves a Pearson correlation of r = 0.59' and 'there is no evidence that GPT-4 can identify conversations...' are appropriately framed. The qualitative analysis identifies patterns but does not make causal claims about why disagreements occur."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The Limitations section explicitly bounds generalization: safety ratings depend on annotation guidelines and prompt design; the study uses only English conversations; the safety definition inherits DICES's five categories; race/gender are U.S.-centric and coarse. The authors note 'the results we observe here would change with different annotator guidelines or prompts.'"
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper discusses multiple alternative explanations: disagreements may reflect normative differences rather than LLM errors (Section 3, RQ1); absence of significant demographic differences may be due to insufficient power rather than true absence (RQ2); alignment variation within groups suggests factors beyond demographics matter (RQ2); prompt sensitivity could explain results (Limitations)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Exact model version IDs are provided in Appendix B: gpt-3.5-turbo-0125, gpt-4-0125-preview, gpt-4o-2024-05-13, google/gemini-1.5-pro-001, and meta/llama3-405b-instruct-maas."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Full prompts are provided in Figure S1 (safety annotation prompt) and Figure S2 (disagreement prediction prompt). The actual text used is shown, including the analyze-rate structure and rating scale."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Temperature is reported as 0 for all models (Appendix B: 'All generations used a temperature of 0 to reduce non-determinism'). The binarization threshold is specified (>=3 for most models, >=2 for GPT-3.5)."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The study uses direct zero-shot prompting of LLMs via API calls."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Data preprocessing is documented: annotators with >80% disagreement with majority were removed (11 out of 123, leaving 112); binary aggregation of five safety criteria is described; the binarization threshold for Likert ratings is specified; Gemini's 16 blocked outputs are documented and handled by filling with middle value 3."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "A substantial dedicated 'Limitations' section is present (spanning approximately one full page), discussing demographic scope, prompt sensitivity, safety definition, language scope, and annotator characteristics."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Specific threats are discussed: the dataset includes only five coarse U.S.-centric race categories and a binary gender variable; DICES annotation guidelines were not released in full, so the prompt may not perfectly match annotator instructions; the study did not systematically explore how prompt definitions of safety impact ratings; annotator characteristics beyond demographics were not captured."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Specific scope boundaries are stated: results apply only to English conversations; the safety conceptualization inherits DICES's five categories and may not generalize to other definitions; the RQ2 null result does not disprove meaningful demographic differences; results would change with different annotator guidelines or prompts; characteristics beyond race and gender may be more salient."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The underlying DICES dataset is publicly available (Aroyo et al., 2023), enabling independent verification of the human annotation data. However, the LLM-generated annotations are not explicitly released as a separate dataset."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Data collection is described in detail in Section 2 and Appendix A: DICES-350 consists of 350 multi-turn conversations collected by crowdworkers interacting with LaMDA on sensitive topics, rated by 112 crowdworkers across five binary safety criteria. LLM annotations were generated via API calls with specified prompts and temperature settings."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The paper does not describe how the crowdworkers were recruited for DICES. It references the original DICES paper (Aroyo et al., 2023) for details, but does not provide recruitment methods itself. The potential bias introduced by crowdworker selection (e.g., platform used, selection criteria) is not discussed."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The data pipeline is documented: 123 annotators → 11 removed for extreme disagreement → 112 remaining; 350 conversations rated; annotations aggregated across five binary criteria; LLM annotations generated with specified prompts; Gemini's 16/350 blocked outputs handled with middle value. Validation set (DICES-990) vs. test set (DICES-350) separation is documented."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Funding is disclosed in the Acknowledgments: RM supported by NSF DGE #2139899; PWK supported by Singapore NRF and AI Visiting Professorship Programme; EP supported by Google Research Scholar award, NSF CAREER #2142419, CIFAR Azrieli scholarship, LinkedIn Research Award, and Abby Joseph Cohen Faculty Fund."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly stated: Rajiv Movva (Cornell Tech), Pang Wei Koh (University of Washington), Emma Pierson (Cornell Tech). These are academic affiliations with no direct conflicts regarding the evaluated products."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Funding sources (NSF, Singapore NRF, Google Research Scholar, CIFAR, LinkedIn) are generally independent of outcomes regarding GPT-4, Gemini, or Llama performance. The Google Research Scholar award to EP is a minor potential concern given that Gemini is a Google product, but Google is not the primary funder and the paper does not evaluate Gemini favorably over competitors."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This paper does not evaluate model capability on a benchmark. It studies annotation alignment between LLMs and humans on a subjective safety rating task. Training data contamination is not relevant because the task is subjective annotation, not factual knowledge recall."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Not applicable. The study evaluates subjective safety annotation rather than model knowledge on a benchmark. Whether the model has seen the DICES conversations during training does not invalidate the annotation alignment analysis in the same way benchmark contamination would."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Not applicable. This is not a benchmark evaluation study. The task is subjective safety annotation where contamination with the specific conversations would not necessarily bias the results in a meaningful way (the model is not being tested on factual recall)."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "The paper does not recruit new human participants. It uses the existing public DICES dataset. The crowdworkers were part of the original DICES study, not this paper's research."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No new human participants were recruited. The paper uses an existing public dataset (DICES). Ethics review would have been the responsibility of the original DICES study authors."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No new human participants were recruited. The DICES dataset demographics (race, gender, age group) are described as part of the dataset description, but these are properties of the existing dataset, not a new human study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No new human participants were recruited. The authors describe their filtering of the existing DICES annotators (removing 11 with >80% majority disagreement), but this is data preprocessing, not human subjects research."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No experimental study with human participants. The paper analyzes an existing dataset."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No experimental study with human participants requiring blinding."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No new human participants were recruited. The annotator filtering (123 → 112) is data preprocessing on an existing dataset."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Appendix B reports per-model costs for annotating 350 conversations: approximately $0.10 for GPT-3.5, $0.60 for GPT-4o, $2 for GPT-4, $1 for Gemini-1.5, and $4 for Llama-3.1. Total cost for all experiments including prompt engineering was $200."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "The total API spend is stated as $200 for all experiments (Appendix B). No GPU hours were needed as all computation was done via API calls."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "GPT-4 with analyze-rate prompt achieves Pearson correlation of r = 0.61 (Likert) and r = 0.59 (binary) with average annotator ratings, higher than the median human annotator's r = 0.51.",
    286       "evidence": "Table 1 reports correlations across 5 models and 2 prompt types. GPT-4 analyze-rate achieves the highest Likert correlation (0.61, significantly better than second-best in >95% of bootstraps). The binary correlation of 0.59 falls at the 81st percentile of human annotators (Section 3, RQ1).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "The DICES-350 dataset is underpowered to detect demographic differences in annotator-LLM alignment.",
    291       "evidence": "Figure 2 shows all true group-model correlations lie within their null 99% CIs. The CIs are wide (e.g., r ∈ [0.44, 0.64] for Latinx female group). Permutation tests with 5000 permutations and Bonferroni correction show no significant differences (Section 3, RQ2).",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "There is substantial idiosyncratic variation in alignment with GPT-4 within demographic groups, suggesting race and gender do not fully capture differences in alignment.",
    296       "evidence": "The standard deviation of individual annotator correlations with GPT-4 is 0.107 overall; within-group SDs range from 0.075 to 0.132, often comparable to or exceeding the overall SD (Section 3, RQ2).",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "GPT-4 cannot predict when one demographic group finds a conversation more unsafe than another.",
    301       "evidence": "Pearson correlations between GPT-4's disagreement Likert ratings and true group differences are near zero and not significant for any of the tested group pairs: (white, Black), (white, Asian), (white, Latinx). Additional checks in Appendix E confirm the null result (Section 3, RQ3).",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "There are systematic patterns in GPT-4-human disagreements: GPT-4 rates sensitive advice as safe while annotators rate it unsafe, and GPT-4 flags subtle bias/stereotyping that annotators rate as safe.",
    306       "evidence": "Qualitative analysis of 27 safe-by-GPT/unsafe-by-humans cases (17/27 are sensitive advice) and 21 unsafe-by-GPT/safe-by-humans cases (15/21 involve bias/stereotyping). Examples in Tables S2 and S3 (Section 3, RQ1).",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval", "observational"],
    311   "key_findings": "GPT-4 and Llama 3.1 405B achieve higher correlation with average human safety ratings than the median individual annotator (r = 0.59-0.61 vs. r = 0.51) on the DICES-350 dataset, though both still fall well below the agreement among groups of human annotators (r = 0.72 for 3 random humans, r = 0.95 for split-halves). The dataset lacks statistical power to detect demographic differences in LLM alignment, and within-group variation in alignment often exceeds between-group variation. GPT-4 cannot predict when different demographic groups disagree about safety, suggesting current models do not capture pluralistic perspectives on harm.",
    312   "red_flags": [
    313     {
    314       "flag": "No code or LLM annotation data released",
    315       "detail": "Despite being a computational study with relatively simple methodology, no code repository or generated LLM annotation data is released, limiting reproducibility."
    316     },
    317     {
    318       "flag": "Potential contamination not discussed",
    319       "detail": "While contamination is less directly relevant for subjective annotation tasks than for benchmark evaluations, the models may have been trained on DICES conversations or similar content, which could influence their safety ratings. This is not discussed."
    320     }
    321   ],
    322   "cited_papers": [
    323     {
    324       "title": "DICES Dataset: Diversity in Conversational AI Evaluation for Safety",
    325       "authors": ["Lora Aroyo", "Alex S. Taylor", "Mark Diaz", "Christopher M. Homan", "Alicia Parrish", "Greg Serapio-Garcia", "Vinodkumar Prabhakaran", "Ding Wang"],
    326       "year": 2023,
    327       "relevance": "Core dataset used for LLM safety annotation alignment study; key resource for evaluating diverse human perspectives on AI safety."
    328     },
    329     {
    330       "title": "Constitutional AI: Harmlessness from AI Feedback",
    331       "authors": ["Yuntao Bai", "Saurav Kadavath", "Sandipan Kundu"],
    332       "year": 2022,
    333       "relevance": "Foundational work on training AI with AI feedback for safety, directly relevant to LLM alignment methodology."
    334     },
    335     {
    336       "title": "Safety-Tuned LLaMAs: Lessons From Improving the Safety of Large Language Models that Follow Instructions",
    337       "authors": ["Federico Bianchi", "Mirac Suzgun", "Giuseppe Attanasio"],
    338       "year": 2024,
    339       "relevance": "Evaluation of LLM safety via adversarial prompts, complementary approach to annotation alignment studied in this paper."
    340     },
    341     {
    342       "title": "HarmBench: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal",
    343       "authors": ["Mantas Mazeika", "Long Phan", "Xuwang Yin"],
    344       "year": 2024,
    345       "relevance": "Standardized safety benchmark for evaluating LLM robustness against adversarial attacks."
    346     },
    347     {
    348       "title": "Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference",
    349       "authors": ["Wei-Lin Chiang", "Lianmin Zheng", "Ying Sheng"],
    350       "year": 2024,
    351       "arxiv_id": "2403.04132",
    352       "relevance": "Major LLM evaluation platform using human preferences; used as reference for model selection in this study."
    353     },
    354     {
    355       "title": "Can Large Language Models Be an Alternative to Human Evaluations?",
    356       "authors": ["Cheng-Han Chiang", "Hung-yi Lee"],
    357       "year": 2023,
    358       "relevance": "Directly relevant study on using LLMs as substitutes for human evaluation in NLP tasks."
    359     },
    360     {
    361       "title": "With Little Power Comes Great Responsibility",
    362       "authors": ["Dallas Card", "Peter Henderson", "Urvashi Khandelwal"],
    363       "year": 2020,
    364       "relevance": "Influential work on statistical power in NLP experiments, cited to support the underpowered finding in RQ2."
    365     },
    366     {
    367       "title": "A Roadmap to Pluralistic Alignment",
    368       "authors": ["Taylor Sorensen", "Jared Moore", "Jillian Fisher"],
    369       "year": 2024,
    370       "relevance": "Framework for pluralistic LLM alignment capturing diverse viewpoints, directly relevant to the survey's scope on alignment methodology."
    371     },
    372     {
    373       "title": "Large language models cannot replace human participants because they cannot portray identity groups",
    374       "authors": ["Angelina Wang", "Jamie Morgenstern", "John P. Dickerson"],
    375       "year": 2024,
    376       "relevance": "Empirical study on LLM limitations in representing diverse perspectives, directly relevant to annotation alignment findings."
    377     },
    378     {
    379       "title": "UltraFeedback: Boosting Language Models with High-quality Feedback",
    380       "authors": ["Ganqu Cui", "Lifan Yuan", "Ning Ding"],
    381       "year": 2023,
    382       "relevance": "Study on using LLM annotations for model training, relevant to the broader context of LLM-as-annotator methodology."
    383     },
    384     {
    385       "title": "The PRISM Alignment Project: What Participatory, Representative and Individualised Human Feedback Reveals About the Subjective and Multicultural Alignment of Large Language Models",
    386       "authors": ["Hannah Rose Kirk", "Alexander Whitefield", "Paul Röttger"],
    387       "year": 2024,
    388       "relevance": "Large-scale alignment study with individualized human feedback, complementary to the demographic alignment analysis in this paper."
    389     },
    390     {
    391       "title": "RTP-LX: Can LLMs Evaluate Toxicity in Multilingual Scenarios?",
    392       "authors": ["Adrian de Wynter", "Ishaan Watts", "Nektar Ege Altıntoprak"],
    393       "year": 2024,
    394       "relevance": "Study on LLM safety evaluation in multilingual settings, extending the monolingual English scope of the current paper."
    395     }
    396   ]
    397 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs