ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25600B)


      1 {
      2   "paper": {
      3     "title": "BiasAlert: A Plug-and-play Tool for Social Bias Detection in LLMs",
      4     "authors": [
      5       "Zhiting Fan",
      6       "Ruizhe Chen",
      7       "Ruiling Xu",
      8       "Zuozhu Liu"
      9     ],
     10     "year": 2024,
     11     "venue": "Conference on Empirical Methods in Natural Language Processing",
     12     "arxiv_id": "2407.10241",
     13     "doi": "10.48550/arXiv.2407.10241"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "The abstract states 'Model and code will be publicly released' — this is a promise of future release, not an actual release. No repository URL or archive link is provided anywhere in the paper."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The paper uses publicly available datasets: RedditBias, Crows-pairs, BOLD, BeaverTails, and SBIC. The retrieval database is constructed from the publicly available SBIC dataset (Sap et al., 2019). The evaluation datasets are standard public benchmarks."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "Appendix C.1 mentions '8 RTX 3090 GPUs, each with 24 GB of memory' and some training hyperparameters, but no requirements.txt, Dockerfile, conda environment, or specific library versions are provided. This is insufficient to recreate the environment."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No step-by-step reproduction instructions, README with commands, or scripts are provided. The paper describes the method but does not include specific instructions a researcher could follow to replicate the experiments."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Table 3 (bias mitigation results) reports values with ± notation (e.g., '0.125±0.000', '0.033±0.014'). Table 5 also reports human annotation consistency with ± notation. However, the main results in Table 1 do not include error bars or confidence intervals."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper claims BiasAlert 'significantly outperforms' baselines but provides no statistical significance tests (no p-values, t-tests, or other tests). Claims of difference are based solely on comparing numbers in tables."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper reports raw accuracy and F1 scores but does not report formal effect sizes (Cohen's d, odds ratios, etc.). While absolute numbers and baselines are provided, no standardized effect size measures are used."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No justification is given for sample sizes. The paper uses 30% of RedditBias for evaluation and 40 prompts for application studies without explaining why these numbers were chosen or discussing power analysis."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Appendix C.1 states 'Reported results are means over three runs.' Tables 3 and 5 report ± values indicating variance across runs. However, the main results in Table 1 do not include variance measures, which is a partial gap."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Section 3.1 lists 8 baselines across two categories: Bias Detection APIs (Azure Content Safety, OpenAI Moderation, Llama-Guard) and LLMs-as-Judges (Llama-2-chat 7B/13B, Gemma-it 7B, GPT-3.5, GPT-4 Turbo). Results are compared in Table 1."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Baselines include GPT-4 Turbo (2023), Llama-Guard (2023), Gemma-it 7B (2024), and Llama-3-Instruct 8B (2024, in the appendix). These are contemporary and competitive for a 2024 paper."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Section 3.3 and Table 2 present an ablation study evaluating the individual contributions of Retrieval (RE), Step-by-step Instruction (CoT), and In-context Demonstration (Demo) on both Llama-2-7b-chat and GPT-4."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The paper uses five metrics: Accuracy, F1 Score, Classification Score, Attribution Score, and Overall Score (Section 3.1 and Appendix C.1)."
     85       },
     86       "human_evaluation": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Section 4.1 reports human validation consistency exceeding 92% for both text completion and QA tasks. Section E.3 describes the crowdsourcer annotation process with 3 undergraduate students."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Section 3.1 states 'We randomly select 30% of RedditBias as the evaluating dataset. These data do not overlap with the training dataset to ensure fair comparisons.' Additionally, Crows-pairs is used as a separate evaluation dataset not used in training."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Figure 4 and Appendix C.2 provide per-bias-type breakdowns (gender, race, orientation, religion) showing detection accuracy across categories for all models."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "The paper does not discuss specific failure cases or show qualitative examples of where BiasAlert fails. The Limitations section mentions general issues (outdated retrieval database, implicit bias detection) but no concrete failure analysis."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": false,
    109         "justification": "Every experiment shows improvement. The ablation study shows monotonic gains with each component added. No configurations that failed or approaches that were tried and abandoned are discussed."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract claims BiasAlert 'significantly outperforms existing state-of-the-art methods like GPT4-as-A-Judge in detecting bias' — Table 1 shows BiasAlert achieves 0.84 Acc vs GPT-4's 0.61 on RedditBias. The utility claims are supported by application studies in Section 4."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The ablation study (Table 2) makes causal claims about individual components (retrieval, CoT, demonstration) through controlled single-variable manipulation, which is adequate for these claims. Each component is added incrementally while holding others constant."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper's title claims a 'Plug-and-play Tool for Social Bias Detection in LLMs' broadly, but experiments are limited to RedditBias and Crows-pairs datasets, which focus on specific English-language bias types. The Limitations section acknowledges 'simulated datasets with preliminary results' but the title and abstract do not bound the claims to these specific datasets and bias types."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper does not discuss alternative explanations for why BiasAlert outperforms baselines. For example, it does not consider whether overlap between the fine-tuning data and the evaluation data could inflate results, or whether the retrieval component simply adds training-set memorization rather than genuine bias detection capability."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper uses 'GPT-4 Turbo', 'GPT-3.5', 'Llama-2-7b-chat', 'Gemma-7b-it' without specifying exact API versions or snapshot dates. 'GPT-4 Turbo' and 'GPT-3.5' are marketing names without version identifiers (e.g., no 'gpt-4-1106-preview' or API date)."
    139       },
    140       "prompts_provided": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Table 6 in the appendix provides the full instruction template including task definition, step-by-step instructions, in-context demonstration with examples, and output format template. The prompt components are detailed enough to reconstruct the prompts used."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Appendix C.1 reports: batch size 16, AdamW optimizer, learning rate 5e-5, weight decay 0.05, 10 epochs, LoRA rank 16 on all linear modules. For retrieval, K references are mentioned. Training hardware is also specified."
    149       },
    150       "scaffolding_described": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The paper describes the full pipeline in Section 2 and Figure 2: Contriever-MSMARCO retrieval of top-K references from the bias database, augmentation of the generation with references, and instruction-following detection. The scaffolding architecture is clear."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Appendix B.1 describes constructing the retrieval database from SBIC: selecting samples with group bias annotations, extracting biased statements, categorizing by bias type, resulting in 41,000 entries across 7 bias types (Table 4). The instruction-following dataset construction from RedditBias is described in Sections 2.2 and 3.1."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "There is a dedicated 'Limitations' section after the Conclusion that discusses three specific limitations: use of simulated datasets, outdated retrieval database, and lack of retrieval quality assessment."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The Limitations section discusses specific threats: 'our application study is conducted on simulated datasets with preliminary results, as there is still a lack of benchmarks for open-text bias evaluation,' 'the retrieval database based on SBIC is outdated,' and 'the employed retriever cannot capture the relevance between expressions of implicit bias.' These are specific to this study."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "The paper does not explicitly state what BiasAlert does NOT cover. It does not bound its claims to specific languages, cultures, or bias types. The Limitations section mentions issues but does not explicitly state scope boundaries like 'our results apply only to English-language bias detection' or 'we do not claim effectiveness for implicit bias.'"
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The evaluation datasets (RedditBias, Crows-pairs) are publicly available, but the specific train/test splits used, the constructed retrieval database, the instruction-following fine-tuning dataset, and the model predictions are not released for independent verification."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Appendix B.1 describes the retrieval database construction from SBIC with specific numbers (150k posts, 34k biased, 1k social groups, resulting in 41,000 entries across 7 bias types). Section 3.1 and Appendix B.2 describe the instruction-following dataset construction from RedditBias."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "Section E.3 mentions '3 undergraduate students who possess good English proficiency and strong ethical principles as crowdsourcers' but does not describe how they were recruited or whether this introduces selection bias. For the application studies using crowdsourcers for validation, no recruitment details beyond this basic description are provided."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The pipeline is documented: SBIC data selection and standardization (Appendix B.1), RedditBias split into training and 30% evaluation (Section 3.1), retrieval database construction with 7 bias types and 41,000 entries (Table 4), and the instruction-following dataset construction (Appendix B.2)."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "No funding information is disclosed anywhere in the paper. There is no acknowledgments section listing grants, corporate sponsors, or funding agencies."
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "All authors are affiliated with Zhejiang University, which is disclosed on the first page. The paper evaluates third-party models (GPT-4, Llama) rather than models from their own institution."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "Since funding is not disclosed, it is impossible to assess whether the funder is independent of the outcome. The absence of funding disclosure means this criterion cannot be satisfied."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "There is no competing interests statement or financial disclosure in the paper. Absence of disclosure is not the same as absence of conflict."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "The paper uses GPT-4 Turbo and GPT-3.5 as baselines and evaluates them on RedditBias and Crows-pairs, which are publicly available benchmarks. No training data cutoff dates are stated for any of the models used."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "RedditBias (2021) and Crows-pairs (2020) were published before the training cutoffs of GPT-4 Turbo and other models used as baselines. The paper does not discuss whether these benchmarks may appear in the training data of the evaluated models."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "Both RedditBias (2021) and Crows-pairs (2020) were available online well before the training cutoffs of GPT-4 and other models. The paper does not address this contamination risk, which could affect the baseline comparisons."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "The human crowdsourcers are used for annotation validation, not as study participants in a human subjects study. This is a benchmark evaluation paper with human annotation for validation purposes."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "The crowdsourcers are performing annotation tasks, not participating as subjects of a human study. The 'Potential Risks' section mentions informed consent for annotators regarding offensive content, but this is not a human subjects study requiring IRB approval."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "The paper uses crowdsourcers for annotation validation, not as study participants. Section E.3 describes annotators as '3 undergraduate students' but this is an annotation task, not a human subjects study."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human subjects study is conducted. The crowdsourcers are annotators performing validation tasks, not study participants."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human subjects experiment is conducted that would require randomization."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human subjects experiment is conducted that would require blinding."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human subjects study is conducted that would have participant attrition."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": true,
    279         "answer": true,
    280         "justification": "Table 3 reports the average time cost for BiasAlert to process one generation: 1.27-1.74 seconds when deployed on 2 RTX 3090 GPUs. Section 4.2 explicitly discusses this: 'BiasAlert takes an average of 1.4 seconds to monitor a single generation.'"
    281       },
    282       "compute_budget_stated": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "The paper mentions training on 8 RTX 3090 GPUs but does not state total training time, GPU hours, or total computational budget for the experiments."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "BiasAlert significantly outperforms existing state-of-the-art methods including GPT-4-as-A-Judge in detecting bias.",
    292       "evidence": "Table 1 shows BiasAlert achieves 0.84 Acc and 0.82 F1 on RedditBias, vs. GPT-4's 0.61 Acc and 0.59 F1. On Crows-pairs, BiasAlert achieves 0.70 Acc vs. GPT-4's 0.43 Acc.",
    293       "supported": "moderate"
    294     },
    295     {
    296       "claim": "External retrieval of social bias knowledge is necessary for accurate and reliable bias detection.",
    297       "evidence": "Table 2 ablation study shows adding retrieval (RE) improves Overall Score from 0.01 to 0.19 for Llama-2-7b-chat base and from 0.21 to 0.40 for GPT-4.",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "BiasAlert achieves over 92% human validation consistency in bias evaluation tasks.",
    302       "evidence": "Table 5 and Section 4.1 report human annotation consistency above 0.92 for both text completion and question-answering tasks across 9 LLMs, using 40 sampled completions per LLM validated by 3 crowdsourcers.",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "BiasAlert can significantly reduce the proportion of biased generation when deployed for bias mitigation.",
    307       "evidence": "Table 3 shows bias reduction across 8 LLMs, e.g., Alpaca-7b from 0.283 to 0.042, OPT-6.7b from 0.167 to 0.042. Results are reported with ± values over multiple runs.",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "Step-by-step instructions enhance reasoning capabilities for bias classification and attribution.",
    312       "evidence": "Table 2 ablation shows adding CoT improves Classification Score from 0.64 to 0.76 and Attribution Score from 0.58 to 0.71 for the base model.",
    313       "supported": "strong"
    314     }
    315   ],
    316   "methodology_tags": [
    317     "benchmark-eval"
    318   ],
    319   "key_findings": "BiasAlert, a retrieval-augmented fine-tuned Llama-2-7b-chat model, outperforms GPT-4 and dedicated safety APIs at detecting social bias in open-text LLM generations, achieving 0.84 accuracy vs. 0.61 for GPT-4 on RedditBias. The ablation study demonstrates that external bias knowledge retrieval is the most critical component, improving the overall score from 0.01 to 0.19 for the base model. Application studies show BiasAlert can both evaluate bias across 9 LLMs and mitigate bias during deployment with an average inference time of 1.4 seconds per generation, achieving over 92% consistency with human annotations.",
    320   "red_flags": [
    321     {
    322       "flag": "No significance tests for main claims",
    323       "detail": "The paper repeatedly claims 'significant' outperformance but provides no statistical significance tests. All comparisons are based on raw number differences in tables without p-values, confidence intervals on main results, or any formal testing."
    324     },
    325     {
    326       "flag": "Promised but unreleased code",
    327       "detail": "The abstract states 'Model and code will be publicly released' but no URL is provided. Future release promises reduce reproducibility and independent verification."
    328     },
    329     {
    330       "flag": "Very small human validation sample",
    331       "detail": "Human validation uses only 40 samples per LLM evaluated by 3 undergraduate students. This is a small sample for validating a tool claimed to be reliable for bias detection. The annotators' demographics and potential biases are not discussed."
    332     },
    333     {
    334       "flag": "Benchmark contamination risk for baselines",
    335       "detail": "RedditBias (2021) and Crows-pairs (2020) were published before the training cutoffs of GPT-4 and other baseline models. Higher baseline performance due to contamination would make BiasAlert's improvements appear smaller, but the paper does not address this issue."
    336     },
    337     {
    338       "flag": "Training data overlap with evaluation",
    339       "detail": "BiasAlert is trained on 70% of RedditBias and tested on the remaining 30%. While this is a standard split, the paper does not discuss whether the retrieval database (built from SBIC) has topical overlap with the evaluation data that could inflate results."
    340     },
    341     {
    342       "flag": "No model version specificity",
    343       "detail": "GPT-4 Turbo and GPT-3.5 are referenced by marketing names without API versions or snapshot dates. Results could vary significantly across model versions."
    344     }
    345   ],
    346   "cited_papers": [
    347     {
    348       "title": "Llama Guard: LLM-based Input-Output Safeguard for Human-AI Conversations",
    349       "authors": ["Hakan Inan", "Kartikeya Upasani"],
    350       "year": 2023,
    351       "arxiv_id": "2312.06674",
    352       "relevance": "LLM-based safety tool that serves as a baseline for evaluating bias detection approaches."
    353     },
    354     {
    355       "title": "DecodingTrust: A Comprehensive Assessment of Trustworthiness in GPT Models",
    356       "authors": ["Boxin Wang", "Weixin Chen"],
    357       "year": 2023,
    358       "relevance": "Comprehensive LLM trustworthiness evaluation framework relevant to AI safety and bias evaluation."
    359     },
    360     {
    361       "title": "Bias and Fairness in Large Language Models: A Survey",
    362       "authors": ["Isabel O Gallegos", "Ryan A Rossi"],
    363       "year": 2024,
    364       "arxiv_id": "2309.00770",
    365       "relevance": "Survey paper on bias and fairness in LLMs, directly relevant to the survey scope on LLM evaluation methodology."
    366     },
    367     {
    368       "title": "Holistic Evaluation of Language Models",
    369       "authors": ["Percy Liang", "Rishi Bommasani"],
    370       "year": 2022,
    371       "arxiv_id": "2211.09110",
    372       "relevance": "Comprehensive LLM evaluation benchmark (HELM) that establishes methodology for holistic model assessment."
    373     },
    374     {
    375       "title": "BiasAsker: Measuring the Bias in Conversational AI System",
    376       "authors": ["Yuxuan Wan", "Wenxuan Wang"],
    377       "year": 2023,
    378       "relevance": "Tool for measuring conversational AI bias using generated text approaches, relevant to LLM evaluation methodology."
    379     },
    380     {
    381       "title": "Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection",
    382       "authors": ["Akari Asai", "Zeqiu Wu"],
    383       "year": 2023,
    384       "arxiv_id": "2310.11511",
    385       "relevance": "Retrieval-augmented generation approach that underpins the kind of knowledge-augmented LLM systems like BiasAlert."
    386     },
    387     {
    388       "title": "JailbreakBench: An Open Robustness Benchmark for Jailbreaking Large Language Models",
    389       "authors": ["Patrick Chao", "Edoardo Debenedetti"],
    390       "year": 2024,
    391       "arxiv_id": "2404.01318",
    392       "relevance": "Benchmark for evaluating LLM safety against adversarial attacks, relevant to AI safety evaluation."
    393     },
    394     {
    395       "title": "BeaverTails: Towards Improved Safety Alignment of LLM via a Human-Preference Dataset",
    396       "authors": ["Jiaming Ji", "Mickel Liu"],
    397       "year": 2023,
    398       "arxiv_id": "2307.04657",
    399       "relevance": "Safety alignment dataset used for evaluating LLM safety, relevant to LLM evaluation benchmarks."
    400     },
    401     {
    402       "title": "Large Language Model Bias Mitigation from the Perspective of Knowledge Editing",
    403       "authors": ["Ruizhe Chen", "Yichen Li"],
    404       "year": 2024,
    405       "arxiv_id": "2405.09341",
    406       "relevance": "Addresses LLM bias mitigation through knowledge editing, relevant to AI safety and bias reduction methods."
    407     },
    408     {
    409       "title": "GPT-4 Technical Report",
    410       "authors": ["Josh Achiam", "Steven Adler"],
    411       "year": 2023,
    412       "arxiv_id": "2303.08774",
    413       "relevance": "Technical report for GPT-4, a key model used in LLM capability evaluation across the survey."
    414     }
    415   ]
    416 }

Impressum · Datenschutz