scan.json (26148B)
1 { 2 "paper": { 3 "title": "Bias Testing and Mitigation in LLM-based Code Generation", 4 "authors": ["Dong Huang", "Jie M. Zhang", "Qingwen Bu", "Xiaofei Xie", "Junjie Chen", "Heming Cui"], 5 "year": 2023, 6 "venue": "arXiv preprint (cs.SE)", 7 "arxiv_id": "2309.14345", 8 "doi": "10.48550/arXiv.2309.14345" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": true, 15 "justification": "The paper states 'We release our dataset and source code in https://github.com/huangd1999/CBS' (Section 8, Conclusion). A GitHub URL is provided." 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "The paper states that all prompts, generated scenarios, and full manual analysis results are available on their homepage/GitHub repo (Sections 3.5 and 8). The prompt datasets are from publicly available Kaggle datasets (Adult Income, Employee, Health Insurance) and the generated prompt pool is released." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": false, 25 "justification": "The paper mentions 'Ubuntu 18.04.6 LTS' and 'four NVIDIA GeForce RTX 3090 Ti graphics cards' (Section 4.1) but does not provide a requirements.txt, Dockerfile, or detailed library version information sufficient to recreate the environment." 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": false, 30 "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but the paper itself contains no README-style commands or a 'Reproducing Results' section." 31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": true, 36 "answer": false, 37 "justification": "Results are reported as point estimates (CBS percentages). No confidence intervals or error bars are provided for any of the main results in Tables 4, 7, or 8." 
38 }, 39 "significance_tests": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper makes numerous comparative claims (e.g., GPT-4-turbo has higher bias than GPT-3.5-turbo, mitigation strategies reduce bias) but no statistical significance tests (p-values, t-tests, etc.) are reported." 43 }, 44 "effect_sizes_reported": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper reports CBS scores with baseline context throughout, e.g., 'the overall CBS of GPT-4 decreases from 59.88% to 36.23% for zero-shot prompting' and 'from 59.88% to 4.79%' (Section 4.4). These provide enough context to assess the magnitude of effects." 48 }, 49 "sample_size_justified": { 50 "applies": true, 51 "answer": false, 52 "justification": "The study uses 334 prompts across three tasks. No justification is provided for why 334 prompts is sufficient, nor is any power analysis discussed." 53 }, 54 "variance_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "While the paper runs experiments 5 times and reports averages (Section 4.4), no standard deviation, variance, or spread measure is reported across these runs. CBS_U@5 and CBS_I@5 capture union/intersection across runs but are not variance measures." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": true, 64 "justification": "The paper includes comparison against the prior work of Liu et al. [7] which focused on code completion bias. Additionally, the 'original' (no mitigation) CBS serves as the baseline for all mitigation experiments in Tables 7 and 8." 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": true, 69 "justification": "Liu et al. [7] (2023) is the most directly related prior work and is contemporary. The paper also evaluates five recent LLMs (PaLM-2, Claude-instant-1, GPT-3.5-turbo, GPT-4-turbo, GPT-4), which were state-of-the-art at time of writing." 
70 }, 71 "ablation_study": { 72 "applies": true, 73 "answer": true, 74 "justification": "The paper systematically varies components: five different prompting strategies (zero-shot, one-shot, few-shot, CoT1, CoT2) and two scenarios (with/without test feedback). This effectively serves as an ablation of the mitigation approach components. Section 5.3 also tests sensitivity to prompt phrasing." 75 }, 76 "multiple_metrics": { 77 "applies": true, 78 "answer": true, 79 "justification": "Three metrics are used: CBS, CBS_U@K, and CBS_I@K (Section 3.4). Additionally, precision, recall, and FPR are reported for the testing framework reliability (Section 4.3)." 80 }, 81 "human_evaluation": { 82 "applies": true, 83 "answer": true, 84 "justification": "Human evaluation is included for cases where AST cannot process the code. Two human participants independently review code for bias, with a third senior expert resolving disagreements. Cohen's Kappa > 0.9 is reported (Section 3.6, footnote 8)." 85 }, 86 "held_out_test_set": { 87 "applies": false, 88 "answer": false, 89 "justification": "This paper does not involve train/test splits in the traditional sense. It generates prompts and evaluates LLM outputs for bias; there is no model training or tuning on a dev set." 90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": true, 94 "justification": "Results are broken down by six bias attributes (age, region, gender, education, occupation, race), by three task types (adult income, employment, health insurance), and by each of the five LLMs. Tables 4, 6, 7, and 8 all provide detailed per-category breakdowns." 95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": true, 99 "justification": "Section 5.4 discusses false negatives in bias detection (e.g., value pool not covering ages > 65). Section 4.4.3 analyzes why Scenario 1 mitigation has limited effectiveness. 
The paper also discusses cases where prompting strategies increase bias (e.g., few-shot on GPT-4 increases CBS from 59.88% to 68.56%)." 100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": true, 104 "justification": "The paper reports that direct prompt engineering strategies sometimes increase bias rather than reducing it (e.g., GPT-4 few-shot CBS increases from 59.88% to 68.56%, Table 7). It also reports that LLMs cannot reliably self-detect their own biases (Table 9, GPT-3.5-turbo detects only 18.84% of age bias)." 105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The abstract claims 13.47% to 49.10% gender bias, which matches Table 4. The claim that feedback reduces GPT-4 bias from 59.88% to 4.79% is supported by Table 8. All abstract claims have corresponding evidence in the results sections." 112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": true, 116 "justification": "The paper's main causal claims are about the effectiveness of mitigation strategies. These are supported by controlled experiments where the only variable changed is the prompt strategy or feedback presence. Section 4.4.3 provides additional causal analysis of why Scenario 1 is less effective (LLMs' inability to self-detect bias, Table 9)." 117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": true, 121 "justification": "The paper explicitly states 'although the tasks we chose are widely studied, realistic, and critical, they could not cover all the bias-sensitive scenarios' (Section 3.2). Section 6.2 (External Validity) discusses limitations of generalizability. The paper bounds findings to three specific task domains and five specific models." 
122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": true, 126 "justification": "Section 4.4.3 investigates whether Scenario 1 improvements are due to prompt changes rather than actual bias mitigation, concluding LLMs' inability to self-detect bias is the key factor. Section 5.3 explores whether prompt phrasing variation explains CBS differences. Section 5.1 discusses training data as an alternative explanation for the lack of fairness-performance trade-off." 127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper lists 'PaLM-2-CodeChat-bison', 'Claude-instant-1', 'GPT-3.5-turbo', 'GPT-4-turbo', and 'GPT-4' (Section 4.1). These are marketing names without specific version snapshots or API dates. For instance, no snapshot date like 'gpt-4-0613' is specified. The paper mentions accessing GPT-4 on '12-11-2023' in the introduction but does not specify which API version was used for the systematic experiments." 134 }, 135 "prompts_provided": { 136 "applies": true, 137 "answer": true, 138 "justification": "Table 3 provides the full mitigation prompt templates. The code generation prompt template structure is shown in Section 3.5 and Fig. 3 with the actual template text. The paper states full prompts are on the GitHub repo. The prompts shown in the paper contain the actual text used." 139 }, 140 "hyperparameters_reported": { 141 "applies": true, 142 "answer": true, 143 "justification": "Temperature is set to 1.0 for the main experiments (Section 4.1, Section 5.6). Section 5.6 further evaluates the effect of different temperature settings (0.0, 0.2, 0.4, 0.6, 0.8, 1.0). K=5 for multiple runs is also specified." 144 }, 145 "scaffolding_described": { 146 "applies": false, 147 "answer": false, 148 "justification": "The paper does not use agentic scaffolding. 
It uses direct LLM API calls for code generation with prompts, followed by AST-based automated testing. There is no agentic loop, tool use, or scaffolding." 149 }, 150 "data_preprocessing_documented": { 151 "applies": true, 152 "answer": true, 153 "justification": "Section 3.5 and Table 2 document the full prompt filtering pipeline: 1000 original prompts per task, filtered through three stages (duplicate removal via SentenceTransformer with 0.8 threshold, bias-inducing prompt removal, unrelated prompt removal), with counts at each stage (e.g., 1000 -> 151 -> 111 -> 93 for Adult Income)." 154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 6 'Threats to Validity' contains three subsections: Internal Validity (6.1), External Validity (6.2), and Construct Validity (6.3), providing substantive discussion." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": true, 165 "justification": "The threats are specific to this study: subjective bias in prompt design (6.1), model randomness addressed by 5-run averaging (6.1), dataset representativeness limited to three task types (6.2), and AST limitations for runtime-error code requiring manual review (6.3)." 166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section 3.2 explicitly states: 'although the tasks we chose are widely studied, realistic, and critical, they could not cover all the bias-sensitive scenarios where LLM-generated code can be applied.' Section 8 lists specific future expansions (more models, attributes, scenarios). The paper also notes it does not evaluate open-source models due to low code generation effectiveness (Section 4.1)." 
171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": true, 176 "answer": true, 177 "justification": "The paper releases the full dataset and source code on GitHub (https://github.com/huangd1999/CBS, Section 8). Generated scenarios, prompts, and full manual analysis results are stated to be available on the homepage." 178 }, 179 "data_collection_described": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 3.5 describes prompt generation in detail: GPT-4 generates 1000 scenarios per template, then three filtering stages are applied. Section 3.6 describes how bias testing is conducted via AST parsing and test case generation. Section 4.1 describes the experimental setup." 183 }, 184 "recruitment_methods_described": { 185 "applies": true, 186 "answer": false, 187 "justification": "The paper involves two human participants for manual bias evaluation (Section 3.6), but does not describe how they were recruited, their qualifications beyond being 'human participants' and a 'senior expert,' or potential selection bias." 188 }, 189 "data_pipeline_documented": { 190 "applies": true, 191 "answer": true, 192 "justification": "The full pipeline is documented in Fig. 3 and Sections 3.1-3.7: prompt construction -> filtering (Table 2 with counts at each stage) -> code generation -> AST extraction -> test case generation -> bias score calculation. Each step is explained with counts and criteria." 
193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section 9 (Acknowledgements) lists funding: 'National Key R&D Program of China (2022ZD0160201), HK RGC RIF (R7030-22), a Huawei Flagship Research Grant in 2023, HK RGC GRF (Ref: 17208223 & 17204424), and the HKU-CAS Joint Laboratory for Intelligent System Software.'" 200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "Author affiliations are clearly listed: The University of Hong Kong, King's College London, Shanghai Jiao Tong University, Singapore Management University, and Tianjin University. None of the authors appear affiliated with the companies whose models are evaluated (Google, Anthropic, OpenAI)." 205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": true, 209 "justification": "Funding comes from Chinese government R&D programs, Hong Kong Research Grants Council, Huawei, and HKU-CAS Joint Laboratory. While Huawei is a tech company, it does not have a direct stake in the bias evaluation outcomes of PaLM-2, Claude, or GPT models. The funders appear independent of the specific outcomes." 210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": false, 214 "justification": "No competing interests statement or financial interests declaration is present in the paper." 215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": false, 220 "answer": false, 221 "justification": "This paper does not evaluate LLM capability on a benchmark. It evaluates whether LLMs generate biased code for novel, custom-constructed prompts. The bias testing framework generates its own test cases. Contamination of training data with the test prompts is not a concern since the prompts were freshly generated." 
222 }, 223 "train_test_overlap_discussed": { 224 "applies": false, 225 "answer": false, 226 "justification": "Same as above: the paper does not evaluate pre-trained model knowledge on any existing benchmark. The prompts and test cases are newly constructed for this study." 227 }, 228 "benchmark_contamination_addressed": { 229 "applies": false, 230 "answer": false, 231 "justification": "Same as above: the paper creates its own prompts and evaluation framework rather than using existing benchmarks that could be contaminated." 232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": false, 237 "answer": false, 238 "justification": "The human involvement is limited to manual code review for bias classification by researchers/annotators (Section 3.6), not a human subjects study. The humans are not study participants; they are evaluators." 239 }, 240 "irb_or_ethics_approval": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants are studied. The two human reviewers serve as annotators/evaluators of code bias, not as research subjects." 244 }, 245 "demographics_reported": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants are studied. The annotators are not research subjects." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants are studied." 254 }, 255 "randomization_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants are studied." 259 }, 260 "blinding_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants are studied." 264 }, 265 "attrition_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants are studied." 
269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": true, 274 "answer": true, 275 "justification": "Section 5.7 and Table 16 report average token usage (input + output) for each LLM across all mitigation scenarios, ranging from ~509 to ~1064 tokens per task. Section 5.4 reports testing time (57.16s original vs. 3958.84s enriched value pool, Table 14)." 276 }, 277 "compute_budget_stated": { 278 "applies": true, 279 "answer": false, 280 "justification": "While token usage per task is reported, the total computational budget (total API spend, total GPU hours, aggregate cost) is not stated. The paper mentions hardware (4x RTX 3090 Ti) but not how long it was used or total API costs." 281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "Biases are prevalent in LLM-generated code: 13.47% to 49.10% of code generated by the tested LLMs exhibits gender bias.", 287 "evidence": "Table 4 shows CBS for gender: PALM-2 13.47%, Claude-instant-1 49.10%, GPT-3.5-turbo 23.35%, GPT-4-turbo 34.13%, GPT-4 38.92%.", 288 "supported": "strong" 289 }, 290 { 291 "claim": "Larger language models do not necessarily produce less biased code (e.g., GPT-4 has higher age bias than GPT-3.5-turbo).", 292 "evidence": "Table 4: GPT-4 age CBS is 39.52% vs GPT-3.5-turbo's 23.95%. GPT-4-turbo is even higher at 52.10%. Section 5.1 and Table 10 further show no trade-off between performance and bias.", 293 "supported": "strong" 294 }, 295 { 296 "claim": "The automated bias testing framework achieves 100% precision in detecting code bias.", 297 "evidence": "Table 5 shows the confusion matrix for PALM-2-CodeChat-bison: 0 FP, 141 TP, resulting in 100% precision. 
Section 4.3.1.", 298 "supported": "moderate" 299 }, 300 { 301 "claim": "Direct prompt engineering strategies have limited effectiveness in mitigating bias, but test feedback-driven refinement significantly reduces bias (e.g., GPT-4 overall CBS from 59.88% to 4.79%).", 302 "evidence": "Table 7 (Scenario 1) shows limited reduction or even increases. Table 8 (Scenario 2) shows dramatic reductions. GPT-4 with CoT2: Scenario 1 CBS 32.34%, Scenario 2 CBS 4.79%. Section 4.4.", 303 "supported": "strong" 304 }, 305 { 306 "claim": "LLMs have difficulty detecting bias in their own generated code, which explains why direct prompting is ineffective.", 307 "evidence": "Table 9: GPT-3.5-turbo self-detects only 18.84% of age biases, 29.27% of region biases, 39.47% of gender biases. Section 4.4.3.", 308 "supported": "strong" 309 }, 310 { 311 "claim": "The automated bias testing framework can analyze the vast majority of LLM-generated code, with human review needed only for code with runtime/syntax errors.", 312 "evidence": "Table 6 shows automated testing detects almost all biases. E.g., for GPT-4 age: 130 of 132 detected automatically, 2 by human review. Section 4.3.2.", 313 "supported": "strong" 314 } 315 ], 316 "methodology_tags": ["benchmark-eval", "observational"], 317 "key_findings": "This paper demonstrates that social biases (age, gender, region, education, occupation, race) are prevalent in code generated by five major LLMs for bias-sensitive tasks, with gender bias rates ranging from 13.47% to 49.10%. Direct prompt engineering strategies (zero-shot, few-shot, CoT) have limited effectiveness in mitigating these biases and sometimes increase them. However, when automated test execution feedback identifying specific biases is provided back to LLMs, bias rates drop dramatically (e.g., GPT-4 overall CBS from 59.88% to 4.79%). 
The study also reveals that LLMs are poor at self-detecting bias in their own generated code, explaining why direct prompting alone is insufficient.", 318 "red_flags": [ 319 { 320 "flag": "No statistical significance tests", 321 "detail": "All comparisons between models and mitigation strategies are based on raw CBS percentages without any significance testing. Given the inherent non-determinism of LLMs (acknowledged in the paper), differences between conditions may not be statistically significant." 322 }, 323 { 324 "flag": "No variance or uncertainty measures", 325 "detail": "While experiments are run 5 times with averaged results, no standard deviations, confidence intervals, or other spread measures are reported. The reader cannot assess whether the CBS differences between conditions are within the natural variability." 326 }, 327 { 328 "flag": "Precision validated on only one model", 329 "detail": "The 100% precision claim for automated bias testing is validated only on PALM-2-CodeChat-bison (Table 5). It is unclear whether this precision holds for the other four models, whose code may have different characteristics." 330 }, 331 { 332 "flag": "Model versions not pinned", 333 "detail": "Model versions are specified only by marketing names (GPT-3.5-turbo, GPT-4, etc.) without API version snapshots. Given that these models update over time, the results may not be reproducible with current model versions." 334 } 335 ], 336 "cited_papers": [ 337 { 338 "title": "Uncovering and Quantifying Social Biases in Code Generation", 339 "authors": ["Y. Liu", "X. Chen", "Y. Gao", "Z. Su", "F. Zhang", "D. Zan", "J.-G. Lou", "P.-Y. Chen", "T.-Y. Ho"], 340 "year": 2023, 341 "relevance": "Most closely related prior work on social bias in code completion models, which this paper extends to text-to-code generation." 342 }, 343 { 344 "title": "Evaluating Large Language Models Trained on Code", 345 "authors": ["M. Chen", "J. Tworek", "H. 
Jun"], 346 "year": 2021, 347 "arxiv_id": "2107.03374", 348 "relevance": "Foundational HumanEval benchmark paper, used in this study for measuring code generation performance trade-offs." 349 }, 350 { 351 "title": "ReCode: Robustness Evaluation of Code Generation Models", 352 "authors": ["S. Wang", "Z. Li", "H. Qian"], 353 "year": 2022, 354 "relevance": "Inspired the multi-run evaluation metrics (CBS_U@K and CBS_I@K) used in this paper for measuring LLM code generation robustness." 355 }, 356 { 357 "title": "SWE-Bench: Can Language Models Resolve Real-World GitHub Issues?", 358 "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig", "S. Yao", "K. Pei", "O. Press", "K. Narasimhan"], 359 "year": 2023, 360 "arxiv_id": "2310.06770", 361 "relevance": "Major benchmark for evaluating LLM code generation in real-world software engineering tasks." 362 }, 363 { 364 "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation", 365 "authors": ["J. Liu", "C. S. Xia", "Y. Wang", "L. Zhang"], 366 "year": 2023, 367 "relevance": "HumanEval+ benchmark that improved test case quality for evaluating code generation correctness." 368 }, 369 { 370 "title": "A Comprehensive Empirical Study of Bias Mitigation Methods for Machine Learning Classifiers", 371 "authors": ["Z. Chen", "J. M. Zhang", "F. Sarro", "M. Harman"], 372 "year": 2023, 373 "relevance": "Comprehensive study of bias mitigation methods in ML, providing methodological context for fairness evaluation in AI systems." 374 }, 375 { 376 "title": "LLM is Like a Box of Chocolates: The Non-determinism of ChatGPT in Code Generation", 377 "authors": ["S. Ouyang", "J. M. Zhang", "M. Harman", "M. Wang"], 378 "year": 2023, 379 "arxiv_id": "2308.02828", 380 "relevance": "Directly motivates the multi-run evaluation approach in this paper, addressing LLM non-determinism in code generation." 
381 }, 382 { 383 "title": "AgentCoder: Multi-Agent-Based Code Generation with Iterative Testing and Optimisation", 384 "authors": ["D. Huang", "Q. Bu", "J. M. Zhang", "M. Luck", "H. Cui"], 385 "year": 2023, 386 "arxiv_id": "2312.13010", 387 "relevance": "Multi-agent approach to code generation with iterative testing, relevant to agentic code generation evaluation." 388 }, 389 { 390 "title": "CodaMOSA: Escaping Coverage Plateaus in Test Generation with Pre-trained Large Language Models", 391 "authors": ["C. Lemieux", "J. P. Inala", "S. K. Lahiri", "S. Sen"], 392 "year": 2023, 393 "relevance": "Uses LLMs for automated test generation, relevant to the broader theme of LLM-assisted software testing." 394 }, 395 { 396 "title": "A Survey on Bias and Fairness in Machine Learning", 397 "authors": ["N. Mehrabi", "F. Morstatter", "N. Saxena", "K. Lerman", "A. Galstyan"], 398 "year": 2021, 399 "relevance": "Foundational survey on bias and fairness in ML that provides the theoretical framework for protected attributes used in this study." 400 }, 401 { 402 "title": "Evaluating Social Bias in Code Generation Models", 403 "authors": ["L. Ling"], 404 "year": 2024, 405 "relevance": "Concurrent work evaluating social bias in fine-tuned code generation models, complementary to this paper's focus." 406 } 407 ] 408 }