scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24831B)
      1 {
      2   "paper": {
      3     "title": "Bias Unveiled: Investigating Social Bias in LLM-Generated Code",
      4     "authors": ["Lin Ling", "Fazle Rabbi", "Song Wang", "Jinqiu Yang"],
      5     "year": 2025,
      6     "venue": "AAAI 2025",
      7     "arxiv_id": "2411.10351",
      8     "doi": null
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": true,
     15         "justification": "GitHub repository linked in the paper: https://github.com/janeeyre912/fairness_testing_code_generation. Code and dataset URLs are provided at the top of the paper."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "Dataset (SocialBias-Bench) is released at https://github.com/janeeyre912/fairness_testing_code_generation/tree/master/dataset, linked directly in the paper."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is mentioned in the paper. The paper mentions using textX DSL framework and specific LLM APIs but does not specify library versions or environment details."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No step-by-step reproduction instructions are provided in the paper. The paper links to code and data but does not include a README with commands or a 'Reproducing Results' section."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper reports CBS and BLS as point estimates (e.g., '60.58%', '28.34%') without confidence intervals or error bars. Figure 4 shows temperature effects but without uncertainty bands."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "The paper uses t-tests to assess statistical significance of bias mitigation strategies. Table 5 marks statistically significant changes with asterisks (*), and the paper states 'If p < 0.05, the difference is statistically significant.' Temperature effects are also evaluated with p-values."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper reports percentage improvements with baseline context, e.g., 'the overall bias decreased to 29.15% from 60.58%' after the first iteration, and 'reduce social bias in LLM-generated code by up to 90%'. CBS values are provided for each demographic before and after mitigation."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The dataset contains 343 tasks and 1,715 code snippets per model (5 per task), but there is no justification for why 343 tasks or 5 snippets per task were chosen. No power analysis is provided."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No variance, standard deviation, or spread measures are reported for the CBS or BLS metrics across the 5 code snippets generated per task. Only aggregated percentages are presented."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper compares against two prior works (Liu et al. 2023 and Huang et al. 2023) that also investigate social bias in code generation. The default (no mitigation) condition serves as the baseline for mitigation experiments."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The compared prior works (Liu et al. 2023 and Huang et al. 2023) are contemporary, both published in 2023, which is recent relative to the 2024/2025 submission."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The paper includes ablation studies on temperature effects (Section 'Effects of temperature' with Figure 4) and prompt variations (judgemental words). Three mitigation strategies are tested incrementally (COT, positive role-play + COT, iterative prompting)."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Three metrics are used: Code Bias Score (CBS) for overall bias severity, Bias Leaning Score (BLS) for fine-grained bias direction, and Pass@attribute for functional correctness."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "The paper makes claims about real-world bias implications and fairness but relies entirely on automated metamorphic testing. No human evaluation of the generated code's fairness or quality is included. Given the claims about real-world social impact, human judgment on whether detected biases are meaningful would be relevant."
     85       },
     86       "held_out_test_set": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "This is not a supervised learning task with train/test splits. The framework generates test cases per task definition and evaluates all code snippets. No model training or tuning on the dataset occurs."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Results are broken down by demographic dimension (age, gender, religion, race, employment status, marital status, education) in Tables 3 and 4, and by task category in Table 2. Radar charts (Figure 3) show per-demographic patterns."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The paper discusses cases where mitigation strategies failed or worsened bias: 'COT prompt increases CBSdemographic for all dimensions and the overall CBS' and 'role-playing can sometimes reinforce biases when sensitive attributes are unintentionally embedded in the context or reasoning steps.'"
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper reports that COT prompting and positive role-play + COT prompting were not effective and sometimes increased bias. For GPT-3.5, COT increased CBS from 60.58% to 72.65%. This is a clear negative result."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims: (1) severe bias in all four LLMs — supported by Table 3 showing overall CBS from 28.34% to 60.58%. (2) Dialogue with Solar reduces bias by up to 90% — supported by Table 5 showing iterative prompting reducing CBS from 60.58% to 8.77% for GPT-3.5. (3) Code and data publicly available — GitHub links provided."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper makes causal claims about mitigation strategies reducing bias ('iterative prompting effectively reduces bias'). The study design uses controlled before/after comparisons with the same tasks, models, and dataset, which is adequate for these claims. Statistical tests support significance."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The title 'Investigating Social Bias in LLM-Generated Code' is broad but results are limited to 4 specific LLMs, 343 human-centered tasks in 7 categories, and Python code generation. The paper does not explicitly bound the generalization — it does not state these results may not hold for other LLMs, programming languages, or non-human-centered tasks."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper does not discuss alternative explanations for the observed biases. For instance, it does not consider whether the bias results could be influenced by the specific prompt format used, the DSL-generated test case design, or whether the task definitions themselves may embed implicit assumptions. No threats-to-validity section or alternative explanations are provided."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Exact model versions are specified: GPT-3.5-turbo-0125, codechat-bison@002, CodeLlama-70b-instruct-hf, and claude-3-haiku-20240307. These include version identifiers and snapshot dates."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper provides the actual prompt structure in Figure 1 (sub-figure b) showing the code prompt format, and the exact text of mitigation prompts: 'Let's think step by step. Consider the condition carefully to ensure fairness' for COT and 'You are a fair and impartial code developer...' for role-play. The DSL-generated prompts follow a documented template with actual examples."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Temperature settings are explicitly discussed and varied (0.2 to 1.0) in the temperature ablation study (Figure 4). The default temperature is not stated outright but is implied to be 1.0, since the CBS reported at temperature 1.0 matches the default-condition CBS (e.g., 28.34% for CodeLlama). The paper also generates 5 code snippets per task."
    144       },
    145       "scaffolding_described": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The Solar framework's workflow is described in detail: DSL-based prompt generation, test case generation via textX, metamorphic testing with attribute mutation, iterative feedback loop. Figure 1 shows the full pipeline, and the methodology section describes each component."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The paper describes the task generation process: manually crafted 2-3 tasks per category, GPT-4o generated 60 scenarios per category, then 'remove the duplicate and unrelated generated tasks and then adjust some related attributes.' Second author cross-checked. The final count of 343 tasks is reported."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "No dedicated limitations or threats-to-validity section is present in the paper. The conclusion briefly mentions future work ('expand the datasets to include more scenarios') but this is not a substantive limitations discussion."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "No threats to validity are discussed. The paper does not address potential issues such as the representativeness of the 343 tasks, the appropriateness of the bias definition used, or whether the DSL-generated test cases adequately capture real-world bias scenarios."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "The paper does not explicitly state what the results do NOT show. There are no explicit statements about which populations, settings, or languages are excluded from the claims. The preliminary section limits bias to 'those against different demographics in human-centered tasks' but does not discuss what this excludes."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The dataset and code are released on GitHub (https://github.com/janeeyre912/fairness_testing_code_generation). The task definitions and generated code snippets should be verifiable through the released artifacts."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The data collection procedure is described: 2-3 manually crafted tasks per category, GPT-4o used to generate additional scenarios, filtering for duplicates and unrelated tasks, and cross-checking by the second author. Categories and task counts are listed in Table 2."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants in the study. The data is a benchmark dataset of coding tasks, not data from human subjects."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The pipeline from task definition to code prompt generation to test case generation to bias evaluation is documented in the Methodology section with Figure 1 showing the workflow. The filtering step for the dataset is described (remove duplicates and unrelated tasks, cross-check by second author)."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No funding sources are mentioned in the paper. There is no acknowledgments section listing grants or sponsors."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Author affiliations are clearly listed: Concordia University and York University. These are academic institutions, not companies whose products are being evaluated."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "No funding is disclosed, so independence cannot be assessed. The paper does not state whether it is unfunded or simply omits funding information."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests statement is present in the paper. Absence of disclosure is not the same as absence of conflict."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "The paper evaluates LLMs on code generation tasks but does not state the training data cutoff dates for any of the four models tested. This is relevant because the task structures could potentially appear in training data."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No discussion of whether the task definitions or similar prompts could have appeared in the training data of the subject LLMs. The SocialBias-Bench dataset was partially generated by GPT-4o, raising questions about potential overlap with GPT-3.5 training data."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "The paper does not address contamination risk. While SocialBias-Bench is newly created, the task structures are based on common social scenarios that could appear in training data. Additionally, the paper references HumanEval performance for model comparison but does not discuss contamination for that benchmark either."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants in this study. The evaluation is entirely automated using the Solar framework on LLM-generated code."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants. The study evaluates LLM-generated code for bias using automated testing."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants. The demographics in the paper refer to the demographic dimensions used in the bias evaluation framework, not study participants."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants and no experimental assignment of participants to conditions."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants and no blinding applicable to automated LLM evaluation."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "No API costs, tokens consumed, or wall-clock time are reported. The framework calls LLMs multiple times per task (5 code snippets per task, plus iterative prompting with up to 3 iterations) but no cost information is provided."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No computational budget is stated. The total API spend for 4 models × 1,715 snippets plus mitigation experiments is not reported."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "All four LLMs exhibit severe social bias in code generation, with overall CBS ranging from 28.34% to 60.58%.",
    287       "evidence": "Table 3 shows CBS_overall: GPT-3.5-turbo-0125 at 60.58%, codechat-bison@002 at 40.06%, CodeLlama-70b-instruct-hf at 28.34%, and claude-3-haiku-20240307 at 36.33%.",
    288       "supported": "strong"
    289     },
    290     {
    291       "claim": "Iterative prompting with Solar feedback can reduce social bias in LLM-generated code by up to 90%.",
    292       "evidence": "Table 5 shows GPT-3.5-turbo-0125 CBS dropping from 60.58% to 8.77% after 3 iterations (85.5% reduction). The abstract claims 'up to 90%' but the data shows approximately 85.5% for the model presented in detail. Other models' detailed results are in the artifact.",
    293       "supported": "moderate"
    294     },
    295     {
    296       "claim": "Chain-of-Thought prompting and positive role-playing are not effective for bias mitigation and can increase bias.",
    297       "evidence": "Table 5 shows GPT-3.5 CBS increased from 60.58% to 72.65% with COT and to 68.66% with P-COT. The paper states 'all the subject LLMs do not exhibit a significant change in the CBS_overall' for these strategies.",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "Bias is most severe in age, gender, and employment status demographics across all tested LLMs.",
    302       "evidence": "Table 3 shows consistently higher CBS values for these three demographics across all four models. For GPT-3.5: age 31.25%, gender 20.93%, employment 33.24%, while religion (16.44%) and race (19.42%) are lower.",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "Temperature settings significantly affect bias in code generation.",
    307       "evidence": "Figure 4 and the temperature ablation section show CodeLlama CBS rising from 28.34% to 65.19% as temperature decreases from 1.0 to 0.2. Other models also show significant changes at specific temperatures.",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "Iterative prompting improves functional correctness while reducing bias.",
    312       "evidence": "Table 5 shows Pass@attribute increasing from 66.60% (default) to 85.66% after 3 iterations for GPT-3.5-turbo-0125.",
    313       "supported": "moderate"
    314     }
    315   ],
    316   "methodology_tags": ["benchmark-eval"],
    317   "key_findings": "The paper proposes Solar, a fairness framework for evaluating social bias in LLM-generated code using metamorphic testing. Evaluation on four LLMs (GPT-3.5-turbo, codechat-bison, CodeLlama-70b, claude-3-haiku) reveals severe bias across all models, with CBS ranging from 28-61%. Age, gender, and employment status show the highest bias. Iterative prompting using Solar's feedback effectively reduces bias by up to ~85%, while Chain-of-Thought and role-playing prompting strategies are ineffective or counterproductive.",
    318   "red_flags": [
    319     {
    320       "flag": "No limitations section",
    321       "detail": "The paper lacks any limitations section or threats-to-validity discussion, which is unusual for a venue like AAAI. There is no discussion of potential weaknesses in the task design, bias definition, or generalizability."
    322     },
    323     {
    324       "flag": "Dataset partially generated by GPT-4o without contamination analysis",
    325       "detail": "The SocialBias-Bench dataset was partially generated using GPT-4o, and one of the test subjects, GPT-3.5-turbo, comes from the same vendor (OpenAI) and may share training data with it. No analysis of potential contamination or circularity is provided."
    326     },
    327     {
    328       "flag": "Selective presentation of mitigation results",
    329       "detail": "Full mitigation results are shown only for GPT-3.5-turbo in Table 5 'due to space limits.' The claim of 'up to 90% bias reduction' appears based on the best-case result, and the reader cannot verify this for all models from the paper alone."
    330     },
    331     {
    332       "flag": "No variance or uncertainty quantification",
    333       "detail": "Despite generating 5 code snippets per task, no variance or standard deviation across snippets is reported. The stability of the bias measurements is unknown."
    334     },
    335     {
    336       "flag": "Unbounded generalization claims",
    337       "detail": "The paper tests 4 specific LLMs on Python code generation for human-centered tasks but draws broad conclusions about 'social bias in LLM-generated code' without bounding the scope of the findings."
    338     }
    339   ],
    340   "cited_papers": [
    341     {
    342       "title": "Evaluating Large Language Models Trained on Code",
    343       "authors": ["Mark Chen", "Jerry Tworek"],
    344       "year": 2021,
    345       "arxiv_id": "2107.03374",
    346       "relevance": "Foundational HumanEval benchmark for evaluating LLM code generation, used in this paper for model performance comparison."
    347     },
    348     {
    349       "title": "Uncovering and quantifying social biases in code generation",
    350       "authors": ["Yan Liu", "Xiaokang Chen"],
    351       "year": 2023,
    352       "relevance": "Prior work on social bias in LLM code generation at NeurIPS, which this paper extends with more comprehensive evaluation and mitigation strategies."
    353     },
    354     {
    355       "title": "Bias Testing and Mitigation in LLM-based Code Generation",
    356       "authors": ["Dong Huang", "Qingwen Bu"],
    357       "year": 2023,
    358       "relevance": "Prior work on bias testing in text-to-code LLM tasks, directly compared with Solar framework in this paper."
    359     },
    360     {
    361       "title": "Bias and fairness in large language models: A survey",
    362       "authors": ["Isabel O. Gallegos"],
    363       "year": 2023,
    364       "arxiv_id": "2309.00770",
    365       "relevance": "Comprehensive survey of bias and fairness in LLMs that motivates the study of bias in code generation specifically."
    366     },
    367     {
    368       "title": "StarCoder: may the source be with you!",
    369       "authors": ["Raymond Li"],
    370       "year": 2023,
    371       "arxiv_id": "2305.06161",
    372       "relevance": "Major open-source code generation model relevant to evaluating LLM code generation capabilities."
    373     },
    374     {
    375       "title": "Code Llama: Open foundation models for code",
    376       "authors": ["Baptiste Roziere"],
    377       "year": 2023,
    378       "arxiv_id": "2308.12950",
    379       "relevance": "One of the four LLMs evaluated in this paper for code generation bias."
    380     },
    381     {
    382       "title": "FairBench: A Four-Stage Automatic Framework for Detecting Stereotypes and Biases in Large Language Models",
    383       "authors": ["Yanhong Bai"],
    384       "year": 2023,
    385       "arxiv_id": "2308.10397",
    386       "relevance": "Framework for detecting bias in LLMs, related to the fairness evaluation methodology used in this paper."
    387     },
    388     {
    389       "title": "Fairness Improvement with Multiple Protected Attributes: How Far Are We?",
    390       "authors": ["Zhenpeng Chen", "Jie M. Zhang", "Federica Sarro", "Mark Harman"],
    391       "year": 2024,
    392       "relevance": "ICSE paper on fairness with multiple protected attributes, closely related to the multi-demographic bias evaluation in this paper."
    393     },
    394     {
    395       "title": "An empirical survey of the effectiveness of debiasing techniques for pre-trained language models",
    396       "authors": ["Nicholas Meade", "Elinor Poole-Dayan", "Siva Reddy"],
    397       "year": 2021,
    398       "arxiv_id": "2110.08527",
    399       "relevance": "Survey of debiasing techniques for language models, relevant context for bias mitigation strategies."
    400     },
    401     {
    402       "title": "Biasasker: Measuring the bias in conversational AI system",
    403       "authors": ["Yuxuan Wan"],
    404       "year": 2023,
    405       "relevance": "FSE paper on measuring bias in conversational AI, related methodology for automated bias detection."
    406     }
    407   ]
    408 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs