scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22705B)
      1 {
      2   "paper": {
      3     "title": "Auditing Fairness under Model Updates: Fundamental Complexity and Property-Preserving Updates",
      4     "authors": ["Ayoub Ajarra", "Debabrota Basu"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2601.05909",
      8     "doi": "10.48550/arXiv.2601.05909"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": true,
     15         "justification": "The paper provides a link to a supplementary code repository: https://anonymous.4open.science/r/Auditors-with-prospects-050F (Appendix H). This is an anonymous link, indicating review stage, but a working URL is provided."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The experiments use two publicly available datasets: COMPAS (Angwin et al. 2016) and Student Performance (Cortez and Silva 2008). These are standard public benchmarks the authors did not modify."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "While the paper specifies the hardware ('11th Gen Intel Core i7-1185G7 processor (3.00 GHz, 8 cores) with 32.0 GiB of RAM') and mentions scikit-learn, there is no requirements.txt, Dockerfile, or detailed library version listing beyond mentioning scikit-learn's default solver."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper states 'Implementation details and instructions for reproducing our results are provided in the supplementary code repository' but does not include step-by-step reproduction instructions in the paper itself. The anonymous repository link may contain them, but the paper text does not."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "The figures (Figures 4 and 6) show shaded bands around estimation curves, representing error/uncertainty ranges across sample budgets. Concentration bounds (Theorem 8) provide theoretical confidence guarantees."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper makes comparative claims about the convergence and accuracy of its estimator but does not use formal statistical significance tests (no p-values, t-tests, etc.). Comparisons are shown graphically without formal testing."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Table 1 reports specific estimation errors (e.g., 2.33×10^-2 for COMPAS/MLP) and ratio errors (e.g., 2.5×10^-3) at a budget of 1000 samples, providing magnitude context for results."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "The theoretical analysis (Theorems 2, 4, 7, 8) provides formal sample complexity bounds that justify the required number of samples. The experiments vary budget from 200 to 1000 to show convergence behavior."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Figures 4 and 6 show shaded bands around the main estimation curves across different sample budgets, indicating spread/variance in the estimates."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The paper compares its framework conceptually to Yan and Zhang (2022) but does not include empirical comparisons against any baseline methods. The experiments only evaluate the authors' own EPO algorithm."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No empirical baselines are included. The paper discusses Yan and Zhang (2022) theoretically but does not run their method for comparison."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No ablation study is presented. The paper could have ablated components such as the prospect class construction vs. simple SP estimation, but does not."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper evaluates using multiple metrics: estimation error of statistical parity, prospect ratio error, and runtime (Table 1, Figures 4, 6)."
     80       },
     81       "human_evaluation": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "This is a theoretical paper with synthetic/benchmark experiments on fairness auditing. Human evaluation of outputs is not relevant to the claims."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "The paper does not describe a train/test split in its experimental evaluation. Results are shown across varying labeling budgets from the same datasets without explicit held-out test set separation."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Results are broken down per dataset (COMPAS vs. Student Performance) and per strategic class (MLP vs. Random Forest), shown in Table 1 and Figures 4 and 6."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Proposition 9 establishes that model classes with infinite VC dimension are not weakly or strongly auditable, identifying a clear failure boundary. Section 6 discusses limitations of the approach."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper reports impossibility results: Proposition 9 proves that classes with infinite VC dimension are not auditable, and Appendix E.2 shows that weak auditability does not extend to all classes with infinite VC dimension. These are genuine negative/impossibility results."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims (i) characterizing information complexity of allowable updates via SP dimension, (ii) distribution-free auditing bounds, and (iii) extension to other objectives are all supported by the theorems (Theorems 2, 4, 7, 8) and Appendix B."
    112       },
    113       "causal_claims_justified": {
    114         "applies": false,
    115         "answer": false,
    116         "justification": "This is a theoretical paper establishing complexity bounds and proposing algorithms. It does not make causal claims about interventions or outcomes."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The paper is careful about scope: theoretical results specify exactly which hypothesis classes (finite classes, classes with finite SP dimension, classes with infinite VC dimension) each result applies to. Experimental claims are bounded to two specific datasets and two strategic model classes."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper does not discuss alternative explanations for the experimental results. It does not consider whether other factors could explain the observed convergence patterns or prospect ratio estimates beyond its theoretical framework."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper mentions 'logistic regression classifier with l2 regularization, trained on the original labels using scikit-learn's default solver' and 'multi-layer perceptron (MLP)' and 'random forest' but does not specify scikit-learn version, MLP architecture details (layers, activation functions, hyperparameters), or random forest configuration."
    134       },
    135       "prompts_provided": {
    136         "applies": false,
    137         "answer": false,
    138         "justification": "This paper does not use LLM prompting. It is a theoretical/experimental paper on fairness auditing using traditional ML models."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper mentions 'scikit-learn's default solver' for logistic regression and tolerance threshold epsilon=0.005 but does not report MLP or random forest hyperparameters (e.g., number of layers, hidden units, number of trees, max depth). The number of sampled hypotheses for the finite approximation is also not specified."
    144       },
    145       "scaffolding_described": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No agentic scaffolding is used. This is a traditional ML/theoretical paper."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "The paper states it uses COMPAS with Caucasian/non-Caucasian groups and Student Performance with Female/Male groups but does not describe any preprocessing steps, feature selection, or how the datasets were prepared for the experiments."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 6 'Discussion and Future Work' discusses limitations, including the restriction to traditional predictive models and the need for extensions to interactive settings, property-preserving architectures, and manipulation-proof definitions."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "The discussion section identifies future directions but does not discuss specific threats to validity of the experimental results, such as the small scale of experiments (only 2 datasets), the use of synthetic strategic classes, or the gap between theoretical assumptions and practical settings."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 6 explicitly states scope boundaries: 'While our focus was on traditional predictive models' and identifies specific extensions needed (interactive settings, LLMs, manipulation-proof definitions). Proposition 9 clearly delimits what classes cannot be audited."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The paper uses publicly available datasets (COMPAS, Student Performance) that anyone can independently access and verify."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The paper simply references the COMPAS dataset (Angwin et al. 2016) and Student Performance dataset (Cortez and Silva 2008) without describing how they were obtained or any steps taken for this study."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants are involved. The data sources are standard public benchmarks."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The paper does not describe the pipeline from raw datasets to the experimental setup. It is unclear how features were selected, how the logistic regression 'true model' was trained, how the strategic class models were sampled, or how many models were included in the finite approximation."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No funding information or acknowledgments section is present in the paper. The authors are affiliated with Inria/CNRS/Univ. Lille but no grants or funding sources are mentioned."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Author affiliations are clearly stated: 'Équipe Scool, Univ. Lille, Inria, CNRS, Centrale Lille, UMR 9189 - CRIStAL, France'."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "No funding is disclosed, so independence of funder cannot be assessed. The absence of funding disclosure is itself a gap."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests or financial interests statement is present in the paper."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": false,
    220         "answer": false,
    221         "justification": "This paper does not evaluate a pre-trained model's capability on a benchmark. It trains traditional ML models (logistic regression, MLP, random forest) on fairness datasets to test an auditing framework."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "No pre-trained model benchmark evaluation is involved. The paper tests an auditing algorithm, not model knowledge."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "No pre-trained model benchmark evaluation is involved. Contamination is not relevant to this auditing framework study."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants are involved in this study."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants are involved in this study."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants are involved in this study."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are involved in this study."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are involved in this study."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants are involved in this study."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants are involved in this study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": true,
    275         "justification": "Table 1 reports runtime in milliseconds per sample (1.2-1.9 ms). Figure 6 shows runtime scaling across budgets. The paper states the EPO oracle requires ~3 ms per sample for random forests."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": true,
    280         "justification": "Hardware is specified in Appendix H: '11th Gen Intel Core i7-1185G7 processor (3.00 GHz, 8 cores) with 32.0 GiB of RAM'. Runtime figures are provided across experiments."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "Finite hypothesis classes are weakly auditable for statistical parity with sample complexity O((18/epsilon^2) * log(8|F|/delta)).",
    287       "evidence": "Theorem 2 (Section 4.1), proved in Appendix E.1.",
    288       "supported": "strong"
    289     },
    290     {
    291       "claim": "The SP dimension provides a necessary and sufficient characterization of weak auditability: a concept class F is weakly auditable in both the agnostic and realizable settings if and only if SP(F) is finite.",
    292       "evidence": "Corollary 5 (Section 4.1), derived from Theorem 4. Proofs in Appendix E.2.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "The SP dimension is upper bounded by the VC dimension (VC(F) >= SP(F)), meaning auditing statistical parity has no higher information complexity than learnability.",
    297       "evidence": "Proposition 6 (Section 4.1) with proof from definition of VC and SP dimensions.",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "Model classes with infinite VC dimension are not weakly or strongly auditable.",
    302       "evidence": "Proposition 9 (Section 4.2), proved in Appendix F.3.",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "The EPO framework accurately estimates statistical parity and recovers the prospect class on COMPAS and Student Performance datasets, with estimation error decreasing monotonically with sample size.",
    307       "evidence": "Section 5.2, Table 1, Figures 4 and 6. COMPAS estimation error 2.33×10^-2, Student Performance 8.22×10^-3 at budget 1000.",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "The framework naturally extends to other auditing objectives including prediction error, learning stability, generalization error, and robust risk.",
    312       "evidence": "Appendix B defines loss functions for each property (Table 2) and proves risk minimization implies weak auditing (Propositions 10, 11, 12). However, no experiments validate these extensions.",
    313       "supported": "moderate"
    314     }
    315   ],
    316   "methodology_tags": ["theoretical", "benchmark-eval"],
    317   "key_findings": "This paper introduces a PAC auditing framework for group fairness (statistical parity) under strategic model updates, where the model owner may change the model class while preserving certain properties. The key theoretical contribution is the SP dimension, a novel combinatorial complexity measure that characterizes the sample complexity of auditing statistical parity and is provably upper bounded by the VC dimension. The paper establishes that finite hypothesis classes are both weakly and strongly auditable, while classes with infinite VC dimension are not. Numerical experiments on COMPAS and Student Performance datasets confirm that the EPO oracle accurately estimates statistical parity and the prospect class with low runtime overhead.",
    318   "red_flags": [
    319     {
    320       "flag": "No empirical baselines",
    321       "detail": "The numerical experiments only evaluate the authors' EPO algorithm without comparing against any existing auditing methods, including the approach of Yan and Zhang (2022) which is the closest prior work."
    322     },
    323     {
    324       "flag": "Minimal experimental validation",
    325       "detail": "Only two small datasets (COMPAS and Student Performance) with two strategic model types each are tested. The experimental section is thin relative to the theoretical claims, and no experiments validate the extensions to prediction error, learning stability, generalization error, or robust risk."
    326     },
    327     {
    328       "flag": "Missing experimental details",
    329       "detail": "Key experimental parameters are unspecified: MLP architecture, random forest configuration, number of sampled hypotheses for the finite approximation, data preprocessing steps, and scikit-learn version. This makes full reproduction difficult."
    330     },
    331     {
    332       "flag": "Anonymous repository link",
    333       "detail": "The code is hosted at an anonymous review URL (anonymous.4open.science), which may become unavailable after the review process. No permanent archive (e.g., Zenodo) is provided."
    334     }
    335   ],
    336   "cited_papers": [
    337     {
    338       "title": "Active fairness auditing",
    339       "authors": ["Tom Yan", "Chicheng Zhang"],
    340       "year": 2022,
    341       "relevance": "Closest prior work on PAC auditing of statistical parity under manipulation-proofness, which this paper directly extends."
    342     },
    343     {
    344       "title": "Distribution-specific auditing for subgroup fairness",
    345       "authors": ["Daniel Hsu", "Jizhou Huang", "Brendan Juba"],
    346       "year": 2024,
    347       "relevance": "Studies auditing of statistical parity under Gaussian distributions and demonstrates computational hardness of fairness verification."
    348     },
    349     {
    350       "title": "Auditing fairness by betting",
    351       "authors": ["Ben Chugg", "Santiago Cortes-Gomez", "Bryan Wilder", "Aaditya Ramdas"],
    352       "year": 2023,
    353       "relevance": "Alternative approach to fairness auditing using hypothesis testing and sequential methods."
    354     },
    355     {
    356       "title": "Under manipulations, are some AI models harder to audit?",
    357       "authors": ["Augustin Godinot", "Erwan Le Merrer", "Gilles Trédan", "Camilla Penzo", "François Taïani"],
    358       "year": 2024,
    359       "relevance": "Studies the difficulty of auditing ML models under strategic manipulations, directly related to the adversarial auditing setting of this paper."
    360     },
    361     {
    362       "title": "Preventing fairness gerrymandering: Auditing and learning for subgroup fairness",
    363       "authors": ["Michael Kearns", "Seth Neel", "Aaron Roth", "Zhiwei Steven Wu"],
    364       "year": 2018,
    365       "relevance": "Foundational work on auditing statistical parity through reduction to weak agnostic learning."
    366     },
    367     {
    368       "title": "A reductions approach to fair classification",
    369       "authors": ["Alekh Agarwal", "Alina Beygelzimer", "Miroslav Dudík", "John Langford", "Hanna Wallach"],
    370       "year": 2018,
    371       "relevance": "Influential fairness-aware ML framework connecting fair classification to cost-sensitive learning."
    372     },
    373     {
    374       "title": "Are emergent abilities of large language models a mirage?",
    375       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    376       "year": 2023,
    377       "relevance": "Referenced in the context of LLM fairness shifts over time, relevant to AI evaluation methodology."
    378     },
    379     {
    380       "title": "How is ChatGPT's behavior changing over time?",
    381       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    382       "year": 2024,
    383       "relevance": "Referenced for LLM behavioral drift over updates, motivating the paper's framework for auditing under model updates."
    384     },
    385     {
    386       "title": "FairSense: Long-term fairness analysis of ML-enabled systems",
    387       "authors": ["Yining She", "Sumon Biswas", "Christian Kästner", "Eunsuk Kang"],
    388       "year": 2025,
    389       "arxiv_id": "2501.01665",
    390       "relevance": "Recent work on longitudinal fairness analysis of ML systems, directly relevant to auditing fairness under model drift."
    391     },
    392     {
    393       "title": "Closing the AI accountability gap: Defining an end-to-end framework for internal algorithmic auditing",
    394       "authors": ["Inioluwa Deborah Raji", "Andrew Smart", "Rebecca N White", "Margaret Mitchell", "Timnit Gebru"],
    395       "year": 2020,
    396       "relevance": "Framework for algorithmic auditing practices, providing context for the practical challenges of ML fairness auditing."
    397     }
    398   ]
    399 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs