scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24519B)
      1 {
      2   "paper": {
      3     "title": "Navigating representation: utilizing prompt engineering to minimize representational harms in journalist's image captions",
      4     "authors": ["Habiba Sarhan", "Morteza Shahrezaye", "Simon Hegelich"],
      5     "year": 2025,
      6     "venue": "AI and Ethics",
      7     "doi": "10.1007/s43681-025-00773-x"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval", "qualitative"],
     12   "key_findings": "The study compared three prompting strategies (Baseline, Reactive Mitigation, Proactive Guidance) for GPT-4o image captioning against journalist captions from The Guardian. Reactive Mitigation achieved 93.9% least-harmful rankings from the AI-as-judge, but the authors' own manual evaluation revealed the AI systematically penalizes politically and culturally specific language that human annotators consider journalistically appropriate. The Cronbach's Alpha of 0.67 indicates only moderate reliability. The paper's most valuable finding is the tension between harm-avoidance and journalistic accuracy—AI overcorrects by favoring decontextualized neutrality.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No repository URL or code archive is provided anywhere in the paper."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The data availability statement says 'Data is provided within the manuscript or supplementary information files' and supplementary material is available at the DOI link."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper specifies GPT-4o parameters (temperature=0, seed=10, max tokens) but provides no environment setup, dependency list, or reproducible configuration for running the experiments."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided. The methodology section describes the approach at a high level but lacks executable detail."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No confidence intervals or error bars are reported. Results are presented as raw percentages and counts without uncertainty quantification."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No statistical significance tests are used to compare strategies. The paper uses Cronbach's Alpha for reliability but no tests for whether strategy differences are statistically significant."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No formal effect sizes (Cohen's d, odds ratios, etc.) are reported. Results are presented as raw percentage distributions and confusion matrix counts."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No justification is given for why 1,050 images were selected or why 500 were manually evaluated. No power analysis discussed."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Single run with temperature=0 and seed=10. No variance across runs or seeds reported."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Four strategies are compared: Baseline (unfiltered), Reactive Mitigation, Proactive Guidance, and Gold Standard (journalist captions). The Baseline serves as the unguided reference."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Comparison against actual journalist captions from The Guardian is a strong contemporary baseline for the captioning task."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "The three AI strategies represent different approaches but no ablation study isolates which specific components (definitions, examples, feedback mechanism) drive differences."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Multiple metrics used: harmfulness ranking (1-4), Cronbach's Alpha (0.67), precision, recall, and qualitative evaluation factors (Table 1)."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Manual assessment of 500 randomly selected images by expert human annotators, evaluating factual accuracy, neutrality, inclusivity, contextual sensitivity, and other factors (Section 3.3.2)."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "All 1,050 images were used for both development and evaluation. No held-out split is described for the prompt development process."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down by strategy (Fig. 2) and by thematic category (contextual sensitivity, political naming, migration, caution vs clarity) in Sections 4.2-4.9."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Extensive discussion of failure cases: AI overcorrection penalizing journalist captions, Reactive Mitigation producing overly cautious captions, model refusing to caption 69 images, and specific examples of misranking (Sections 4.1-4.9)."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Reported that Reactive Mitigation overcorrects (Section 4.5), AI-as-judge has systematic bias against journalist captions (Section 4.1), and Cronbach's Alpha of only 0.67 indicates moderate reliability."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims that 'Reactive Mitigation emerged as the most effective strategy' and 'AI-generated captions often minimize representational harms more effectively' are supported by the ranking data (Fig. 2), though the paper itself critiques these findings extensively."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper implies prompting strategies cause harm reduction but does not address confounds. The same model generates and evaluates, creating circularity. No discussion of whether results reflect actual harm reduction or just the model's preference for its own neutral outputs."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The abstract frames the work as 'how Large Language Models can be utilized', but the study uses only GPT-4o. The limitations section acknowledges this, but the framing is broader than the evidence."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Extensively discusses that AI-as-judge may systematically favor neutrality over journalistic accuracy (Section 4.1), that the model's concept of harm is socially constructed and reflects developer priorities rather than editorial ethics (SCOT framework throughout)."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper explicitly discusses the gap between harmfulness rankings (the proxy) and actual representational harm (the outcome), arguing that the AI's ranking framework misses editorial context, narrative purpose, and journalistic ethics (Sections 4.1, 4.9)."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The paper specifies 'GPT-4o' but provides no snapshot date or API version. Per schema guidance, marketing names without version identifiers count as NO."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "Generation prompts are quoted in Section 3.1 (brief templates). However, the evaluation prompt—critical to the AI-as-judge methodology—is described in natural language but not reproduced in full. The reader cannot reconstruct the exact evaluation prompt."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section 3.2: temperature=0, seed=10, max tokens=1024 for generation, 300 for evaluation."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. The study uses direct API calls with different prompts."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "The paper states images came from The Guardian's 'Politics' section but does not describe the selection process for the specific 1,050 images, time period, or filtering criteria. 69 exclusions due to model refusals are noted but their characteristics are not analyzed."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 5 'Limitations' discusses model content policy refusals, single-model limitation, and decision not to use jailbreaking methods."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Specific threats discussed: 69 images excluded due to content policy may represent significant events (Section 5), single model (GPT-4o) limits generalizability, and AI-as-judge bias toward neutrality (Section 4.1)."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 5 explicitly states: 'findings may not fully generalize to other language models such as Claude and Gemini' and acknowledges GPT-4o exclusivity 'enhances internal validity' but limits external validity."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Data availability statement: 'Data is provided within the manuscript or supplementary information files.' Supplementary material is available at the DOI."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 3.2 describes the dataset: 1,050 politically salient images from The Guardian's 'Politics' section, with a definition of political salience covering protests, conflicts, humanitarian crises, and politically charged events."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "The paper does not describe how the specific 1,050 images were selected from The Guardian's Politics section—time period, sampling strategy, and selection criteria are missing."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "The pipeline from image collection to final analysis is only loosely described. No counts at each stage (except 69 model refusals) and no explanation of intermediate processing steps."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "'This research received no external funding.' Open Access funding by Projekt DEAL is disclosed."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "All authors affiliated with Technical University of Munich. No affiliation with OpenAI (the evaluated model's maker)."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Projekt DEAL is an Open Access funding initiative with no stake in the research outcomes. No external research funding."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "'The authors declare no competing interests' is explicitly stated in the Declarations section."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "The study evaluates GPT-4o's ability to generate image captions, not its knowledge recall on a benchmark. Contamination in the traditional sense (model memorizing test answers) is not the relevant concern."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Not applicable—the task is generative captioning of images, not knowledge retrieval from a benchmark."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Not applicable—no standard benchmark is used. The images are collected from The Guardian for this study."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "The paper states 'This research did not involve human participants.' The manual evaluation was conducted by expert annotators (likely the authors) rather than recruited participants."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "Paper explicitly states: 'This research did not involve human participants, personal data or sensitive content requiring ethical approval.'"
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants. Manual evaluation was by expert annotators, not recruited subjects."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants recruited for the study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in an experimental design."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in an experimental design."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No API costs, tokens consumed, or wall-clock time reported despite calling GPT-4o thousands of times (1,050 images × 3 generation strategies + evaluations)."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No total computational budget, API spend, or hardware information provided."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Single seed (10) with temperature=0. No analysis of whether different seeds or temperature settings would change results."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The number of experimental runs is not explicitly stated. Temperature=0 and seed=10 imply a single deterministic run but this is not discussed."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No description of how temperature=0 and seed=10 were selected, or whether other settings were tried."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No explanation of how the specific prompt texts were developed or whether alternative formulations were tested. The three strategies appear to be the only configurations evaluated."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "Four strategies are compared across multiple dimensions with no correction for multiple comparisons. No formal statistical tests are applied at all."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors designed both the prompting strategies and the evaluation framework. No acknowledgment of self-comparison bias or independent evaluation."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": false,
    324         "answer": false,
    325         "justification": "All strategies use the same model with similar compute costs; compute differences are negligible."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "The paper extensively discusses whether AI-generated harmfulness rankings actually measure representational harm, finding significant gaps between the AI's conception of harm and journalistic norms (Sections 4.1-4.9, SCOT analysis)."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding is involved; direct API prompting only."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "No discussion of whether The Guardian images or captions appeared in GPT-4o's training data."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the evaluation setup leaks information. The same model generates and evaluates captions, creating potential information leakage."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "All 1,050 images are from the same source (The Guardian Politics section). No discussion of whether this introduces distributional bias or non-independence."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No leakage detection or prevention methods are used."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "Reactive Mitigation is the most effective strategy for reducing harmfulness, with 93.9% of captions ranked as least harmful (Rank 1).",
    364       "evidence": "Figure 2 shows the distribution of harmfulness ranks: Reactive Mitigation 93.9% at Rank 1, Baseline 60.6%, Proactive Guidance 60.8%; journalist captions about 25% at Rank 1, spread roughly evenly across the ranks (Section 4).",
    365       "supported": "moderate"
    366     },
    367     {
    368       "claim": "AI-generated captions minimize representational harms more effectively than journalist captions.",
    369       "evidence": "Harmfulness rankings show all AI strategies outperform journalist captions (Fig. 2). However, manual evaluation reveals the AI judge systematically penalizes politically/culturally specific language that humans consider appropriate (Section 4.1, Table 2). Precision for journalist captions was only 6% with 180 false positives.",
    370       "supported": "weak"
    371     },
    372     {
    373       "claim": "The AI-as-judge exhibits systematic bias toward neutrality, penalizing contextually appropriate journalistic language.",
    374       "evidence": "Manual evaluation of 500 images showed the AI frequently flagged references to 'Palestinians', 'Jewish communities', 'migrants' as harmful when human annotators deemed them editorially necessary. Journalist captions had 180 false positives vs 5 false negatives (Table 2, Section 4.1).",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Cronbach's Alpha of 0.67 indicates moderate internal consistency in the AI's harmfulness rankings.",
    379       "evidence": "Cronbach's Alpha test reported at 0.67 (Section 4).",
    380       "supported": "strong"
    381     }
    382   ],
    383   "red_flags": [
    384     {
    385       "flag": "Same model generates and evaluates",
    386       "detail": "GPT-4o is used both to generate captions and to evaluate them for harmfulness. This circularity means the model is likely to prefer its own style of neutral, decontextualized language, biasing results against journalist captions. The paper acknowledges this tension but does not use an independent evaluator."
    387     },
    388     {
    389       "flag": "Single model, single seed, no variance",
    390       "detail": "All experiments use GPT-4o only, with temperature=0 and seed=10. No analysis of whether results change with different models, seeds, or temperature settings. No statistical tests applied to any comparisons."
    391     },
    392     {
    393       "flag": "Abstract overclaims relative to evidence",
    394       "detail": "The abstract states AI captions 'often minimize representational harms more effectively' but the paper's own analysis shows the AI judge systematically penalizes appropriate journalistic language (precision for journalist captions = 6%). The metric may be measuring neutrality preference, not actual harm reduction."
    395     },
    396     {
    397       "flag": "Evaluation prompt not reproduced",
    398       "detail": "The AI-as-judge evaluation prompt is described in natural language but not fully reproduced, making the evaluation methodology non-reproducible despite being central to all quantitative results."
    399     }
    400   ],
    401   "cited_papers": [
    402     {
    403       "title": "Taxonomizing and measuring representational harms: a look at image tagging",
    404       "authors": ["J. Katzman", "A. Wang", "M. Scheuerman", "S.L. Blodgett", "K. Laird", "H. Wallach", "S. Barocas"],
    405       "year": 2023,
    406       "relevance": "Foundational framework for representational harms in AI image systems, directly relevant to evaluating AI system outputs and biases."
    407     },
    408     {
    409       "title": "Measuring representational harms in image captioning",
    410       "authors": ["A. Wang", "S. Barocas", "K. Laird", "H. Wallach"],
    411       "year": 2022,
    412       "relevance": "Proposes methods for measuring stereotypes and errors in AI-generated image captions, directly relevant to AI evaluation methodology."
    413     },
    414     {
    415       "title": "On the dangers of stochastic parrots: can language models be too big?",
    416       "authors": ["E.M. Bender", "T. Gebru", "A. McMillan-Major", "S. Shmitchell"],
    417       "year": 2021,
    418       "relevance": "Seminal critique of large language model risks and biases, relevant to understanding LLM limitations and responsible AI deployment."
    419     },
    420     {
    421       "title": "Taxonomy of risks posed by language models",
    422       "authors": ["L. Weidinger", "J. Uesato", "M. Rauh"],
    423       "year": 2022,
    424       "relevance": "Comprehensive risk taxonomy for LLMs relevant to evaluating safety and bias in AI systems."
    425     },
    426     {
    427       "title": "Bias and fairness in large language models: a survey",
    428       "authors": ["I.O. Gallegos", "R.A. Rossi", "J. Barrow"],
    429       "year": 2024,
    430       "relevance": "Survey of bias and fairness in LLMs, directly relevant to the survey's scope of evaluating AI system quality and safety."
    431     },
    432     {
    433       "title": "Self-taught evaluators",
    434       "authors": ["T. Wang", "I. Kulikov", "O. Golovneva"],
    435       "year": 2024,
    436       "relevance": "Relevant to AI-as-judge methodology and LLM self-evaluation, a key methodological pattern in the survey scope."
    437     },
    438     {
    439       "title": "Meta-rewarding language models: self-improving alignment with LLM-as-a-meta-judge",
    440       "authors": ["T. Wu", "W. Yuan", "O. Golovneva"],
    441       "year": 2024,
    442       "relevance": "LLM-as-judge methodology for alignment evaluation, relevant to AI evaluation and safety research."
    443     },
    444     {
    445       "title": "Jailbroken: How does LLM safety training fail?",
    446       "authors": ["A. Wei", "N. Haghtalab", "J. Steinhardt"],
    447       "year": 2024,
    448       "relevance": "Analysis of LLM safety training failures and jailbreaking, directly relevant to AI safety evaluation."
    449     }
    450   ]
    451 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs