scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28274B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Investigating Intersectional Bias in Large Language Models using Confidence Disparities in Coreference Resolution",
      6     "authors": [
      7       "Falaah Arif Khan",
      8       "Nivedha Sivakumar",
      9       "Yinong Oliver Wang",
     10       "Katherine Metcalf",
     11       "Cezanne Camacho",
     12       "Barry-John Theobald",
     13       "Luca Zappella",
     14       "Nicholas Apostoloff"
     15     ],
     16     "year": 2025,
     17     "venue": "COLM 2025",
     18     "arxiv_id": "2508.07111",
     19     "doi": "10.48550/arXiv.2508.07111"
     20   },
     21   "checklist": {
     22     "claims_and_evidence": {
     23       "abstract_claims_supported": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "All quantitative claims in the abstract (245,700 prompts, disparities up to 40%, doubly-disadvantaged identities worst off, hegemonic marker drops) are backed by Tables 2–5 and Figs. 2–10 in the paper.",
     27         "source": "haiku"
     28       },
     29       "causal_claims_justified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The paper claims LLMs rely on 'memorization rather than logical reasoning' and that augmentations 'trigger and reveal bias,' but the observational design only shows correlation/disparity patterns; no controlled causal experiments (e.g., training data ablations or probing) are performed.",
     33         "source": "haiku"
     34       },
     35       "generalization_bounded": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Abstract and conclusion make broad claims about 'LLMs' and their use 'in hiring and admissions,' but only five specific open-weight models are tested on one coreference corpus (WinoBias derivatives); closed-source and larger frontier models are excluded with no qualification of this scope gap.",
     39         "source": "haiku"
     40       },
     41       "alternative_explanations_discussed": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "The paper includes a non-demographic augmentation baseline to distinguish demographic-specific brittleness from general brittleness (App. A.6), and explicitly contrasts memorization vs. reasoning as alternative explanations for performance drops.",
     45         "source": "haiku"
     46       },
     47       "proxy_outcome_distinction": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper explicitly defines coreference confidence (Eq. 1) as next-token log-probability differences, frames this as a proxy for fairness under uncertainty, and distinguishes it from accuracy-based measures, citing Kuzucu et al. (2024) for theoretical grounding.",
     51         "source": "haiku"
     52       }
     53     },
     54     "limitations_and_scope": {
     55       "limitations_section_present": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Section 5 contains a dedicated 'Limitations and Future Work' paragraph listing scope restrictions, and Section 6 'Ethical Considerations' adds further limitations.",
     59         "source": "haiku"
     60       },
     61       "threats_to_validity_specific": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Specific limitations named: restriction to 50 identities and 1,575 WinoBias sentences, combinatorial explosion in evaluation, US-centric demographic markers, and inability to translate mathematical unfairness to practical social harm threshold.",
     65         "source": "haiku"
     66       },
     67       "scope_boundaries_stated": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The paper explicitly states 'Our research is specifically focused on the US context, and therefore, generalizability of our findings to other cultural and sociolinguistic settings is uncertain and may be limited.'",
     71         "source": "haiku"
     72       }
     73     },
     74     "conflicts_of_interest": {
     75       "funding_disclosed": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No funding acknowledgment or disclosure section exists in the paper; only author affiliations are noted.",
     79         "source": "haiku"
     80       },
     81       "affiliations_disclosed": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "All eight authors are disclosed as Apple employees or as doing work at Apple; affiliations are stated on the title page.",
     85         "source": "haiku"
     86       },
     87       "funder_independent_of_outcome": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "The paper evaluates five external open-weight models (Mistral, Mixtral, Llama3, Pythia, Falcon) rather than Apple's own products, so the funder (Apple) is not directly advantaged by the findings.",
     91         "source": "haiku"
     92       },
     93       "financial_interests_declared": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No competing interests or financial interests statement appears anywhere in the paper.",
     97         "source": "haiku"
     98       }
     99     },
    100     "scope_and_framing": {
    101       "key_terms_defined": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Coreference confidence (Eq. 1), coreference confidence disparity (Eq. 2), intersectional bias, hegemonic/disadvantaged identities, Type-1/Type-2 sentences, and augmentation types (R-Aug, NR-Aug, C-Aug) are all formally defined.",
    105         "source": "haiku"
    106       },
    107       "intended_contribution_clear": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 1 explicitly states two contributions: (1) the WinoIdentity benchmark and flexible construction framework, and (2) empirical evaluation findings raising validity and value-misalignment concerns.",
    111         "source": "haiku"
    112       },
    113       "engagement_with_prior_work": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Section 2 distinguishes this work from 15+ prior benchmarks on single-axis vs. intersectional fairness, uncertainty-based vs. accuracy-based evaluation, and from contemporaneous work by Kotek et al. (2023), explaining how this paper extends and differs from each.",
    117         "source": "haiku"
    118       }
    119     }
    120   },
    121   "type_checklist": {
    122     "empirical": {
    123       "artifacts": {
    124         "code_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "A GitHub URL (https://github.com/apple/ml-winoidentity) is provided in Section 1 footnote 1.",
    128           "source": "haiku"
    129         },
    130         "data_released": {
    131           "applies": true,
    132           "answer": true,
    133           "justification": "WinoIdentity is released via the same GitHub repository; the paper also relies on WinoBias, a publicly available benchmark.",
    134           "source": "haiku"
    135         },
    136         "environment_specified": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "No requirements.txt, Dockerfile, or dependency list is mentioned in the paper; only greedy decoding is specified as an implementation choice.",
    140           "source": "haiku"
    141         },
    142         "reproduction_instructions": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "The augmentation procedure is described mathematically but no step-by-step instructions for reproducing experiments (model loading, tokenization, evaluation loop) are provided in the paper.",
    146           "source": "haiku"
    147         }
    148       },
    149       "statistical_methodology": {
    150         "confidence_intervals_or_error_bars": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "Table 2 (the primary result table showing disparities up to 40%) reports only point estimates; Table 3 reports mean ± std for average confidence but not for the key disparity metric.",
    154           "source": "haiku"
    155         },
    156         "significance_tests": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "No statistical hypothesis tests are applied anywhere in the paper; results are reported as raw averages and maximum disparities without p-values or confidence intervals.",
    160           "source": "haiku"
    161         },
    162         "effect_sizes_reported": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "Disparities are reported as raw differences on a [0,1] scale (e.g., 0.40 for socio-economic status in Mistral), providing an interpretable effect size relative to the possible range.",
    166           "source": "haiku"
    167         },
    168         "sample_size_justified": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "The 1,575 base sentences are taken directly from WinoBias with no power analysis or sample size justification; the 25 demographic markers are chosen based on The Wheel of Power and Privilege without statistical rationale.",
    172           "source": "haiku"
    173         },
    174         "variance_reported": {
    175           "applies": true,
    176           "answer": true,
    177           "justification": "Table 3 reports mean ± std for coreference confidence under each augmentation condition (e.g., '0.322 ± 0.693'), providing variance information for absolute confidence scores.",
    178           "source": "haiku"
    179         }
    180       },
    181       "evaluation_design": {
    182         "baselines_included": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "Both a 'no augmentation' baseline (original WinoBias) and a 'non-demographic augmentation' baseline (neutral adjectives like 'confused,' 'relaxed') are included to contextualize disparity results.",
    186           "source": "haiku"
    187         },
    188         "baselines_contemporary": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Five models are compared: Mistral-7B-instruct-v0.2, Mixtral-8x7B, Llama3-70B, Pythia-12B, and Falcon-40B, all released 2022–2024 and representing the contemporary open-weight landscape.",
    192           "source": "haiku"
    193         },
    194         "ablation_study": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "Three augmentation types (R-Aug, NR-Aug, C-Aug) serve as a structured ablation isolating where demographic information is placed; CoT vs. no-CoT for Mistral (App. A.5) is an additional ablation.",
    198           "source": "haiku"
    199         },
    200         "multiple_metrics": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "Both coreference confidence disparity (uncertainty-based) and accuracy disparity (Table 5) are reported, enabling comparison between the two evaluation approaches.",
    204           "source": "haiku"
    205         },
    206         "human_evaluation": {
    207           "applies": false,
    208           "answer": false,
    209           "justification": "The study is a fully automated benchmark evaluation; human judgment of model outputs is not part of the methodology.",
    210           "source": "haiku"
    211         },
    212         "held_out_test_set": {
    213           "applies": false,
    214           "answer": false,
    215           "justification": "This is an evaluation study, not a training/prediction task; the WinoIdentity corpus IS the test set and no train/test split is needed.",
    216           "source": "haiku"
    217         },
    218         "per_category_breakdown": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "Results are broken down by demographic attribute (10 categories), model (5), sentence type (Type-1/Type-2), augmentation type (3), and referent occupation (Figs. 4, 11–14), providing extensive granularity.",
    222           "source": "haiku"
    223         },
    224         "failure_cases_discussed": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "Section 4.2 explicitly analyzes failure cases: fem:mechanic avg confidence -0.065 vs. transgender:fem:mechanic -0.11 and gay:fem:mechanic -0.24, quantifying double-disadvantage effects per occupation.",
    228           "source": "haiku"
    229         },
    230         "negative_results_reported": {
    231           "applies": true,
    232           "answer": true,
    233           "justification": "App. A.5 reports that CoT prompting reduces overall confidence while only partially reducing disparities — a negative result for CoT as a mitigation strategy — presented transparently.",
    234           "source": "haiku"
    235         }
    236       },
    237       "setup_transparency": {
    238         "model_versions_specified": {
    239           "applies": true,
    240           "answer": true,
    241           "justification": "Exact model identifiers are provided: mistral-7B-instruct-v0.2, mixtral-8x7B-instruct, llama3-70b-instruct, pythia-12B, falcon-40B-instruct.",
    242           "source": "haiku"
    243         },
    244         "prompts_provided": {
    245           "applies": true,
    246           "answer": true,
    247           "justification": "Section 3.1 shows example prompts, Section 4 describes the exact format ('The pronoun [pronoun] refers to the'), and App. A.5 provides a full multi-turn CoT prompt example.",
    248           "source": "haiku"
    249         },
    250         "hyperparameters_reported": {
    251           "applies": true,
    252           "answer": true,
    253           "justification": "Greedy decoding is explicitly specified ('We use greedy decoding as this ensures deterministic predictions for reproducibility'); no sampling hyperparameters apply.",
    254           "source": "haiku"
    255         },
    256         "scaffolding_described": {
    257           "applies": false,
    258           "answer": false,
    259           "justification": "No agentic scaffolding is used; models are queried directly via next-token probability extraction.",
    260           "source": "haiku"
    261         },
    262         "data_preprocessing_documented": {
    263           "applies": true,
    264           "answer": true,
    265           "justification": "The full augmentation procedure is documented in Section 3.2 and Fig. 1, including the formula for augmented set size (2(|G|+1)) and how contrastive markers are handled for multi-value attributes.",
    266           "source": "haiku"
    267         }
    268       },
    269       "data_integrity": {
    270         "raw_data_available": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "The WinoIdentity dataset (245,700 prompts) is released at the GitHub URL provided; raw benchmark data is publicly accessible.",
    274           "source": "haiku"
    275         },
    276         "data_collection_described": {
    277           "applies": true,
    278           "answer": true,
    279           "justification": "Section 3.2 fully describes the data construction pipeline: source corpus (WinoBias), demographic markers (Table 1, The Wheel of Power and Privilege), augmentation logic, and resulting corpus size.",
    280           "source": "haiku"
    281         },
    282         "recruitment_methods_described": {
    283           "applies": false,
    284           "answer": false,
    285           "justification": "No human participants are involved; the benchmark is constructed programmatically from existing corpora.",
    286           "source": "haiku"
    287         },
    288         "data_pipeline_documented": {
    289           "applies": true,
    290           "answer": true,
    291           "justification": "The complete pipeline from WinoBias base sentences through augmentation types (R-Aug, NR-Aug, C-Aug) to the final 245,700 prompts is documented in Section 3 with formal notation.",
    292           "source": "haiku"
    293         }
    294       },
    295       "contamination": {
    296         "training_cutoff_stated": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "No training data cutoffs are stated for any of the five evaluated models; the contamination concern is raised motivationally but not addressed with specific cutoff dates.",
    300           "source": "haiku"
    301         },
    302         "train_test_overlap_discussed": {
    303           "applies": true,
    304           "answer": true,
    305           "justification": "Section 5 explicitly raises that 'existing fairness benchmarks are likely to be included in LLM training data and potentially memorized,' which directly motivates the augmentation framework as a contamination mitigation strategy.",
    306           "source": "haiku"
    307         },
    308         "benchmark_contamination_addressed": {
    309           "applies": true,
    310           "answer": true,
    311           "justification": "The paper explicitly designs WinoIdentity as a contamination-resistant extension of WinoBias, arguing that the augmented prompts are unlikely to appear verbatim in training corpora.",
    312           "source": "haiku"
    313         }
    314       },
    315       "human_studies": {
    316         "pre_registered": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No human participants; not applicable.",
    320           "source": "haiku"
    321         },
    322         "irb_or_ethics_approval": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human participants; not applicable.",
    326           "source": "haiku"
    327         },
    328         "demographics_reported": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human participants; not applicable.",
    332           "source": "haiku"
    333         },
    334         "inclusion_exclusion_criteria": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human participants; not applicable.",
    338           "source": "haiku"
    339         },
    340         "randomization_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No human participants; not applicable.",
    344           "source": "haiku"
    345         },
    346         "blinding_described": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human participants; not applicable.",
    350           "source": "haiku"
    351         },
    352         "attrition_reported": {
    353           "applies": false,
    354           "answer": false,
    355           "justification": "No human participants; not applicable.",
    356           "source": "haiku"
    357         }
    358       },
    359       "cost_and_practicality": {
    360         "inference_cost_reported": {
    361           "applies": true,
    362           "answer": false,
    363           "justification": "No inference costs or latency figures are reported for running 245,700 prompts across five models.",
    364           "source": "haiku"
    365         },
    366         "compute_budget_stated": {
    367           "applies": true,
    368           "answer": false,
    369           "justification": "No total compute budget, GPU hours, or hardware specifications are disclosed.",
    370           "source": "haiku"
    371         }
    372       }
    373     }
    374   },
    375   "claims": [
    376     {
    377       "claim": "LLMs exhibit coreference confidence disparities as high as 40% across intersectional demographic groups, with the largest disparities for socio-economic status, body type, disability, and sexual orientation.",
    378       "evidence": "Table 2 shows Mistral disparities of 0.389 (Type-1) and 0.400 (Type-2) for socio-economic status, 0.392 for body type (Type-2); Table 4 extends these findings across all augmentation types.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Doubly-disadvantaged intersectional identities (e.g., gay or transgender women) in anti-stereotypical occupational settings (e.g., mechanic) show exacerbated underconfidence compared to single-axis disadvantaged groups.",
    383       "evidence": "Section 4.2 reports: fem:mechanic avg confidence -0.065, transgender:fem:mechanic -0.11, gay:fem:mechanic -0.24 for Mistral C-Aug conditions.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Coreference confidence decreases even for hegemonic/privileged demographic markers (White, cisgender, heterosexual) after referent augmentation, suggesting LLMs rely on memorization rather than genuine reasoning.",
    388       "evidence": "Figs. 9 and 10 (App. A.2) show confidence decreases for all 10 demographic attributes including hegemonic markers for all models except Pythia, which was already near-zero confidence.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Less performant models (Pythia) show smaller confidence disparities than more performant models (Mistral), revealing a performance-fairness tradeoff.",
    393       "evidence": "Table 2 shows Pythia disparities below 0.12 in all demographic attributes vs. Mistral disparities of 0.20–0.40 in 7 of 10 attributes; this tradeoff is explicitly discussed in Section 4.1.",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "Chain-of-thought prompting reduces confidence disparities but also decreases overall coreference confidence, trading confidence for parity.",
    398       "evidence": "Tables 6–9 (App. A.5) show that CoT generally halves disparities (e.g., ses Type-2: 0.400→0.216) but also substantially reduces mean confidence (e.g., no-aug Type-2 fem: 0.600→0.480).",
    399       "supported": "moderate"
    400     },
    401     {
    402       "claim": "Uncertainty-based evaluation detects bias that accuracy-based measures obscure, as models can improve accuracy by becoming more biased toward non-referent exclusion.",
    403       "evidence": "App. A.3.1 (Table 3) shows accuracy increases under non-referent augmentation for all models except Llama3, while coreference confidence decreases on Type-2 sentences — the spurious accuracy gain masks bias.",
    404       "supported": "strong"
    405     }
    406   ],
    407   "methodology_tags": [
    408     "benchmark-eval",
    409     "observational"
    410   ],
    411   "key_findings": "The paper introduces WinoIdentity, a 245,700-prompt intersectional fairness benchmark built by augmenting WinoBias with 25 demographic markers across 10 attributes intersected with binary gender. In an evaluation of five open-weight LLMs, confidence disparities exceed 20% in 7 of 10 demographic attributes, reaching 40% for socio-economic status and body type, with doubly-disadvantaged identities (e.g., gay or transgender women assigned to male-dominated occupations) showing the most severe underconfidence. Hegemonic markers (White, cisgender) also cause confidence drops — not just disadvantaged ones — suggesting models rely on memorization rather than intersectional reasoning. Two independent failure modes are identified: validity failure (poor performance on augmented prompts indicating memorization) and value misalignment (disparate confidence across demographic groups), which can compound to amplify real-world identity-based harms.",
    412   "red_flags": [
    413     {
    414       "flag": "Memorization claim unsupported by design",
    415       "detail": "The conclusion that LLMs 'rely on memorization rather than reasoning' is inferred from performance degradation under augmentation, but no direct test of memorization (training data membership inference, probing, or causal intervention) is performed."
    416     },
    417     {
    418       "flag": "Frontier models excluded",
    419       "detail": "Only five older open-weight models are evaluated; no closed-source or frontier models (GPT-4, Claude, Gemini) appear despite the paper claiming broad implications for LLM deployment in hiring and admissions."
    420     },
    421     {
    422       "flag": "Binary gender only",
    423       "detail": "Despite focusing on intersectional identity, the gender axis uses only binary masculine/feminine pronouns, excluding non-binary identities that are explicitly mentioned as vulnerable in related work (Hossain et al., 2023)."
    424     },
    425     {
    426       "flag": "No significance testing on main results",
    427       "detail": "All disparity comparisons in Tables 2, 4, 5 are point estimates without confidence intervals or hypothesis tests; it is unclear whether observed differences exceed measurement noise, especially for smaller disparities."
    428     },
    429     {
    430       "flag": "Overgeneralization from 5 models to 'LLMs'",
    431       "detail": "Conclusions frame findings as properties of 'LLMs' generally, but coverage is limited to five specific open-weight models from 2022–2024, with no scope qualification in the abstract or conclusion."
    432     }
    433   ],
    434   "cited_papers": [
    435     {
    436       "title": "Gender bias in coreference resolution: Evaluation and debiasing methods (WinoBias)",
    437       "relevance": "Foundation corpus extended by this work; coreference resolution bias benchmark this study is built upon."
    438     },
    439     {
    440       "title": "Uncertainty as a fairness measure",
    441       "relevance": "Provides theoretical grounding for uncertainty-based fairness evaluation, which the paper's core metric (coreference confidence disparity) is based on."
    442     },
    443     {
    444       "title": "A validity perspective on evaluating the justified use of data-driven decision-making algorithms",
    445       "relevance": "Framework distinguishing validity from value alignment that structures the paper's two-failure-mode analysis."
    446     },
    447     {
    448       "title": "Gender bias and stereotypes in large language models (Kotek et al., 2023)",
    449       "relevance": "Contemporaneous work extending WinoBias to evaluate reasoning strategies; paper explicitly contrasts its uncertainty-based approach against this accuracy-based one."
    450     },
    451     {
    452       "title": "Bias out-of-the-box: an empirical analysis of intersectional occupational biases in popular generative language models (Kirk et al., 2024)",
    453       "relevance": "Prior work on intersectional occupational bias in LLMs that this paper builds upon and extends to uncertainty-based evaluation."
    454     },
    455     {
    456       "title": "Intersectional stereotypes in large language models: Dataset and analysis (Ma et al., 2023)",
    457       "relevance": "Related intersectional stereotype dataset using 6 demographic attributes; compared in related work as prior intersectionality benchmark."
    458     },
    459     {
    460       "title": "Gender, race, and intersectional bias in resume screening via language model retrieval (Wilson & Caliskan, 2024)",
    461       "relevance": "Directly evaluates downstream allocational harms (resume screening) that motivate this paper's focus on LLM validity and fairness."
    462     },
    463     {
    464       "title": "Factoring the matrix of domination: A critical review and reimagination of intersectionality in AI fairness (Ovalle et al., 2023)",
    465       "relevance": "Comprehensive review of intersectionality in fair-ML cited as foundational context for the intersectional framing."
    466     }
    467   ],
    468   "engagement_factors": {
    469     "practical_relevance": {
    470       "score": 2,
    471       "justification": "Directly addresses LLM bias in high-stakes hiring/admissions contexts with a released benchmark, but provides no practical mitigation strategies beyond noting CoT has limited effectiveness."
    472     },
    473     "surprise_contrarian": {
    474       "score": 2,
    475       "justification": "The finding that hegemonic/privileged markers (White, cisgender) also cause confidence drops contradicts the expected 'unmarked' behavior and challenges simple bias narratives."
    476     },
    477     "fear_safety": {
    478       "score": 2,
    479       "justification": "Explicitly raises concerns about real-world discrimination in hiring (e.g., down-ranking resumes mentioning 'Black Feminist Scholars' or 'Neurodivergent in AI Affinity Group')."
    480     },
    481     "drama_conflict": {
    482       "score": 1,
    483       "justification": "Standard academic bias evaluation paper; the Apple affiliation evaluating non-Apple models is mildly interesting but not inherently controversial."
    484     },
    485     "demo_ability": {
    486       "score": 2,
    487       "justification": "The WinoIdentity benchmark and code are publicly released at GitHub, allowing practitioners and researchers to reproduce the evaluation immediately."
    488     },
    489     "brand_recognition": {
    490       "score": 2,
    491       "justification": "Apple Research authorship and COLM 2025 venue provide moderate brand recognition; not from a top-tier AI lab primarily known for LLMs."
    492     }
    493   },
    494   "hn_data": {
    495     "threads": [
    496       {
    497         "hn_id": "44582856",
    498         "title": "A LoD of Gaussians: Ultra-Large Scale Reconstruction with External Memory",
    499         "points": 2,
    500         "comments": 0,
    501         "url": "https://news.ycombinator.com/item?id=44582856",
    502         "created_at": "2025-07-16T14:40:05Z"
    503       },
    504       {
    505         "hn_id": "44561157",
    506         "title": "The Kinematic Age of 3I/Atlas and Its Implications for Early Planet Formation",
    507         "points": 2,
    508         "comments": 0,
    509         "url": "https://news.ycombinator.com/item?id=44561157",
    510         "created_at": "2025-07-14T15:12:25Z"
    511       },
    512       {
    513         "hn_id": "45466398",
    514         "title": "Who's Advertising to Your AI?",
    515         "points": 1,
    516         "comments": 1,
    517         "url": "https://news.ycombinator.com/item?id=45466398",
    518         "created_at": "2025-10-03T18:53:26Z"
    519       },
    520       {
    521         "hn_id": "45867965",
    522         "title": "FPI-Det: A Face–Phone Interaction Dataset for Phone-Use Detection",
    523         "points": 1,
    524         "comments": 0,
    525         "url": "https://news.ycombinator.com/item?id=45867965",
    526         "created_at": "2025-11-09T18:43:52Z"
    527       }
    528     ],
    529     "top_points": 2,
    530     "total_points": 6,
    531     "total_comments": 1
    532   }
    533 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs