scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (26518B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Exposing Privacy Gaps: Membership Inference Attack on Preference Data for LLM Alignment",
      6     "authors": [
      7       "Qizhang Feng",
      8       "Siva Rajesh Kasa",
      9       "Hyokun Yun",
     10       "Choon Hui Teo",
     11       "Sravan Bodapati"
     12     ],
     13     "year": 2024,
     14     "venue": "International Conference on Artificial Intelligence and Statistics",
     15     "arxiv_id": "2407.06443",
     16     "doi": "10.48550/arXiv.2407.06443"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims DPO is more vulnerable to MIA than PPO and introduces PREMIA; both are substantiated by theoretical derivations (Propositions 1-3, Theorem 2.1) and empirical AUROC comparisons across 9 model variants and 2 datasets.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The causal claim that DPO overfitting causes higher MIA susceptibility is backed by a formal theoretical derivation (Propositions 1-2 showing DPO's response estimation error bound is tighter than PPO's) and confirmed empirically by consistently higher AUROC for DPO.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The conclusion broadly states 'models trained with DPO are more susceptible to MIAs than those using PPO' without bounding to open-source models fine-tuned with LoRA under the specific hyperparameter configurations tested on these two datasets.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not discuss whether hyperparameter differences (DPO lr=5e-4 vs PPO lr=5.4e-5; DPO 3 epochs vs PPO 4 epochs) or LoRA regularization differences could partially explain the vulnerability gap rather than the algorithmic distinction between DPO and PPO alone.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper uses AUROC as the MIA effectiveness metric and explicitly frames PREMIA as an 'optimistic' attack that provides a practical upper bound on vulnerability, distinguishing measured attack success from actual privacy breach probability.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Limitations are combined with the conclusion in Section 5 ('Conclusion and Limitations'), not a dedicated standalone section.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The paper notes PREMIA requires base model access and doesn't address mitigations, but does not discuss confounded hyperparameters, single-run variance, or the MALT assumption's restrictiveness as specific threats to the empirical conclusions.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper states PREMIA doesn't work for closed-source LLMs, but does not explicitly bound results to open-source models fine-tuned with LoRA on these two specific datasets or the tested range of model sizes.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding disclosure statement is present; all authors are at Amazon Inc. but no explicit grant, contract, or funding source is declared.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All authors list Amazon Inc. as their affiliation on the title page.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Amazon employs all authors and has commercial interest in LLM alignment and security; the funder is not independent of outcomes in LLM alignment privacy research.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or declaration of patents, equity, or consulting interests is present beyond the Amazon affiliation.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "MIA, DPO, PPO, preference data tuples (x, yw, yl), the score function M, and AUROC are all formally defined in Section 2 with mathematical notation.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Introduction explicitly lists two contributions: (1) comparative vulnerability assessment with theoretical motivation, and (2) introduction of the PREMIA reference-based attack framework.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper situates itself against prior MIA work (Shokri 2017, Fu 2023, Shi 2024, Duan 2024), explains why existing frameworks are insufficient for preference data tuples, and explicitly builds on Li et al. (2023) and Sablayrolles et al. (2019) for the theoretical analysis.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "The paper's own checklist claims code is available 'as footnotes in §4.2,' but the only links in §4.2 and Appendix C are to existing open-source packages (TRL, PEFT, BitsAndBytes), not to a paper-specific repository with experimental code.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Both datasets are publicly available: Stack-Exchange-Paired is linked at huggingface.co/datasets/lvwerra/stack-exchange-paired and IMDB-RLHF-Pair is from Rafailov et al. (2024).",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Appendix C names TRL, PEFT, and BitsAndBytes packages and lists hyperparameters but provides no version numbers, requirements.txt, or Dockerfile; insufficient for exact environment reproduction.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are provided; Appendix C gives hyperparameters but not an end-to-end runnable pipeline or scripts.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All AUROC values in Tables 1-5 are point estimates with no confidence intervals or error bars; experiments appear to be single runs.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "Comparative claims (DPO AUROC > PPO AUROC) are made across tables without any statistical significance tests or p-values.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "AUROC values are reported for both DPO and PPO in every cell, with the magnitude of the gap (e.g., 0.803 vs 0.521 for Mistral-7B-v0.1 on SE chosen) providing direct effect size context.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "No power analysis or justification is given for the 80k SE training samples or 20k IMDB training samples chosen.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No standard deviation or variance across multiple training runs or seeds is reported; all results are single-run point estimates.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Six existing MIA frameworks are used as baselines: Perplexity, Zlib, Lowercase, Ref (cross-model LM), MIN-K%, and Neighbourhood attack.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Baselines include MIN-K% (Shi et al. 2023/2024) and Neighbourhood attack (Mattern 2023), which are state-of-the-art MIA methods specifically for fine-tuned LLMs.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "PREMIA-base vs PREMIA-SFT tests different reference model choices but is not a formal ablation of PREMIA's design; no ablation of the ratio metric, tuple scoring, or threshold choices is performed.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Table 2 reports both MIA performance (AUROC) and a comprehensive set of utility metrics (reward, perplexity, MSSTR, Distinct-1/2, BERTScore, ROUGE, BLEU, METEOR).",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "Human evaluation is not applicable to this MIA security research paper.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "SE uses the 'data/evaluation' split as the non-member test set; IMDB uses the remaining data after the 20k training samples; member/non-member sets are distinct.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by model family and size (GPT2 series, Mistral, OpenLlama), dataset (SE vs IMDB), response type (Chosen vs Rejected vs Pair), and attack method across Tables 1-5.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "The paper explicitly discusses that PPO models are 'nearly impregnable to MIA,' that IMDB (easier task) shows lower DPO vulnerability, and that PREMIA fails for closed-source models without base model access.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "PPO-aligned models consistently show AUROC near 0.5 across all frameworks, indicating MIA is ineffective against PPO; this is clearly reported as a key finding.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Specific versioned model names are given: Gemma-2-2B, Mistral-7B-v0.3, Mistral-7B-v0.1, Open-llama-3b, Open-llama-7b, and GPT2/GPT2-medium/GPT2-large/GPT2-xl.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": false,
    243           "answer": false,
    244           "justification": "The tasks are dataset-driven (SE Q&A, IMDB sentiment) with no custom prompt templates or system instructions; prompts are the dataset examples themselves.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Appendix C provides full hyperparameters for SFT (lr=8e-5, epochs=2), PPO (KL=0.1, batch=16, epochs=4), and DPO (lr=5e-4, beta=0.4, epochs=3), plus LoRA and quantization settings.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is involved in this MIA study.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Data splits are documented (80k SE training samples from train/rl split, 20k IMDB training samples), and PPO training notes filtering data points with maximum length constraints.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "Both datasets are publicly accessible: SE at the linked HuggingFace URL, IMDB-RLHF-Pair from Rafailov et al.; raw data can be independently obtained.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Dataset structure is described: SE contains Stack Overflow Q&A pairs with upvote-based preference labels; IMDB-RLHF-Pair has sentiment-labeled response pairs with chosen/rejected structure.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "Standard public benchmarks are used; no participant recruitment involved.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": false,
    288           "justification": "The process for constructing the MIA evaluation set — specifically how non-member examples are sampled and balanced against member examples for AUROC computation — is not explicitly documented.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": false,
    295           "answer": false,
    296           "justification": "The paper studies MIA on fine-tuning preference data, not model capability on benchmarks; training cutoff contamination is not applicable.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": false,
    301           "answer": false,
    302           "justification": "NA — not evaluating model capabilities on NLP benchmarks where pre-training contamination would be relevant.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": false,
    307           "answer": false,
    308           "justification": "NA — the paper does not evaluate model capabilities on standard NLP benchmarks.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants involved.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants involved.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants involved.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants involved.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants involved.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants involved.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants involved.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "The computational cost of mounting PREMIA (inference over large models to compute probability ratios) is not reported; no latency or per-attack cost figures are given.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "The paper's self-checklist claims computing infrastructure is described, but no GPU type, cluster size, or training wall-clock time appears in the appendix text provided.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "DPO-aligned LLMs are substantially more susceptible to membership inference attacks than PPO-aligned models",
    375       "evidence": "AUROC for DPO consistently exceeds PPO across all 9 model variants and both datasets; e.g., PREMIA-SFT on Mistral-7B-v0.3 SE: DPO 0.789 vs PPO 0.543 for chosen responses; tuple-level DPO reaches 0.93 AUROC",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "DPO overfits on preference data compared to PPO, making it theoretically more vulnerable to MIA",
    380       "evidence": "Propositions 1-2 show r(π*)-r(πDPO) ≤ 2εr while r(π*)-r(πPPO) ≤ 2εr + 2εx; Figure 3 shows DPO reaching >90% train/eval accuracy within 0.2 epochs on IMDB",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "PREMIA consistently outperforms existing MIA baselines on preference data",
    385       "evidence": "PREMIA-SFT achieves the highest or second-highest AUROC in most columns of Table 1, outperforming PPL, Zlib, Lowercase, Ref, MIN-K, and N-hood baselines",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "MIA vulnerability varies with model size and task complexity in a task-dependent manner",
    390       "evidence": "GPT2-xl (1.5B) shows higher DPO vulnerability than Mistral-7B on SE; on IMDB (easier task) both PPO and DPO are more robust; contrasts with pretraining MIA literature where larger models are more vulnerable",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "PPO-aligned models are nearly impregnable to existing MIA frameworks",
    395       "evidence": "PPO AUROC across Table 1 and Table 4 is consistently 0.50-0.56 for all methods and all models, barely above random chance (0.5)",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "DPO and PPO have comparable utility performance despite DPO's higher MIA vulnerability",
    400       "evidence": "Table 2 shows similar BERTScore (0.877 vs 0.883), ROUGE (0.443 vs 0.457), BLEU, and METEOR for DPO and PPO on Mistral-7B SE, with PPO showing better reward and perplexity",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "theoretical"
    407   ],
    408   "key_findings": "DPO-aligned LLMs are substantially more vulnerable to membership inference attacks than PPO-aligned models, with AUROC up to 0.93 for DPO versus ~0.52 for PPO on the same data. The authors provide both theoretical grounding (DPO's direct optimization on preference data causes greater overfitting than PPO's reward-model-mediated optimization) and empirical confirmation across 9 model variants and 2 datasets. The proposed PREMIA framework, which treats preference tuple structure explicitly and uses the base model as reference, consistently outperforms existing MIA baselines. MIA vulnerability is modulated by task difficulty — on simpler tasks (IMDB sentiment) both methods become more robust — and by model size in a task-dependent way that inverts the trend seen for pre-training data MIA.",
    409   "red_flags": [
    410     {
    411       "flag": "No statistical significance tests",
    412       "detail": "AUROC comparisons between DPO and PPO across 9 models and 2 datasets are reported as point estimates with no confidence intervals, error bars, or significance tests, making it impossible to assess whether observed differences are statistically reliable."
    413     },
    414     {
    415       "flag": "Confounded hyperparameters",
    416       "detail": "DPO and PPO are trained with different learning rates (5e-4 vs 5.4e-5), epoch counts (3 vs 4), and regularization configurations; the vulnerability gap could partly reflect these differences rather than the algorithmic distinction between DPO and PPO alone."
    417     },
    418     {
    419       "flag": "MALT assumption added post-hoc",
    420       "detail": "Proposition 3 depends on the restrictive MALT assumption; a footnote states this 'was added in a later revision to address a limitation in the original analysis,' suggesting the original theoretical claim was overreaching."
    421     },
    422     {
    423       "flag": "No variance across runs",
    424       "detail": "All results appear to be single training runs with no seeds or repeated experiments reported; random variation in AUROC from a single run could be on the order of the observed differences for some model/dataset combinations."
    425     },
    426     {
    427       "flag": "Optimistic base-model-access assumption",
    428       "detail": "PREMIA requires access to the exact base model used for fine-tuning, which may not be available in real deployments; the high AUROC numbers (up to 0.93) reflect this optimistic assumption, not necessarily realistic attacker capability."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
    434       "relevance": "Foundational DPO method whose privacy properties are the central subject of this paper"
    435     },
    436     {
    437       "title": "Detecting Pre-Training Data from Large Language Models (Min-K%)",
    438       "relevance": "Key MIA baseline used for comparison and contextualizes MIA effectiveness on LLMs"
    439     },
    440     {
    441       "title": "Do Membership Inference Attacks Work on Large Language Models?",
    442       "relevance": "Prior work showing most MIAs barely outperform random guessing on pre-trained LLMs, motivating focus on fine-tuning/alignment data"
    443     },
    444     {
    445       "title": "White-box vs Black-box: Bayes Optimal Strategies for Membership Inference",
    446       "relevance": "Provides the MALT theoretical framework and Bayes optimal membership formulation used throughout the paper"
    447     },
    448     {
    449       "title": "Fundamental Limits of Membership Inference Attacks on Machine Learning Models",
    450       "relevance": "Provides overfitting-based MIA lower bounds used in Theorem 2.1"
    451     },
    452     {
    453       "title": "Policy Optimization in RLHF: The Impact of Out-of-Preference Data",
    454       "relevance": "Theoretical framework distinguishing PPO vs DPO optimization that grounds the DPO overfitting analysis"
    455     },
    456     {
    457       "title": "Is DPO Superior to PPO for LLM Alignment? A Comprehensive Study",
    458       "relevance": "Contextualizes the privacy-utility tradeoff findings: DPO's alignment advantage does not extend to privacy"
    459     },
    460     {
    461       "title": "Membership Inference Attacks Against Machine Learning Models",
    462       "relevance": "Foundational MIA paper establishing the attack framework and score-function formulation"
    463     }
    464   ],
    465   "engagement_factors": {
    466     "practical_relevance": {
    467       "score": 2,
    468       "justification": "Directly actionable for practitioners: choosing PPO over DPO for privacy-sensitive applications is a concrete, usable takeaway."
    469     },
    470     "surprise_contrarian": {
    471       "score": 2,
    472       "justification": "DPO is widely adopted as simpler and equally effective; revealing it has substantially worse privacy properties challenges mainstream alignment practice."
    473     },
    474     "fear_safety": {
    475       "score": 2,
    476       "justification": "Raises concrete privacy risks for organizations collecting human preference data for RLHF — individual annotator inputs can be identified through MIA."
    477     },
    478     "drama_conflict": {
    479       "score": 1,
    480       "justification": "Moderate tension between DPO's popularity and its privacy weakness, but no major named controversy."
    481     },
    482     "demo_ability": {
    483       "score": 1,
    484       "justification": "Uses public HuggingFace datasets and TRL framework, but no demo or runnable notebook is provided."
    485     },
    486     "brand_recognition": {
    487       "score": 1,
    488       "justification": "Amazon-affiliated authors; paper mentions Claude and ChatGPT in passing but evaluates only open-source models."
    489     }
    490   },
    491   "hn_data": {
    492     "threads": [
    493       {
    494         "hn_id": "36858335",
    495         "title": "No Train No Gain:Revisiting Efficient Training Algrthm for Transformer-BasedLM",
    496         "points": 11,
    497         "comments": 1,
    498         "url": "https://news.ycombinator.com/item?id=36858335"
    499       },
    500       {
    501         "hn_id": "42566444",
    502         "title": "DeepSeek-V2: A Strong, Economical, and Efficient MOE Language Model",
    503         "points": 3,
    504         "comments": 0,
    505         "url": "https://news.ycombinator.com/item?id=42566444"
    506       },
    507       {
    508         "hn_id": "27847063",
    509         "title": "Learning to Recommend Items to Wikidata Editors",
    510         "points": 3,
    511         "comments": 0,
    512         "url": "https://news.ycombinator.com/item?id=27847063"
    513       },
    514       {
    515         "hn_id": "40107757",
    516         "title": "A Comprehensive Overview of Large Language Models",
    517         "points": 2,
    518         "comments": 0,
    519         "url": "https://news.ycombinator.com/item?id=40107757"
    520       },
    521       {
    522         "hn_id": "37514790",
    523         "title": "A Comprehensive Overview of Large Language Models",
    524         "points": 2,
    525         "comments": 0,
    526         "url": "https://news.ycombinator.com/item?id=37514790"
    527       },
    528       {
    529         "hn_id": "42084557",
    530         "title": "AI Knowledge and Reasoning: Emulating Expert Creativity in Scientific Research",
    531         "points": 1,
    532         "comments": 2,
    533         "url": "https://news.ycombinator.com/item?id=42084557"
    534       }
    535     ],
    536     "top_points": 11,
    537     "total_points": 22,
    538     "total_comments": 3
    539   }
    540 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs