scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (30224B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Detecting Adversarial Fine-tuning with Auditing Agents",
      6     "authors": [
      7       "Sarah Egler",
      8       "John Schulman",
      9       "Nicholas Carlini"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2510.16255",
     14     "doi": "10.48550/arXiv.2510.16255"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "Key abstract claims (56.2% detection rate at 1% FPR, detection of covert cipher attacks, failure of basic content moderation, over 1400 audits) are directly supported by Table 1, Figure 6, Appendix C, and the experimental setup description.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper uses systematic ablation of tool configurations (9 configurations in Table 1) to support causal claims about which affordances improve detection, which is adequate for this type of agent system study.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The conclusion explicitly acknowledges that '50% true positive rate at 1% false positive rate means attackers can still frequently evade the detector,' and the evaluation is bounded to 8 attacks from the literature using GPT models via OpenAI API.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper does not discuss whether detection success could stem from Claude's safety training creating bias toward flagging cipher-like content, or whether the auditor's performance reflects pattern matching rather than genuine reasoning about harm.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper clearly distinguishes between the risk score (proxy) and actual adversarial fine-tuning (outcome), and explicitly discusses how benign models can trigger high risk scores through unintentional safety degradation.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "There is no dedicated limitations section; limitations are scattered across the conclusion (Section 6) and adversarial robustness section (5.6) without consolidation.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Section 5.6 specifically identifies future evasion strategies (covert steganography, clean-label backdoors, semantic backdoor triggers, dataset poisoning to distract the auditor), and Section 5.3 gives specific false positive sources including low resource language and educational insecure code fine-tunes.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The paper explicitly states scope: 'we seek to prevent deployment of adversarial models and leave additional inference time monitoring of fine-tuned models to future work,' bounded to SFT attacks on GPT models via OpenAI API.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section 7 (Acknowledgments) discloses funding from the MATS program and Anthropic Fellows Program with additional support from Constellation.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All author affiliations are disclosed on the title page: Sarah Egler (MATS & Anthropic Fellows Program), John Schulman (Thinking Machines Lab), Nicholas Carlini (Anthropic).",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "Anthropic is both a funder and employer of co-author Carlini; the paper uses Claude Sonnet 4 as the primary auditing agent and reports it outperforms competitor models (o3, Qwen 2.5 72b), creating a direct conflict between funder interests and reported outcomes.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "There is no competing interests statement or declaration of financial interests (patents, equity, consulting); the acknowledgments section covers only funding sources.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms are defined: 'fine-tuning auditing agent' (Section 3), 'attack-specific elicitation' (Section 2), and the threat model (Section 2.1) precisely defines the adversary and defender roles and assumptions.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper clearly states its contribution: introducing fine-tuning auditing agents, demonstrating 56.2% detection on 8 diverse attacks, and releasing the auditor as a baseline for future work.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 2 substantively engages with prior adversarial fine-tuning attacks, content moderation defenses, and alignment auditing agents (Bricken et al., Marks et al.), explicitly distinguishing this work by its false-positive constraint and access to the pre-fine-tuned model.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "The auditing agent is released at https://github.com/safety-research/finetuning-auditor as stated in the abstract—a current release, not a future promise.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "All adversarial fine-tuning datasets are from publicly available literature sources, with Appendix B providing full dataset links, base models, and hyperparameters; the HEx-PHI evaluation benchmark is also publicly available.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "The paper mentions Python and LangChain but provides no requirements.txt, Dockerfile, or explicit dependency specification in the paper text; reproducibility depends on what the GitHub repository contains.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": true,
    140           "justification": "The Reproducibility Statement commits to open-sourcing code, Appendix A provides full auditor prompts, Appendix D provides tool implementations, and Appendix B provides fine-tuning details; though full reproduction requires paid OpenAI API access (~$120/fine-tune).",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "Table 1 reports only point estimates for detection rates across all tool configurations; no confidence intervals or error bars are provided despite 20 independent audits per condition.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No statistical significance tests are used when comparing detection rates across tool configurations or auditor models, despite making comparative claims such as 'super-agent significantly outperforms single-agent.'",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Detection rates at multiple FPR thresholds (0%, 1%, 5%) and AUROC values are reported in Table 1, providing quantitative effect sizes for comparisons across tool configurations.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The choice of 20 independent audits per fine-tune per configuration is not formally justified; no power analysis or reasoning for this number is provided.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "Figure 5 shows risk score distributions visually, but the main results in Table 1 report only point estimates without standard deviations or confidence intervals, making reliability of comparisons unassessable.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "The OpenAI Moderation API serves as a content filtering baseline (Appendix C shows it fails on most attacks), and single-tool configurations in Table 1 serve as ablation baselines for the full tool set.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "The OpenAI moderation API baseline uses 'omni-moderation-latest'; Youstra (2024) probe monitors for cipher attacks are mentioned as contemporary prior work in the same space.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Table 1 presents a comprehensive ablation of 9 tool configurations (D, M, B, M+D, M+D+B, M+D+B*, M+D+S, M+D+B*+S, M+D+B*+S+P) systematically isolating each component's contribution.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Multiple metrics are used: TPR at 0%, 1%, and 5% FPR thresholds, AUROC (Table 1), and per-attack detection rates (Figure 6).",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": false,
    203           "answer": false,
    204           "justification": "Human evaluation is not applicable to this automated detection system; the task is automated binary classification of fine-tuned models, not a human judgment task.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "The 13 fine-tuned models constitute an evaluation set; the auditor (Claude Sonnet 4) is not fine-tuned on these models, so there is no training/test contamination within the auditing framework itself.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Figure 6 provides per-attack detection rates for all 8 adversarial fine-tunes at the 1% FPR threshold, and Figure 10 (appendix) shows per-model risk scores across all tool configurations.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Section 5.3 analyzes false positives on benign fine-tunes (Kashmiri, educational insecure code, Alpaca), and Section 5.2 discusses why subliminal learning is the most elusive attack to detect.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "The paper reports that model querying alone achieves only 1.4% detection at 1% FPR, that 'more tools does not result in strictly better performance,' and that M+D+B*+S underperforms M+D+B* alone at 1% FPR.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Exact version IDs are specified for the auditor (Claude Sonnet 4) and all fine-tuned base models (gpt-4.1-2025-04-14, gpt-4o-2024-08-06, gpt-3.5-turbo-1106) in Tables 4 and 6.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Full auditor system prompt (A.1), user prompt (A.2), super-agent system prompt (A.3), and super-agent user prompt (A.4) are provided verbatim in Appendix A.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "Fine-tuning epochs are reported in Table 4, but temperature, top-p, and other sampling hyperparameters for the Claude Sonnet 4 auditing agent are not reported.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "Agentic scaffolding is described in detail in Section 3 (six tools, agent loop mechanism) and Appendix D (full Python tool implementations for all six tool types).",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Fine-tuning data format (JSONL chat completions) is documented in Appendix D, the recursive summarization pipeline (chunk size, prompts) is in Appendix D.4, and HEx-PHI benchmark administration is described in Section 3.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "The raw audit transcripts and risk scores across all 1400+ audits are not released in a dataset format; only selected transcript excerpts are shown in the appendix.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Section 4 describes the experimental setup including 20 independent audits per fine-tune per configuration, attack success rate verification before inclusion, and GPT judge methodology for the HEx-PHI benchmark.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants were recruited; all evaluations are conducted via automated API calls to LLMs.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The complete pipeline from fine-tuning (dataset → OpenAI API → fine-tuned model) through auditing (tool calls → risk score → threshold classification) is documented across Sections 3-4 and Appendices B-D.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "Claude Sonnet 4's training data cutoff is not stated; HEx-PHI (sourced from Anthropic red-teaming data) may have been in Claude's training data, which is not acknowledged.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "The paper does not discuss whether HEx-PHI benchmark prompts, known adversarial fine-tuning techniques, or related datasets were in Claude Sonnet 4's training data, which could inflate detection performance.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "HEx-PHI is sourced from Anthropic red-teaming data and Claude Sonnet 4 is an Anthropic model; potential contamination of the evaluation benchmark is not addressed anywhere in the paper.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants; pre-registration is not applicable.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants; IRB approval is not applicable. The ethics statement addresses dual-use concerns about replicating attacks.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants; demographics are not applicable.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants; inclusion/exclusion criteria are not applicable.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants; randomization in the human studies sense is not applicable.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants; blinding is not applicable.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants; attrition is not applicable.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": true,
    358           "justification": "Appendix H reports that 'the average single-agent audit with Claude Sonnet 4 lasts approximately 24 turns over several minutes and costs approximately $1,' enabling practical deployment cost estimation.",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": true,
    364           "justification": "Appendix H states the average fine-tune cost was $120 via OpenAI API; per-unit audit cost (~$1) and fine-tuning cost are provided, though total study budget is not explicitly summed.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Fine-tuning auditing agents achieve 56.2% detection rate at 1% false positive rate across 8 adversarial attacks",
    373       "evidence": "Table 1: M+D+B*+S+P configuration achieves 56.2% TPR at 1% FPR across 20 independent audits per fine-tune",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "All 8 adversarial attacks have non-zero detection rates at 1% FPR with the full tool set",
    378       "evidence": "Figure 6 shows per-attack detection rates at threshold τ=9 (1% FPR); each attack has some non-zero detection",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Standard content moderation cannot detect attacks where each training datapoint is individually harmless",
    383       "evidence": "Appendix C and Figure 9 show OpenAI Moderation API flags 0 examples from most adversarial fine-tuning datasets",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Super-agent approach significantly outperforms single-agent at 0% FPR (49.4% vs 5.6%)",
    388       "evidence": "Table 2 directly compares super-agent and single-agent with D+M affordances; large TPR gap at 0% FPR with no statistical test",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Dataset inspection is the most useful single affordance, outperforming model querying and benchmark running alone",
    393       "evidence": "Table 1: D alone achieves 23.1% TPR at 1% FPR vs 19.4% for B alone and 1.4% for M alone",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "Claude Sonnet 4 outperforms o3 and Qwen 2.5 72b as an auditing agent",
    398       "evidence": "Figure 13 and Section 5.5 compare models on D+M configuration only; Claude achieves higher TPR but also more false positives",
    399       "supported": "moderate"
    400     },
    401     {
    402       "claim": "Auditing agents can learn ciphers in-context from training data to perform attack-specific elicitation",
    403       "evidence": "Figures 2 and 7 show auditor successfully learning Walnut53 and EndSpeak ciphers from dataset samples and querying models in cipher to elicit harmful responses",
    404       "supported": "strong"
    405     },
    406     {
    407       "claim": "Benign fine-tuning can unintentionally degrade safety, creating false positives for the auditor",
    408       "evidence": "Section 5.3 and Appendix I show Alpaca, educational insecure code, and Kashmiri fine-tunes receiving elevated risk scores with justified reasoning",
    409       "supported": "strong"
    410     }
    411   ],
    412   "methodology_tags": [
    413     "benchmark-eval",
    414     "case-study"
    415   ],
    416   "key_findings": "Fine-tuning auditing agents powered by Claude Sonnet 4 can detect adversarial fine-tuning at 56.2% detection rate with only 1% false positive rate, successfully identifying all 8 attack types including covert cipher attacks that completely evade content moderation. Dataset inspection is the most critical affordance, enabling the auditor to learn attack-specific elicitation strategies in-context from training examples. A super-agent approach aggregating 10 sub-audits dramatically improves performance at 0% FPR (49.4% vs 5.6% single-agent). However, the ~50% detection ceiling is insufficient for production deployment, benign fine-tunes with unintentional safety degradation represent a persistent false positive challenge, and the paper's conflict of interest (Anthropic-funded, Claude-as-auditor, HEx-PHI from Anthropic red-teaming data) is not addressed.",
    417   "red_flags": [
    418     {
    419       "flag": "No variance on main results",
    420       "detail": "Table 1 reports only point estimates for detection rates across 9 tool configurations despite 20 independent audits per condition; no confidence intervals or standard deviations are reported, making it impossible to assess reliability of differences between configurations."
    421     },
    422     {
    423       "flag": "Anthropic conflict of interest unaddressed",
    424       "detail": "The paper is funded by Anthropic, co-authored by an Anthropic researcher (Carlini), uses Claude Sonnet 4 as the primary auditor, and reports Claude outperforms o3 and Qwen—without an explicit competing interests statement or acknowledgment of this conflict."
    425     },
    426     {
    427       "flag": "HEx-PHI benchmark contamination risk",
    428       "detail": "The evaluation benchmark is sourced from Anthropic red-teaming data; Claude Sonnet 4 (Anthropic model) may have been trained on or otherwise exposed to these prompts, potentially inflating detection performance in ways not discussed."
    429     },
    430     {
    431       "flag": "Small evaluation set",
    432       "detail": "Only 8 adversarial and 5 benign fine-tuned models are evaluated, all using GPT model families via OpenAI API; detection rates may not generalize to other model families, fine-tuning providers, or novel attack vectors."
    433     },
    434     {
    435       "flag": "No significance tests on comparisons",
    436       "detail": "Comparative claims (e.g., 'super-agent significantly outperforms single-agent,' Claude 'more consistently' outperforms o3) are made without statistical significance tests, despite having sufficient repeated measurements (20 audits per condition) to conduct them."
    437     }
    438   ],
    439   "cited_papers": [
    440     {
    441       "title": "Fine-tuning aligned language models compromises safety, even when users do not intend to!",
    442       "relevance": "Foundational work demonstrating adversarial fine-tuning bypasses safety training; motivates detection mechanisms and introduces the AOA attack evaluated here"
    443     },
    444     {
    445       "title": "Covert malicious finetuning: Challenges in safeguarding LLM adaptation",
    446       "relevance": "Introduces Walnut53 and EndSpeak cipher-based covert attacks that this paper attempts to detect; establishes pointwise-undetectability as the core challenge"
    447     },
    448     {
    449       "title": "Emergent misalignment: Narrow finetuning can produce broadly misaligned LLMs",
    450       "relevance": "Introduces the emergent misalignment and backdoored insecure code attacks included in the evaluation set"
    451     },
    452     {
    453       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    454       "relevance": "Introduces backdoor 'sleeper agent' attacks included in the evaluation; provides theoretical grounding for trigger-based misalignment"
    455     },
    456     {
    457       "title": "Harmful fine-tuning attacks and defenses for large language models: A survey",
    458       "relevance": "Survey contextualizing the adversarial fine-tuning threat landscape from which attacks are drawn"
    459     },
    460     {
    461       "title": "Fundamental limitations in defending LLM fine-tuning APIs",
    462       "relevance": "Prior work establishing pointwise-undetectable attacks as the key defense challenge; this paper directly addresses that limitation"
    463     },
    464     {
    465       "title": "Building and evaluating alignment auditing agents",
    466       "relevance": "Direct methodological predecessor using LLM auditing agents for alignment research; this paper adapts the super-agent approach for fine-tuning API defense"
    467     },
    468     {
    469       "title": "Auditing language models for hidden objectives",
    470       "relevance": "The 'Auditing Game' that inspired this work; establishes the blue-team auditing paradigm applied here to fine-tuning API defense"
    471     },
    472     {
    473       "title": "Subliminal learning: Language models transmit behavioral traits via hidden signals in data",
    474       "relevance": "Introduces the subliminal learning attack (the most evasive attack in the evaluation) using a misaligned teacher model"
    475     },
    476     {
    477       "title": "No, of course I can! Deeper fine-tuning attacks that bypass token-level safety mechanisms",
    478       "relevance": "Introduces the NOICE prompt-based jailbreak attack evaluated in this paper"
    479     }
    480   ],
    481   "engagement_factors": {
    482     "practical_relevance": {
    483       "score": 3,
    484       "justification": "Directly addresses a real production threat faced by all major LLM providers (OpenAI, Anthropic, Google) who expose fine-tuning APIs, with released code and per-audit cost estimates enabling immediate deployment evaluation."
    485     },
    486     "surprise_contrarian": {
    487       "score": 2,
    488       "justification": "The finding that an LLM can learn substitution ciphers in-context from training data and use them for attack-specific elicitation is surprising; that more tools don't always improve performance challenges intuitive assumptions about capability scaling."
    489     },
    490     "fear_safety": {
    491       "score": 3,
    492       "justification": "Demonstrates that adversarial fine-tuning creates cipher-capable 'sleeper agent' models undetectable by content moderation, with a ~50% detection ceiling suggesting current defenses are inadequate for production deployment of fine-tuning APIs."
    493     },
    494     "drama_conflict": {
    495       "score": 1,
    496       "justification": "Mild conflict of interest angle (Anthropic-funded work finding Claude beats competitors on an Anthropic benchmark), but the paper is technically focused and not presented controversially."
    497     },
    498     "demo_ability": {
    499       "score": 2,
    500       "justification": "Code is released on GitHub and audit cost is ~$1, making it feasible to try; however, full reproduction requires paid OpenAI fine-tuning API access at ~$120 per fine-tuned model."
    501     },
    502     "brand_recognition": {
    503       "score": 3,
    504       "justification": "Authors include Nicholas Carlini (prominent ML security researcher at Anthropic) and John Schulman (OpenAI co-founder, now Thinking Machines Lab), lending high credibility and likely significant community attention."
    505     }
    506   },
    507   "hn_data": {
    508     "threads": [
    509       {
    510         "hn_id": "41929456",
    511         "title": "Quantum inspired factorization up to 100-bit RSA number in polynomial time [pdf]",
    512         "points": 4,
    513         "comments": 0,
    514         "url": "https://news.ycombinator.com/item?id=41929456",
    515         "created_at": "2024-10-23T21:34:43Z"
    516       },
    517       {
    518         "hn_id": "41933882",
    519         "title": "Quantum inspired factorization up to 100-bit RSA number in polynomial time",
    520         "points": 1,
    521         "comments": 0,
    522         "url": "https://news.ycombinator.com/item?id=41933882",
    523         "created_at": "2024-10-24T09:46:08Z"
    524       },
    525       {
    526         "hn_id": "41921364",
    527         "title": "Assessing the Performance of Human-Capable LLMs – Are LLMs Coming for Your Job?",
    528         "points": 1,
    529         "comments": 0,
    530         "url": "https://news.ycombinator.com/item?id=41921364",
    531         "created_at": "2024-10-23T03:05:13Z"
    532       },
    533       {
    534         "hn_id": "41914405",
    535         "title": "Loss of 12 Starlink Satellites Due to the Extreme Geomagnetic Storm of May 2024",
    536         "points": 1,
    537         "comments": 0,
    538         "url": "https://news.ycombinator.com/item?id=41914405",
    539         "created_at": "2024-10-22T14:04:12Z"
    540       },
    541       {
    542         "hn_id": "38177348",
    543         "title": "CleanCoNLL: A Nearly Noise-Free Named Entity Recognition Dataset",
    544         "points": 1,
    545         "comments": 0,
    546         "url": "https://news.ycombinator.com/item?id=38177348",
    547         "created_at": "2023-11-07T14:47:31Z"
    548       },
    549       {
    550         "hn_id": "38163590",
    551         "title": "Multi-Structure Objects Points-To Analysis",
    552         "points": 1,
    553         "comments": 0,
    554         "url": "https://news.ycombinator.com/item?id=38163590",
    555         "created_at": "2023-11-06T15:07:37Z"
    556       }
    557     ],
    558     "top_points": 4,
    559     "total_points": 9,
    560     "total_comments": 0
    561   }
    562 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs