scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28334B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Dialogue Injection Attack: Jailbreaking LLMs through Context Manipulation",
      6     "authors": [
      7       "Wenlong Meng",
      8       "Fan Zhang",
      9       "Wendao Yao",
     10       "Zhenyuan Guo",
     11       "Yuwei Li"
     12     ],
     13     "year": 2025,
     14     "venue": "IEEE Transactions on Information Forensics and Security",
     15     "arxiv_id": "2503.08195",
     16     "doi": "10.1109/TIFS.2026.3657898"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims of 0.89 ASR on Llama-3.1-8B and 0.82 on GPT-4o after 10 queries on AdvBench are supported by Figure 5 multi-query results; defense bypass claims are supported by Table 5.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Ablation study in Section 5.5 systematically removes system prompt, hypnosis, and answer guidance components with measured ASR impact, adequately justifying causal claims about component contributions.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The claim that 'larger LLMs are more susceptible to jailbreak attacks' is stated broadly but is contradicted by the Llama-3 family results and confounded by different alignment strategies and training cutoffs that are not controlled for.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Performance variation across models is attributed primarily to 'different alignments regarding attack types' without systematically exploring alternatives; the model-size finding gets only a single speculative explanation referencing training cutoffs.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "ASR is measured by LlamaGuard classifiers rather than actual harm assessment; the paper acknowledges automated classifiers are imperfect but does not adequately discuss the gap between classifier-confirmed bypass and real-world harmful content generation.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated limitations section; Section 8 is an 'Ethics Consideration' addressing responsible disclosure, not methodological limitations of the study.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No threats-to-validity discussion exists; the paper does not address potential biases from LlamaGuard evaluation, the limited LLM families tested, or single-run measurement variance.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The black-box threat model is described as a scope constraint but the paper does not explicitly state what results do NOT show (e.g., non-chat interfaces, non-English prompts, domains outside AdvBench categories).",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment appears anywhere in the paper, even though the authors' National University of Defense Technology affiliation would warrant a funding disclosure.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations with Zhejiang University and National University of Defense Technology are clearly disclosed in the paper header.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding is disclosed, making this criterion not applicable.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "There is no competing interests or financial disclosure statement anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms including 'jailbreak attack,' 'dialogue injection,' 'attack success rate,' and the white/gray/black-box taxonomy are clearly defined in Sections 2.3 and 3.1.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper explicitly enumerates four contributions: the DIA paradigm with dialogue injection method, DIA-I and DIA-II methods, the template inference attack, and comparative evaluation across 10 LLMs.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 6 explicitly positions DIA as the first multi-turn dialogue-based jailbreak approach versus existing single-turn white/gray/black-box methods, with specific comparisons to GCG, DRA, DeepInception, and PAIR.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Source code is stated as available at https://github.com/meng-wenlong/DIA in the abstract footnote; however, the generated affirmative beginnings dataset is promised only 'after acceptance.'",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "All three primary benchmark datasets (AdvBench, HEx-PHI, MaliciousInstruct) are publicly available on HuggingFace Datasets; the paper-generated affirmative beginnings are not yet released.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Only hardware is mentioned (Intel Xeon 8358, 4x Nvidia A100 80G) and inference engine (Ollama); no requirements.txt, Dockerfile, or software dependency versions are provided.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are provided; the algorithms (1-3) describe logic but not how to execute the full attack pipeline end-to-end.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results in Tables 2, 3, 5, and 6 are single point estimates; no confidence intervals, error bars, or multiple-run averages are reported.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied to any comparative claims; differences between DIA and baselines are presented as raw ASR values without testing.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Raw ASR values are reported for all method-model combinations, allowing direct effect size computation; the paper also explicitly states degradation percentages (e.g., DRA degrades 67% and 99% on Llama-3.1-8B).",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The paper uses AdvBench (520 items), HEx-PHI (330 items), and MaliciousInstruct (100 items) without justifying adequacy or discussing statistical power.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or multiple experimental runs are reported; all results appear to be single runs on probabilistic LLM outputs.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Four state-of-the-art baselines are included: DeepInception, ReNe, PAIR, and DRA, each representing distinct attack strategies tested under identical conditions.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "All baselines are from 2023-2024, including DRA (USENIX Security 2024) and ReNe; all are relevant recent black-box jailbreak methods.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Section 5.5 ablates system prompt replacement, hypnosis, and answer guidance for both DIA-I and DIA-II across three models, and separately ablates the prompt rewrite algorithm.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Two evaluation metrics are used: ASR (with both LlamaGuard-2 and LlamaGuard-3 as independent evaluators) and Defense Pass Rate (DPR) in the defense evaluation section.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "No human evaluation of attack outputs is performed; the paper explicitly opts for automated LlamaGuard classifiers over GPT-4 judging, citing cost and scalability concerns.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": false,
    211           "answer": false,
    212           "justification": "This is an attack evaluation study without a prediction task; the benchmarks serve as attack targets, not prediction test sets requiring train/test separation.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": false,
    218           "justification": "HEx-PHI contains 11 prohibited categories (illegal activity, fraud, privacy violation, etc.) but no per-category breakdown is provided; all results are aggregated.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "The paper explicitly identifies Llama-3.1-8B as the most secure model, reports near-zero single-query ASR for DIA on multiple models, and discusses DIA-I's poor performance on Llama-2-7B without the prompt rewrite module.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Negative results are clearly reported: DIA-I achieves ~0 ASR on Llama-3.1-8B single-query, DRA fails completely on GPT-4o (ASR=0.000), and component ablations showing degraded performance are included.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "GPT-4o and GPT-4o-mini are specified with exact API snapshot versions (gpt-4o-2024-08-06, gpt-4o-mini-2024-07-18); open-source models are specified by family and parameter count.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "The paper describes prompt components structurally (system replacement directives, hypnosis dialogues, answer guidance pattern) but does not provide the actual text of any prompts used in experiments.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "No inference hyperparameters (temperature, top-p, max tokens) are reported for DIA or baselines; the paper only states baselines use their originally specified hyperparameters.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "This paper evaluates attack construction pipelines, not agentic scaffolding; the ABGM/SDGM modules are attack components, not agent scaffolding.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "ABGM (Algorithm 1) and SDGM are described in detail including keyword extraction, NLTK-based morphological augmentation, cosine similarity matching, and word substitution steps.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Raw model outputs, attack transcripts, and LlamaGuard evaluation results are not released; only the public benchmark inputs are available.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Benchmarks are downloaded from HuggingFace Datasets and described with statistics (mean token lengths, category counts); affirmative beginning generation via ABGM is described in Algorithm 1.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants are involved; standard published benchmark datasets are used.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The complete pipeline from benchmark loading through ABGM/SDGM processing, dialogue construction, LLM querying via Ollama, and LlamaGuard evaluation is described across Sections 4 and 5.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "Training cutoffs are mentioned only for Llama-3 models (70B: December 2023, 8B: March 2023) as an incidental explanation; systematic cutoff reporting for all 10 tested models is absent.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "The paper briefly notes LLM developers may add prior jailbreak prompts to alignment training (to explain DRA's degradation) but does not systematically address whether AdvBench or HEx-PHI prompts appear in alignment data.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "AdvBench and HEx-PHI are well-known published benchmarks that may be in alignment training data for newer models like Llama-3.1; this is not addressed despite being directly relevant to interpreting differential ASR results.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants are involved in this study.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants are involved; Section 8 addresses responsible disclosure ethics, not IRB/participant protection.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants are involved.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants are involved.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants are involved.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants are involved.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants are involved.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No latency, API cost, or inference time is reported for any experiment despite testing 10 LLMs across 3 benchmarks with up to 10 query iterations.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Hardware is described (4x A100 80G GPUs) but total compute time, GPU-hours, or monetary cost for the experiments is not stated.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "DIA achieves 0.89 ASR on Llama-3.1-8B and 0.82 on GPT-4o after 10 queries on AdvBench",
    375       "evidence": "Figure 5 multi-query curves show ASR growth across 10 iterations; stated values are cited from the abstract but the figure shows DIA-II reaching these levels by iteration 10",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Historical dialogue manipulation in black-box settings is practical via dialogue injection using chat template delimiters",
    380       "evidence": "Section 3.2 provides formal construction of adversarial inputs using Su/Pa/Sa/Pu delimiters; logically sound given LLM inference pipeline design shown in Figure 1",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "DIA bypasses 5 defense mechanisms with average DPR of 0.93 (DIA-I) and 0.82 (DIA-II)",
    385       "evidence": "Table 5 shows DPR values for OpenAI Moderation, Perplexity Filter, Defensive System Prompt, Prompt Patch, and Bergeron tested on Gemma-2-9B only",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Larger LLMs within the same family are more susceptible to jailbreak attacks",
    390       "evidence": "Figure 8 shows ASR vs model size; the Llama-3 family contradicts this pattern, and the comparison is confounded by different training cutoffs and alignment strategies",
    391       "supported": "weak"
    392     },
    393     {
    394       "claim": "Template inference attack achieves ~90% accuracy within 5 query attempts",
    395       "evidence": "Figure 2 shows accuracy vs max try times for three LLM pairs (Qwen2/Gemma2, Qwen2/Llama3, Gemma2/Llama3) reaching ~0.9 at NT_max=5",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "Deferred harmful responses have higher log-likelihood than immediate harmful responses",
    400       "evidence": "Figure 4 shows log-likelihood distributions with and without prepended benign text for Llama-3.1-8B and Llama-3.2-11B; distributions shift rightward (less negative) with prepended benign context",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval"
    406   ],
    407   "key_findings": "DIA introduces a novel black-box jailbreak paradigm exploiting LLM chat template structure: attackers can inject fabricated dialogue histories by embedding chat template delimiters directly in user-visible input fields, enabling gray-box prefilling attacks without model access. DIA-II discovers a previously unreported vulnerability that deferred harmful responses have higher generation log-likelihood, and exploits it by having models perform word substitution tasks before answering, achieving high ASR on recently aligned models (e.g., 0.80 on Llama-3.1-70B on HEx-PHI with LlamaGuard-3). Ablation studies confirm all dialogue components contribute to performance, with answer guidance being the most critical. Despite strong empirical results across 10 LLMs and 3 benchmarks, all results lack statistical validation and the generalization claim that larger models are more vulnerable is undermined by contradictory Llama-3 results.",
    408   "red_flags": [
    409     {
    410       "flag": "No statistical testing or variance",
    411       "detail": "All comparative results are single point estimates with no confidence intervals, significance tests, or multiple runs across 10 models and 3 benchmarks, making it impossible to assess reliability."
    412     },
    413     {
    414       "flag": "Guard model as sole success criterion",
    415       "detail": "ASR is measured only by LlamaGuard classifiers; the paper acknowledges these are imperfect proxies but does not quantify how often guard-confirmed 'attacks' produce genuinely actionable harmful content."
    416     },
    417     {
    418       "flag": "Affirmative beginnings dataset not released",
    419       "detail": "Paper-generated affirmative beginnings are a core artifact promised only 'after acceptance,' making full reproduction impossible at evaluation time."
    420     },
    421     {
    422       "flag": "Unsupported model-size vulnerability claim",
    423       "detail": "The claim that larger LLMs are more susceptible is contradicted by the Llama-3 family and confounded by different alignment strategies and training cutoffs, without controlling for these variables."
    424     },
    425     {
    426       "flag": "Defense evaluation on single model only",
    427       "detail": "Table 5 defense bypass results are reported only for Gemma-2-9B; DPR generalization to other model families is unverified."
    428     },
    429     {
    430       "flag": "No prompt text provided",
    431       "detail": "The actual text of system replacement prompts, hypnosis dialogues, and answer guidance used in experiments is not disclosed, only structural descriptions, significantly limiting reproducibility."
    432     }
    433   ],
    434   "cited_papers": [
    435     {
    436       "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models",
    437       "relevance": "Introduces GCG white-box attack and AdvBench benchmark used as the primary evaluation dataset throughout"
    438     },
    439     {
    440       "title": "Making Them Ask and Answer: Jailbreaking Large Language Models in Few Queries via Disguise and Reconstruction",
    441       "relevance": "DRA baseline compared directly across all experiments; key prior work on black-box jailbreak via token disguise"
    442     },
    443     {
    444       "title": "DeepInception: Hypnotize Large Language Model to be Jailbreaker",
    445       "relevance": "Key baseline using fictional nested scenarios; DIA-I incorporates a hypnosis component inspired by this work"
    446     },
    447     {
    448       "title": "Jailbreaking Black Box Large Language Models in Twenty Queries",
    449       "relevance": "PAIR baseline using attacker LLM to iteratively refine prompts; directly compared and used as auxiliary model substitute"
    450     },
    451     {
    452       "title": "Safety Alignment Should Be Made More Than Just a Few Tokens Deep",
    453       "relevance": "Explains the prefilling attack vulnerability that DIA-I builds upon and the shallow alignment limitation DIA exploits"
    454     },
    455     {
    456       "title": "Fine-tuning Aligned Language Models Compromises Safety, Even When Users Do Not Intend To",
    457       "relevance": "Provides HEx-PHI benchmark with 11 prohibited categories used as second primary evaluation dataset"
    458     },
    459     {
    460       "title": "A Wolf in Sheep's Clothing: Generalized Nested Jailbreak Prompts Can Fool Large Language Models Easily",
    461       "relevance": "ReNe baseline with nested scenarios and prompt rewrite; directly compared and shown to sacrifice semantic integrity"
    462     },
    463     {
    464       "title": "Leveraging Context in Jailbreaking Attacks",
    465       "relevance": "Prior work demonstrating context enhances jailbreak success, motivating DIA's historical dialogue manipulation approach"
    466     }
    467   ],
    468   "engagement_factors": {
    469     "practical_relevance": {
    470       "score": 2,
    471       "justification": "Demonstrates real bypass of GPT-4o and Llama safety systems with code available, directly actionable for security teams defending deployed chatbots."
    472     },
    473     "surprise_contrarian": {
    474       "score": 2,
    475       "justification": "Counterintuitive finding that larger LLMs are more susceptible to jailbreak attacks challenges the assumption that scale improves safety alignment."
    476     },
    477     "fear_safety": {
    478       "score": 3,
    479       "justification": "Shows 82% success rate bypassing GPT-4o safety measures and defeats 5 defense mechanisms including OpenAI's own moderation API, with code available for reproduction."
    480     },
    481     "drama_conflict": {
    482       "score": 2,
    483       "justification": "Frames the work as an arms race in which prior attacks get patched into alignment training, motivating the need for novel multi-turn attack vectors; tests against current defenses."
    484     },
    485     "demo_ability": {
    486       "score": 2,
    487       "justification": "Code on GitHub and attack requires only chat API access, making it technically accessible; setup complexity (Ollama, ABGM pipeline) limits casual reproduction."
    488     },
    489     "brand_recognition": {
    490       "score": 2,
    491       "justification": "Explicitly targets GPT-4o with measured results; Llama and Gemma families are well-known, though authors are from Chinese universities without major lab brand."
    492     }
    493   },
    494   "hn_data": {
    495     "threads": [
    496       {
    497         "hn_id": "22624980",
    498         "title": "Neuroevolution of Self-Interpretable Agents",
    499         "points": 5,
    500         "comments": 1,
    501         "url": "https://news.ycombinator.com/item?id=22624980"
    502       },
    503       {
    504         "hn_id": "46686419",
    505         "title": "EnergyNet Explained: Internetification of Energy Distribution",
    506         "points": 2,
    507         "comments": 0,
    508         "url": "https://news.ycombinator.com/item?id=46686419"
    509       },
    510       {
    511         "hn_id": "45988739",
    512         "title": "Sheaf Topos Theory: A Powerful Setting for Lagrangian Field Theory",
    513         "points": 2,
    514         "comments": 0,
    515         "url": "https://news.ycombinator.com/item?id=45988739"
    516       },
    517       {
    518         "hn_id": "35219050",
    519         "title": "Large-scale end of life prediction of hard discs in distributed datacenters",
    520         "points": 2,
    521         "comments": 0,
    522         "url": "https://news.ycombinator.com/item?id=35219050"
    523       },
    524       {
    525         "hn_id": "26338513",
    526         "title": "Mixture of Volumetric Primitives for Efficient Neural Rendering",
    527         "points": 2,
    528         "comments": 0,
    529         "url": "https://news.ycombinator.com/item?id=26338513"
    530       },
    531       {
    532         "hn_id": "45302505",
    533         "title": "Verbalized Algorithms",
    534         "points": 1,
    535         "comments": 0,
    536         "url": "https://news.ycombinator.com/item?id=45302505"
    537       },
    538       {
    539         "hn_id": "44791713",
    540         "title": "MQFQ-Sticky: Fair Queueing for Serverless GPU Functions",
    541         "points": 1,
    542         "comments": 0,
    543         "url": "https://news.ycombinator.com/item?id=44791713"
    544       },
    545       {
    546         "hn_id": "44450854",
    547         "title": "Parallel-in-Time Preconditioning for Time-Dependent Variational Mean Field Games",
    548         "points": 1,
    549         "comments": 0,
    550         "url": "https://news.ycombinator.com/item?id=44450854"
    551       },
    552       {
    553         "hn_id": "44326982",
    554         "title": "Interpreting Agent Behaviors in RL-Based Cyber-Battle Simulation Platforms",
    555         "points": 1,
    556         "comments": 0,
    557         "url": "https://news.ycombinator.com/item?id=44326982"
    558       },
    559       {
    560         "hn_id": "22631779",
    561         "title": "Neuroevolution of Self-Interpretable Agents",
    562         "points": 1,
    563         "comments": 0,
    564         "url": "https://news.ycombinator.com/item?id=22631779"
    565       }
    566     ],
    567     "top_points": 5,
    568     "total_points": 18,
    569     "total_comments": 1
    570   }
    571 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs