scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (29021B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "An Evaluation of Cultural Value Alignment in LLM",
      6     "authors": [
      7       "Nicholas Sukiennik",
      8       "Chen Gao",
      9       "Fengli Xu",
     10       "Yong Li"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2504.08863",
     15     "doi": "10.48550/arXiv.2504.08863"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "All abstract claims (20 countries, 10 models, Hofstede framework, deviation metric, GLM-4 top performer, US best-aligned, model-origin effects) are demonstrated in results sections.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Paper frames findings as correlations and observational patterns (e.g., 'may explain' convergence by training data), not causal claims. No experimental manipulation to establish causation.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Claims bounded to tested 20 countries, 10 models, Hofstede framework. Acknowledged limitation: 'since we only test one language per country, our cultural alignment evaluations cannot be considered complete.'",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Paper proposes 'global average convergence' mechanism, discusses model origin, training data availability, and GDP/web-content correlations as explanations. Some alternatives explored but coverage could be broader.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Paper conflates Hofstede survey response alignment with 'cultural alignment' without discussing whether matching a 1980 questionnaire reflects true cultural understanding or just surface-level value reproduction.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Dedicated limitations paragraph in conclusion discusses language selection limitation and inability to assess multi-language cultures within single countries.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Limitations are generic (language choice). Missing specific threats: Hofstede survey contamination in training data, temperature=0 reducing diversity, translation validity, whether survey questions universalize across cultures.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "Scope is implicit (20 countries, 10 models, Hofstede framework). Lacks explicit statements of what results do NOT show (e.g., 'does not assess actual user trust' or 'does not measure ability to adapt to within-country subcultures').",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding sources mentioned, no acknowledgments section, no statement that work was unfunded.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All authors affiliated with Tsinghua University or Tsinghua-affiliated institutions (BNRist, Zhipu AI) clearly stated in author block.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "Potential undisclosed conflict: GLM-4 (top performer, 'best ability to align') is from Zhipu AI, affiliated with Tsinghua where all authors work. This relationship not discussed as potential bias.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement. No mention of patents, equity holdings, consulting relationships, or institutional financial stakes in evaluated models.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms defined: 'alignment' (pursuing goals matching values), 'cultural alignment' (via Hofstede framework), 'bias' (reflecting one subgroup), cultural dimensions explained in Appendix A.1.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Three explicit contributions stated: (1) rank models/countries with deviation ratio metric, (2) analyze model origin/language/size effects, (3) characterize US dominance and data-availability correlation.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Related work section compares to BLEnD, AlKhamissi et al., Cao et al., Tao et al., explaining how prior efforts address only partial scope ('small portion of overall picture') vs. this paper's larger scale.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "No code repository, GitHub link, or promise of code release mentioned. Only evaluation of existing proprietary/public LLM models.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "LLM response outputs (400 prompts × 3 runs = 1200 data points per model) not released. Hofstede ground truth is publicly available but paper's generated data unavailable.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Temperature=0 specified, model names and parameter counts in Table 2 provided, but missing API versions, framework versions, exact dates for model snapshots, and other hyperparameters (top_p, max_tokens).",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Methodology described (Hofstede survey, 20 languages, extract numerical response) but no step-by-step reproduction guide. Someone could partially recreate with effort but instructions are not detailed.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Results averaged across 3 runs but no confidence intervals or error bars reported. All figures (1-7) show point estimates without uncertainty bands.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No significance tests for model ranking differences, country differences, or model-origin effects. Correlations (Pearson r) reported but no significance tests for those.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Deviation ratios provided (e.g., US 1.99, Germany 1.13), absolute differences reported (Figure 3a), Pearson correlations with r values shown (Figure 6: r=0.94, r=0.81).",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No justification for 20 countries, 10 models, or 3 runs. No power analysis or minimum sample size argument provided.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "Results are 3-run averages but variance/std dev not reported. No error bars in figures; single point estimates for each combination.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Hofstede ground truth (aggregated human survey responses) serves as baseline. Global average culture used as second reference point.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": false,
    187           "justification": "Hofstede 1980 framework is classic but over 40 years old. Ground truth timing unclear ('aggregated from valid studies'). For 2025 cultural evaluation, baseline could be more recent.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Partial ablations: language effects (aligned vs English vs Chinese vs averaged), model origin effects (US vs China), model size effects. Not comprehensive but systematic examination of individual factors.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Two main metrics reported (ground truth difference and deviation ratio). Results broken down by all 6 dimensions separately. Both aggregate and per-category analysis.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": false,
    205           "justification": "Uses human-derived Hofstede ground truth but does not include human raters evaluating whether LLM outputs align with cultural values in practice.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": false,
    210           "answer": false,
    211           "justification": "Not a prediction task; evaluates on fixed 24-question survey. Not applicable.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Per-dimension results shown in Figure 1b, Figure 7 (dimension-specific deviation ratios). Per-country rankings in Table 3. Comprehensive category breakdown.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": false,
    223           "justification": "No discussion of specific failure modes, no examples of questions on which models perform poorly, no qualitative analysis of wrong answers.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "General finding that all models converge to 'moderate cultural middle ground' and poorly represent non-US cultures is negative. However, no specific failed experiments reported.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Table 2 specifies 10 models with company, version names (GPT-3.5-Turbo, GPT-4, GPT-4o), and parameter counts. However, exact snapshot dates and API versions not provided.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Table 1 shows prompting mechanism with system role template, example Hofstede question, response options, and sample response. System role and instructions ('make only one choice, include numerical value') documented.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": false,
    249           "justification": "Only temperature=0 specified. Missing: top_p, top_k, max_tokens, frequency_penalty, presence_penalty, other inference parameters.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "Simple role-based prompting with survey questions. No multi-step reasoning scaffolding, chain-of-thought, or complex agentic patterns used. Not applicable.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Preprocessing documented: extract numerical response from model output, average across 3 runs, normalize to 0-100 scale, apply Hofstede dimension formula (Equation 2).",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "LLM response data (4,000 LLM×country×language combinations × 3 runs) not available for independent verification. Hofstede ground truth is public.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Collection procedure clear: 20 countries × 20 languages × 3 runs = 1,200 calls per model, repeated for each of the 10 models. Extraction method (numerical response only) documented. Survey source (Hofstede) specified.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants beyond Hofstede ground truth. Hofstede's original participant recruitment not described in this paper. Not applicable.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "Pipeline documented: prompt LLMs with survey questions in 20 languages → extract numerical response → average 3 runs → calculate dimension scores (Eq. 2) → compute deviation ratio (Eq. 1) → visualize.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "No training cutoff dates provided for any of the 10 models. These are necessary to assess whether the models have seen the Hofstede survey in their training data.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "Hofstede 1980 framework is foundational and widely cited; extremely likely in training data of all models. This potential contamination not discussed or acknowledged.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "Hofstede Values Survey Module is well-known; models may have memorized ground truth scores or survey questions directly. Critical validity threat not addressed.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants; not applicable.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human subjects; not applicable.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants; not applicable.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants; not applicable.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants; not applicable.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants; not applicable.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants; not applicable.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "No cost or latency reported for 4,000 API calls across 10 models. Computational budget (API expenses, wall-clock time) not mentioned.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "No total computational budget disclosed. No mention of total API costs, compute hours, or feasibility constraints.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "All LLMs converge toward a moderate 'global average culture' regardless of country-specific prompting.",
    374       "evidence": "Figure 1 shows LLM averages clustering tightly together near the middle of each dimension's axis, while ground truth values are widely dispersed.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "United States culture is best-aligned across all LLMs by a wide margin (deviation ratio 1.99).",
    379       "evidence": "Table 3 ranks US at top with 1.99, second-place Germany at 1.13. Figure 3b confirms US far ahead in deviation ratio ranking.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "GLM-4 has the best ability to align to cultural values despite having only 9 billion parameters.",
    384       "evidence": "Figure 3b ranks GLM-4 first in deviation ratio (0.91); authors highlight 'GLM-4 best on cultural alignment despite its small size.'",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Models regardless of origin align better with US culture than with Chinese culture.",
    389       "evidence": "Figure 4a: US-origin models deviation ratio 1.21 (US) vs 0.76 (China). Figure 4b: China-origin models 1.26 (US) vs 0.72 (China).",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Cultural alignment strongly correlates with the percentage of web content from a country (r=0.94).",
    394       "evidence": "Figure 6c shows scatter plot with r=0.94 correlation between web content percentage and deviation ratio.",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "Larger model size does not predict better cultural alignment.",
    399       "evidence": "Figure 6a shows weak log-linear relationship (r=0.13) between parameters and alignment; GLM-4 (9B) outperforms Qwen-72B (72B).",
    400       "supported": "moderate"
    401     },
    402     {
    403       "claim": "Prompting in aligned language improves alignment for US-origin models but not consistently for China-origin models.",
    404       "evidence": "Figure 4 shows US-origin models improve with aligned/English prompting; China-origin models show opposite or mixed pattern.",
    405       "supported": "moderate"
    406     }
    407   ],
    408   "methodology_tags": [
    409     "benchmark-eval",
    410     "observational"
    411   ],
    412   "key_findings": "LLMs exhibit systematic convergence toward a 'cultural middle ground,' failing to represent the diverse values of non-US cultures. The United States achieves disproportionately high alignment (deviation ratio 1.99 vs. Germany's 1.13) across all models. Cultural alignment correlates strongly with training data availability, evidenced by a near-perfect correlation (r=0.94) with web content representation and high correlation with GDP (r=0.81). Counter-intuitively, model size is not a predictor of cultural alignment; GLM-4 (9B parameters) outperforms much larger models like Qwen-72B (72B), suggesting architectural choices or training data composition matter more than scale.",
    413   "red_flags": [
    414     {
    415       "flag": "No statistical significance testing",
    416       "detail": "All model and country rankings lack significance tests or confidence intervals. Unclear if rank differences (e.g., GLM-4 vs Gemini-1.5 at 0.91 vs 0.88) are meaningful or noise."
    417     },
    418     {
    419       "flag": "Unaddressed training data contamination",
    420       "detail": "Hofstede 1980 framework is foundational and almost certainly in training data of all models. Models may be recalling memorized values rather than demonstrating cultural understanding. No cutoff dates provided to assess risk."
    421     },
    422     {
    423       "flag": "Undisclosed conflict of interest",
    424       "detail": "GLM-4 (top performer) is from Zhipu AI, which is affiliated with Tsinghua; all authors are from Tsinghua. This relationship is not discussed even though GLM-4's top ranking is a key finding."
    425     },
    426     {
    427       "flag": "Proxy validity not discussed",
    428       "detail": "Paper uses Hofstede survey response alignment as proxy for 'cultural alignment' without validating whether matching a decades-old (1980) questionnaire reflects actual cultural understanding or capability."
    429     },
    430     {
    431       "flag": "No error measures reported",
    432       "detail": "Three runs per condition but no variance, std dev, or confidence intervals reported. Impossible to assess measurement uncertainty."
    433     },
    434     {
    435       "flag": "Incomplete environmental documentation",
    436       "detail": "Missing API versions, exact model snapshot dates, hyperparameters (top_p, top_k, max_tokens), making full reproduction impossible."
    437     },
    438     {
    439       "flag": "Unjustified design choices",
    440       "detail": "No justification for 20 countries, 10 models, or 3 runs per condition. No power analysis or sample size determination."
    441     },
    442     {
    443       "flag": "No failure case analysis",
    444       "detail": "No qualitative analysis of specific questions where models fail, no examples of responses, no error pattern investigation."
    445     },
    446     {
    447       "flag": "Limited ablation coverage",
    448       "detail": "While language/origin/size examined, other critical factors untested: system prompt wording variation, instruction clarity, whether Hofstede questions themselves bias responses."
    449     },
    450     {
    451       "flag": "Temporal scope undefined",
    452       "detail": "No training cutoff dates are given for the models; it is unclear whether the tested models are 'recent' or how the evaluation accounts for rapidly evolving model behavior."
    453     }
    454   ],
    455   "cited_papers": [
    456     {
    457       "title": "Auditing and Mitigating Cultural Bias in LLMs",
    458       "authors": "Tao et al.",
    459       "year": 2023,
    460       "relevance": "Prior work on quantifying cultural bias in GPT models using two-dimensional cultural scale; provides methodological precedent for this work."
    461     },
    462     {
    463       "title": "Investigating Cultural Alignment of Large Language Models",
    464       "authors": "AlKhamissi et al.",
    465       "year": 2024,
    466       "relevance": "Limited evaluation of 4 LLMs on cultural alignment for 2 cultures (Egypt, US) using Hofstede framework; this paper scales up the scope."
    467     },
    468     {
    469       "title": "Assessing Cross-Cultural Alignment between ChatGPT and Human Societies",
    470       "authors": "Cao et al.",
    471       "year": 2023,
    472       "relevance": "Evaluated cultural values alignment using 5 languages and 1 model on Hofstede survey; foundational methodological work."
    473     },
    474     {
    475       "title": "BLEnD: A Benchmark for LLMs on Everyday Knowledge in Diverse Cultures and Languages",
    476       "authors": "Myung et al.",
    477       "year": 2024,
    478       "relevance": "Related benchmark testing cultural knowledge across languages; complements values-based cultural assessment with knowledge-based evaluation."
    479     },
    480     {
    481       "title": "On the Dangers of Stochastic Parrots: Can Language Models Be Too Big?",
    482       "authors": "Bender et al.",
    483       "year": 2021,
    484       "relevance": "Foundational critique of LLM capabilities and representation of human values; contextualizes alignment problem."
    485     },
    486     {
    487       "title": "Whose Opinions Do Language Models Reflect?",
    488       "authors": "Santurkar et al.",
    489       "year": 2023,
    490       "relevance": "Examines ideological and value representation in LLMs across political spectra; related work on value bias."
    491     },
    492     {
    493       "title": "Persistent Anti-Muslim Bias in Large Language Models",
    494       "authors": "Abid et al.",
    495       "year": 2021,
    496       "relevance": "Demonstrates specific cultural/demographic bias in LLM outputs; exemplifies cultural value misalignment."
    497     },
    498     {
    499       "title": "Towards Measuring the Representation of Subjective Global Opinions in Language Models",
    500       "authors": "Durmus et al.",
    501       "year": 2023,
    502       "relevance": "Related audit methodology for cultural values; identifies three types of cultural auditing approaches."
    503     }
    504   ],
    505   "engagement_factors": {
    506     "practical_relevance": {
    507       "score": 3,
    508       "justification": "Organizations deploying LLMs globally must understand cultural biases in value representation; findings directly inform multi-market product design, content moderation, and training data curation."
    509     },
    510     "surprise_contrarian": {
    511       "score": 2,
    512       "justification": "US alignment dominance expected given English/US web representation; however, GLM-4's superior performance despite smallest parameter count (9B) contrasts with industry assumption that bigger = better."
    513     },
    514     "fear_safety": {
    515       "score": 2,
    516       "justification": "Raises concerns about LLM cultural bias propagating in global applications and reinforcing Western values in non-Western contexts; moderate AI ethics safety relevance rather than existential risk."
    517     },
    518     "drama_conflict": {
    519       "score": 1,
    520       "justification": "US-China model origin comparison has geopolitical framing but finding (both align to US) is technically neutral and lacks dramatic conflict angle."
    521     },
    522     "demo_ability": {
    523       "score": 2,
    524       "justification": "Results reproducible by prompting various LLMs with Hofstede survey but requires access to 10 models, translations into 20 languages, and statistical aggregation; not a simple demo."
    525     },
    526     "brand_recognition": {
    527       "score": 1,
    528       "justification": "Tsinghua authors but not mega-brand research labs. OpenAI/DeepMind/Meta appear as model creators not authors; limited brand halo from evaluation team."
    529     }
    530   },
    531   "hn_data": {
    532     "threads": [
    533       {
    534         "hn_id": "44253021",
    535         "title": "SmartAttack: Air-Gap Attack via Smartwatches",
    536         "points": 18,
    537         "comments": 6,
    538         "url": "https://news.ycombinator.com/item?id=44253021"
    539       },
    540       {
    541         "hn_id": "44852610",
    542         "title": "Design Patterns for Securing LLM Agents Against Prompt Injections",
    543         "points": 14,
    544         "comments": 2,
    545         "url": "https://news.ycombinator.com/item?id=44852610"
    546       },
    547       {
    548         "hn_id": "44504434",
    549         "title": "Design Patterns for Securing LLM Agents Against Prompt Injections",
    550         "points": 3,
    551         "comments": 0,
    552         "url": "https://news.ycombinator.com/item?id=44504434"
    553       },
    554       {
    555         "hn_id": "44427833",
    556         "title": "Simple low-dimensional computations explain variability in neuronal activity",
    557         "points": 2,
    558         "comments": 0,
    559         "url": "https://news.ycombinator.com/item?id=44427833"
    560       },
    561       {
    562         "hn_id": "44858671",
    563         "title": "Design Patterns for Securing LLM Agents Against Prompt Injections",
    564         "points": 2,
    565         "comments": 0,
    566         "url": "https://news.ycombinator.com/item?id=44858671"
    567       },
    568       {
    569         "hn_id": "44855060",
    570         "title": "Design Patterns for Securing LLM Agents Against Prompt Injections",
    571         "points": 2,
    572         "comments": 0,
    573         "url": "https://news.ycombinator.com/item?id=44855060"
    574       },
    575       {
    576         "hn_id": "44366937",
    577         "title": "SmartAttack: Air-Gap Attack via Smartwatches",
    578         "points": 2,
    579         "comments": 0,
    580         "url": "https://news.ycombinator.com/item?id=44366937"
    581       },
    582       {
    583         "hn_id": "44254732",
    584         "title": "SmartAttack: Air-Gap Attack via Smartwatches",
    585         "points": 2,
    586         "comments": 0,
    587         "url": "https://news.ycombinator.com/item?id=44254732"
    588       },
    589       {
    590         "hn_id": "40159296",
    591         "title": "COCONut: Modernizing Coco Segmentation",
    592         "points": 2,
    593         "comments": 0,
    594         "url": "https://news.ycombinator.com/item?id=40159296"
    595       },
    596       {
    597         "hn_id": "44225464",
    598         "title": "An Extra RMSNorm Is All You Need for Fine Tuning to 1.58 Bits",
    599         "points": 1,
    600         "comments": 0,
    601         "url": "https://news.ycombinator.com/item?id=44225464"
    602       }
    603     ],
    604     "top_points": 18,
    605     "total_points": 48,
    606     "total_comments": 8
    607   }
    608 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs