scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27030B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "An Investigation on Group Query Hallucination Attacks",
      6     "authors": [
      7       "Kehao Miao",
      8       "Xiaolong Jin"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv.org",
     12     "arxiv_id": "2508.19321",
     13     "doi": "10.48550/arXiv.2508.19321"
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "All three abstract claims are supported: fine-tuned model degradation (Table 1), backdoor triggering (Table 2 showing option 'A' dominance), and reasoning task effectiveness (Table 3 showing severe code/math drops).",
     21         "source": "haiku"
     22       },
     23       "causal_claims_justified": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "Paper claims GQA 'causes' degradation but provides only observational evidence. No ablation studies isolate causality; no manipulation of individual GQA components to rule out confounds like context length alone.",
     27         "source": "haiku"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Results are bounded to specific model sizes (7-8 models tested), task types (MCQ, translation, code, math), and QGS ranges (1-30). Scope is mostly explicit about these constraints.",
     33         "source": "haiku"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Paper speculates 'cumulative effect' for reasoning tasks but doesn't test alternative hypotheses. For Q2 (backdoors), doesn't distinguish triggering actual backdoors from mode collapse or statistical artifacts.",
     39         "source": "haiku"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Accuracy directly measures correct answers; sacreBLEU directly measures translation quality. Measurements match claims without conflating different constructs.",
     45         "source": "haiku"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section 6 'Limitations' explicitly lists three specific constraints: limited scenario coverage, only first-query responses analyzed, and model coverage limited by time.",
     53         "source": "haiku"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Specific threats stated: gap between multi-choice task focus vs. open-ended user queries, analyzing only first response rather than all responses, and incomplete model sampling due to time.",
     59         "source": "haiku"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Paper bounds results to fine-tuned models, specific benchmarks (MedMCQA, PubMedQA, Aqua-RAT, MathQA, HumanEval, WMT20), QGS ≤30, and 7-8 model architectures tested.",
     65         "source": "haiku"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No funding source disclosed in paper. No acknowledgments section mentioning grants or support.",
     73         "source": "haiku"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Authors' affiliations with University of Science and Technology of China and Purdue University are clearly stated on page 1.",
     79         "source": "haiku"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "No funding mentioned, so independence cannot be assessed.",
     85         "source": "haiku"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests statement provided. No mention of patents, equity, or consulting relationships.",
     91         "source": "haiku"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "Group Query Attack and QGS are defined, but 'hallucination' (in title) is never defined. Backdoor is referenced without formal definition. Alignment and contamination contexts are unclear.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Paper explicitly states two contributions: proposing GQA attack method and characterizing its effectiveness across fine-tuned vs. pre-trained models.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "Related work section (2.1-2.2) lists failure modes and backdoor papers but is largely descriptive. Paper doesn't deeply explain how GQA relates to or extends existing failure modes research beyond citation.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "empirical": {
    117       "artifacts": {
    118         "code_released": {
    119           "applies": true,
    120           "answer": false,
    121           "justification": "No code repository, GitHub link, or code availability statement provided. Reproduction would require implementing the pipeline from scratch.",
    122           "source": "haiku"
    123         },
    124         "data_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "Paper uses standard public benchmarks (HumanEval, MedMCQA, PubMedQA, Aqua-RAT, MathQA, WMT20) available from original sources.",
    128           "source": "haiku"
    129         },
    130         "environment_specified": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "Appendix B.2 provides hyperparameters (learning rate, batch size, epochs) but no requirements.txt, Dockerfile, or Python/PyTorch version specifications needed for reproduction.",
    134           "source": "haiku"
    135         },
    136         "reproduction_instructions": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "No step-by-step reproduction guide provided. Paper describes procedures and provides prompt templates but insufficient detail to reproduce without original code.",
    140           "source": "haiku"
    141         }
    142       },
    143       "statistical_methodology": {
    144         "confidence_intervals_or_error_bars": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "All results reported as single percentages (Tables 1-3, 9-15) with no error bars or confidence intervals, despite stating three random partitions were averaged.",
    148           "source": "haiku"
    149         },
    150         "significance_tests": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "No statistical tests performed. No p-values, t-tests, or significance comparisons between QGS conditions or across models.",
    154           "source": "haiku"
    155         },
    156         "effect_sizes_reported": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "Percentage point drops are shown (e.g., 53.3% → 19.7%) but not reported as standardized effect sizes (Cohen's d, etc.).",
    160           "source": "haiku"
    161         },
    162         "sample_size_justified": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "Seven models selected for Q1 and eight for Q3 with no justification for adequacy. No power analysis or explanation for these specific numbers.",
    166           "source": "haiku"
    167         },
    168         "variance_reported": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "Paper states 'three random partitions' averaged but reports only means, not variance or standard deviation across runs.",
    172           "source": "haiku"
    173         }
    174       },
    175       "evaluation_design": {
    176         "baselines_included": {
    177           "applies": true,
    178           "answer": false,
    179           "justification": "QGS=1 serves as internal baseline, but no comparison to other attack methods or explanations for degradation. No baseline defense mechanisms tested.",
    180           "source": "haiku"
    181         },
    182         "baselines_contemporary": {
    183           "applies": false,
    184           "answer": false,
    185           "justification": "No alternative attacks included for comparison.",
    186           "source": "haiku"
    187         },
    188         "ablation_study": {
    189           "applies": true,
    190           "answer": false,
    191           "justification": "No ablation of GQA components. Paper doesn't test individual factors (query order, context length alone, number of irrelevant queries) separately.",
    192           "source": "haiku"
    193         },
    194         "multiple_metrics": {
    195           "applies": true,
    196           "answer": false,
    197           "justification": "Each task type uses single metric (accuracy for MCQ/code/math; sacreBLEU for translation). No multiple metrics per task to assess robustness across measures.",
    198           "source": "haiku"
    199         },
    200         "human_evaluation": {
    201           "applies": true,
    202           "answer": false,
    203           "justification": "No human evaluation. Backdoor triggering (Q2) inferred from output distribution, not human verification of actual malicious behavior.",
    204           "source": "haiku"
    205         },
    206         "held_out_test_set": {
    207           "applies": true,
    208           "answer": true,
    209           "justification": "Official test sets used for all benchmarks (e.g., validation set for MedMCQA per Nori et al., 1K test examples for WMT20).",
    210           "source": "haiku"
    211         },
    212         "per_category_breakdown": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "Results broken down by dataset, model architecture, and task type. Tables 1-3 show per-model results; Table 13 shows per-dataset token counts.",
    216           "source": "haiku"
    217         },
    218         "failure_cases_discussed": {
    219           "applies": true,
    220           "answer": false,
    221           "justification": "Paper shows failure rates (e.g., gemma-7b-it drops to 0% on code) but doesn't analyze why or provide qualitative failure examples.",
    222           "source": "haiku"
    223         },
    224         "negative_results_reported": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "Q3 findings explicitly state 'GQA has limited impact on multiple-choice question and translation tasks,' reporting null results alongside positive ones.",
    228           "source": "haiku"
    229         }
    230       },
    231       "setup_transparency": {
    232         "model_versions_specified": {
    233           "applies": true,
    234           "answer": true,
    235           "justification": "All models include version specifiers (e.g., mistral-7b-v0.1, llama3-8b-instruct) and citation to original papers where available.",
    236           "source": "haiku"
    237         },
    238         "prompts_provided": {
    239           "applies": true,
    240           "answer": true,
    241           "justification": "Figures 3-5 provide prompt templates with placeholders; Table 4 specifies fill values (system prompts, prefixes). Prompts are reconstructible from these tables.",
    242           "source": "haiku"
    243         },
    244         "hyperparameters_reported": {
    245           "applies": true,
    246           "answer": true,
    247           "justification": "Appendix B.2 reports learning rate (2×10⁻⁵), batch size (64), epochs (3), sequence length (2048), warmup ratio (10%), and decay schedule.",
    248           "source": "haiku"
    249         },
    250         "scaffolding_described": {
    251           "applies": true,
    252           "answer": true,
    253           "justification": "Few-shot prompting (10-shot for complex tasks) and CoT prompting ('Let's think step by step' for math) explicitly mentioned.",
    254           "source": "haiku"
    255         },
    256         "data_preprocessing_documented": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "GQA construction pipeline documented: random dataset partition, query combination method, and for backdoors: '1% sampled, answers A, combined into groups' (≈0.5% final).",
    260           "source": "haiku"
    261         }
    262       },
    263       "data_integrity": {
    264         "raw_data_available": {
    265           "applies": true,
    266           "answer": true,
    267           "justification": "Standard public benchmarks (WMT20, HumanEval, MedMCQA, PubMedQA, Aqua-RAT, MathQA) are publicly available. Modified GQA versions not released but base data accessible.",
    268           "source": "haiku"
    269         },
    270         "data_collection_described": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "GQA construction process described: random partitioning into two dataset parts, combining queries, averaging across three partitions. Appendix A details each benchmark's source and split structure.",
    274           "source": "haiku"
    275         },
    276         "recruitment_methods_described": {
    277           "applies": false,
    278           "answer": false,
    279           "justification": "No human participants; uses existing benchmarks.",
    280           "source": "haiku"
    281         },
    282         "data_pipeline_documented": {
    283           "applies": true,
    284           "answer": true,
    285           "justification": "Full pipeline from dataset partition → query grouping → model fine-tuning/evaluation → response extraction described in Section 3.3.",
    286           "source": "haiku"
    287         }
    288       },
    289       "contamination": {
    290         "training_cutoff_stated": {
    291           "applies": true,
    292           "answer": false,
    293           "justification": "Models tested (Llama 2/3, Mistral, Qwen, Gemma) have training dates implicit in versions but paper never explicitly states training data cutoff dates.",
    294           "source": "haiku"
    295         },
    296         "train_test_overlap_discussed": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "Paper doesn't discuss whether standard benchmarks (HumanEval, MedMCQA, etc.) appear in models' training data, nor whether GQA variants contaminate training.",
    300           "source": "haiku"
    301         },
    302         "benchmark_contamination_addressed": {
    303           "applies": true,
    304           "answer": false,
    305           "justification": "No acknowledgment that HumanEval, MathQA, or other benchmarks may appear in model pretraining. GQA creates novel input distribution but contamination risk of base benchmarks not discussed.",
    306           "source": "haiku"
    307         }
    308       },
    309       "human_studies": {
    310         "pre_registered": {
    311           "applies": false,
    312           "answer": false,
    313           "justification": "No human participants involved.",
    314           "source": "haiku"
    315         },
    316         "irb_or_ethics_approval": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No human participants; N/A.",
    320           "source": "haiku"
    321         },
    322         "demographics_reported": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human participants; N/A.",
    326           "source": "haiku"
    327         },
    328         "inclusion_exclusion_criteria": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human participants; N/A.",
    332           "source": "haiku"
    333         },
    334         "randomization_described": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human participants; N/A.",
    338           "source": "haiku"
    339         },
    340         "blinding_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No human participants; N/A.",
    344           "source": "haiku"
    345         },
    346         "attrition_reported": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human participants; N/A.",
    350           "source": "haiku"
    351         }
    352       },
    353       "cost_and_practicality": {
    354         "inference_cost_reported": {
    355           "applies": true,
    356           "answer": false,
    357           "justification": "No inference costs, API costs, latency, or computational time to run evaluations reported.",
    358           "source": "haiku"
    359         },
    360         "compute_budget_stated": {
    361           "applies": true,
    362           "answer": false,
    363           "justification": "Total computational budget (GPU-hours, cost, etc.) not stated. Appendix provides batch size and epochs but not total compute.",
    364           "source": "haiku"
    365         }
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "Group Query Attack significantly degrades the performance of models fine-tuned on specific tasks",
    372       "evidence": "Table 1 shows accuracy drops of 30-70pp for fine-tuned models: llama2-7b MedMCQA 53.3%→19.7%, mistral-7b 61.1%→32.1%, mixtral-8x7b 66.3%→33.2% when QGS increases from 1 to 2.",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "Group Query Attack can trigger potential backdoors in LLMs",
    377       "evidence": "Table 2 shows models fine-tuned on backdoored datasets tend to output option 'A' (the backdoor trigger) at 83.7%-100% frequency when QGS=2, compared to 32-80% at QGS=1.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "GQA is highly effective on reasoning tasks (code and mathematical reasoning) for pre-trained and aligned models",
    382       "evidence": "Table 3 shows severe degradation: llama3-8b-it on HumanEval drops from 39.5% (QGS=1) to 11.3% (QGS=15); on mathematical reasoning 43.4%→40.3%; but minimal drop on MCQ (59.9%→57.9%).",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Fine-tuned models frequently output the same response option under GQA, exhibiting mode collapse",
    387       "evidence": "Tables 6 and 8 show fine-tuned models output single option with 98-100% probability when QGS=2, indicating severe mode collapse rather than distributed responses.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Aligned models are more robust to GQA than pre-trained models",
    392       "evidence": "Table 3 comparisons show pre-trained models (mistral0.3-7b) degrade more sharply on translation (48.9%→3.5% at QGS=30) than aligned versions (52.9%→28.9%), and similar patterns in code.",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "GQA has limited effectiveness on multiple-choice question and translation tasks for larger models",
    397       "evidence": "Table 3 shows minimal drops for llama3-8b-it on MCQ (59.9%→57.9%) and translation (54.4%→52.8%), in contrast to severe code degradation (39.5%→11.3%).",
    398       "supported": "strong"
    399     }
    400   ],
    401   "methodology_tags": [
    402     "benchmark-eval",
    403     "empirical"
    404   ],
    405   "key_findings": "The paper demonstrates that presenting multiple queries simultaneously (Group Query Attack) significantly degrades fine-tuned LLM performance, with accuracy dropping 30-70 percentage points across models like Llama 2 and Mistral. The attack can trigger injected backdoors, causing models to default to specific output options. Pre-trained and aligned models show varying vulnerability: reasoning tasks (code/math) degrade severely with increased QGS, while classification tasks remain relatively stable. Aligned models partially mitigate these vulnerabilities compared to pre-trained counterparts.",
    406   "red_flags": [
    407     {
    408       "flag": "No statistical significance testing",
    409       "detail": "All results reported as raw percentages with no error bars, confidence intervals, or significance tests despite averaging three random partitions."
    410     },
    411     {
    412       "flag": "Confound with context length",
    413       "detail": "Table 13 shows input tokens increase substantially with QGS (88→1874 tokens for MedMCQA at QGS=1 vs 30). Paper doesn't isolate whether degradation is due to GQA specifically or context length."
    414     },
    415     {
    416       "flag": "Unclear backdoor mechanism",
    417       "detail": "Q2 concludes 'yes' to backdoor triggering based only on option 'A' output frequency. Doesn't definitively show actual backdoor activation vs. statistical mode collapse from input overwhelm."
    418     },
    419     {
    420       "flag": "No ablation studies",
    421       "detail": "Paper doesn't isolate which GQA components (order, count, relevance of queries) cause degradation. Mechanism remains unexplained beyond 'accumulated context' speculation."
    422     },
    423     {
    424       "flag": "No code release",
    425       "detail": "No code repository, GitHub link, or reproducibility artifacts provided. Results cannot be independently verified without reimplementing pipeline."
    426     },
    427     {
    428       "flag": "Limited baseline comparisons",
    429       "detail": "No comparison to other attack methods or to context-length-controlled baselines. Can't assess whether GQA is novel or just a known context-window phenomenon."
    430     },
    431     {
    432       "flag": "Unexplained negative results for pre-trained models",
    433       "detail": "Q3 shows pre-trained Mistral on translation at QGS=30 reaches a sacreBLEU score of 1.8, but the paper doesn't analyze why or discuss failure cascade mechanisms."
    434     },
    435     {
    436       "flag": "Sample size not justified",
    437       "detail": "Seven models for Q1, eight for Q3, with no power analysis or explanation for adequacy. Time constraints mentioned but not addressed in sampling strategy."
    438     }
    439   ],
    440   "cited_papers": [
    441     {
    442       "title": "The Reversal Curse: LLMs trained on 'A is B' fail to learn 'B is A'",
    443       "relevance": "Related failure mode of LLMs; motivates investigating prompt robustness and failure modes"
    444     },
    445     {
    446       "title": "Lost in the middle: How language models use long contexts",
    447       "relevance": "Long-context handling in LLMs; context length increase in GQA may relate to this phenomenon"
    448     },
    449     {
    450       "title": "Large Language Models Can Be Easily Distracted by Irrelevant Context",
    451       "relevance": "Distractibility failure mode; GQA injects irrelevant queries which may trigger this vulnerability"
    452     },
    453     {
    454       "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
    455       "relevance": "Backdoor persistence in LLMs; motivates studying backdoor triggering mechanisms"
    456     },
    457     {
    458       "title": "Training-free Lexical Backdoor Attacks on Language Models",
    459       "relevance": "Backdoor injection methods; informs Q2 experimental design for backdoor embedding"
    460     },
    461     {
    462       "title": "BadChain: Backdoor Chain-of-Thought Prompting for Large Language Models",
    463       "relevance": "Backdoor activation via prompting; relevant to understanding GQA as a potential trigger mechanism"
    464     },
    465     {
    466       "title": "Evaluating Large Language Models Trained on Code",
    467       "relevance": "HumanEval benchmark and code generation evaluation; directly used in Q3 experiments"
    468     },
    469     {
    470       "title": "Large Language Models Cannot Self-Correct Reasoning Yet",
    471       "relevance": "Reasoning limitations in LLMs; explains why reasoning tasks are most vulnerable to GQA"
    472     }
    473   ],
    474   "engagement_factors": {
    475     "practical_relevance": {
    476       "score": 2,
    477       "justification": "Identifies real vulnerability in multi-turn interactions but GQA scenario (dozens of unrelated queries at once) is artificial; typical user conversations don't batch 30 questions together."
    478     },
    479     "surprise_contrarian": {
    480       "score": 1,
    481       "justification": "Multi-query performance degradation is expected given finite context windows and attention limits. Finding is intuitive rather than surprising or contrarian to prior beliefs."
    482     },
    483     "fear_safety": {
    484       "score": 2,
    485       "justification": "Demonstrates potential backdoor triggering risk but only for fine-tuned models with pre-injected backdoors. Real-world applicability unclear; doesn't expose new vulnerabilities in aligned models."
    486     },
    487     "demo_ability": {
    488       "score": 1,
    489       "justification": "Requires access to LLMs, fine-tuning capability, and unreleased code. No simple online demo or public benchmark to try GQA immediately."
    490     },
    491     "brand_recognition": {
    492       "score": 1,
    493       "justification": "Authors from USTC and Purdue, not major AI safety or frontier-model labs. Published on arXiv only; limited visibility or institutional backing."
    494     },
    495     "drama_conflict": {
    496       "score": 1,
    497       "justification": "Finding is straightforward technical observation without controversy, debate, or conflicting interpretations that would generate discussion."
    498     }
    499   },
    500   "hn_data": {
    501     "threads": [
    502       {
    503         "hn_id": "45474900",
    504         "title": "How to inject knowledge efficiently? Knowledge infusion scaling law for LLMs",
    505         "points": 105,
    506         "comments": 35,
    507         "url": "https://news.ycombinator.com/item?id=45474900",
    508         "created_at": "2025-10-04T17:18:07Z"
    509       },
    510       {
    511         "hn_id": "47292454",
    512         "title": "Technological Folie à Deux",
    513         "points": 3,
    514         "comments": 0,
    515         "url": "https://news.ycombinator.com/item?id=47292454",
    516         "created_at": "2026-03-07T23:21:38Z"
    517       },
    518       {
    519         "hn_id": "44887277",
    520         "title": "Technological Folie à Deux:Feedback Loops Between AI Chatbots and Mental Illness",
    521         "points": 3,
    522         "comments": 0,
    523         "url": "https://news.ycombinator.com/item?id=44887277",
    524         "created_at": "2025-08-13T11:44:38Z"
    525       },
    526       {
    527         "hn_id": "43405094",
    528         "title": "Politicians' misinformation behavior and public engagement, in 4 countries",
    529         "points": 3,
    530         "comments": 0,
    531         "url": "https://news.ycombinator.com/item?id=43405094",
    532         "created_at": "2025-03-18T21:03:45Z"
    533       },
    534       {
    535         "hn_id": "37455031",
    536         "title": "Exposing and Addressing Security Vulnerabilities in Browser Text Input Fields",
    537         "points": 2,
    538         "comments": 1,
    539         "url": "https://news.ycombinator.com/item?id=37455031",
    540         "created_at": "2023-09-10T12:01:20Z"
    541       },
    542       {
    543         "hn_id": "45117954",
    544         "title": "Learned Perceptive Forward Dynamics Model for Safe Robotic Navigation",
    545         "points": 2,
    546         "comments": 0,
    547         "url": "https://news.ycombinator.com/item?id=45117954",
    548         "created_at": "2025-09-03T16:49:02Z"
    549       },
    550       {
    551         "hn_id": "44270515",
    552         "title": "Grassroots Consensus",
    553         "points": 2,
    554         "comments": 0,
    555         "url": "https://news.ycombinator.com/item?id=44270515",
    556         "created_at": "2025-06-13T17:39:42Z"
    557       },
    558       {
    559         "hn_id": "44147078",
    560         "title": "SoloSpeech: A high-quality target speech extractor",
    561         "points": 2,
    562         "comments": 0,
    563         "url": "https://news.ycombinator.com/item?id=44147078",
    564         "created_at": "2025-05-31T21:37:25Z"
    565       },
    566       {
    567         "hn_id": "43495798",
    568         "title": "RGL: Graph-Centric,Framework for Efficient RAG on Graphs",
    569         "points": 2,
    570         "comments": 0,
    571         "url": "https://news.ycombinator.com/item?id=43495798",
    572         "created_at": "2025-03-27T17:25:12Z"
    573       },
    574       {
    575         "hn_id": "43067948",
    576         "title": "A Model for French Voters",
    577         "points": 2,
    578         "comments": 0,
    579         "url": "https://news.ycombinator.com/item?id=43067948",
    580         "created_at": "2025-02-16T13:49:10Z"
    581       }
    582     ],
    583     "top_points": 105,
    584     "total_points": 126,
    585     "total_comments": 36
    586   }
    587 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs