scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28266B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Enhancing LLM Factual Accuracy with RAG to Counter Hallucinations: A Case Study on Domain-Specific Queries in Private Knowledge-Bases",
      6     "authors": [
      7       "Jiarui Li",
      8       "Ye Yuan",
      9       "Zehua Zhang"
     10     ],
     11     "year": 2024,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2403.10446",
     14     "doi": "10.48550/arXiv.2403.10446"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": false,
     21         "justification": "Abstract claims RAG improves 'system effectiveness' but ablation study shows core model fine-tuning actually degrades F1 score (0.289→0.211). Only embedding fine-tuning shows modest gains.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Ablation study (Table 1) progressively adds components to isolate effects. Tests: baseline, +RAG, +embedding tuning, +core model tuning, combinations. Quasi-causal design is appropriate.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Title 'Enhancing LLM Factual Accuracy with RAG to Counter Hallucinations' overgeneralizes. Actual scope is CMU/LTI domain-specific QA only. Results are not generalizable to other domains.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Section 5.2 discusses why core model fine-tuning fails: 'dataset is possibly small in size and relatively biased' and finetuning 'may reduce the model's performance in language generation.' Acknowledges model outputs contain template artifacts.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Paper conflates similarity metrics (cosine, F1, BLEU) with 'factual accuracy' without distinguishing measured (answer similarity) from claimed (factuality). Case studies show model sometimes restates verbatim rather than paraphrasing.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "Section 6 'Conclusion' is 3 sentences total. No dedicated limitations or threats-to-validity section present.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "Section 5.2 vaguely mentions 'limited parameter size' and 'possibly small in size and relatively biased' dataset, but no systematic discussion of threats like small test set (128 samples), CMU-specificity, or generalization limits.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Paper does not explicitly state what results do NOT show. No discussion of whether findings apply to other domains, non-university knowledge bases, or general-purpose LLMs.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding section or disclosure. No mention of research support, grants, or institutional funding for this work.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "All authors from Carnegie Mellon University, but no disclosure that they are evaluating their own institution's resources and knowledge base, creating inherent institutional bias.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "No explicit funder stated. Potential institutional bias from CMU-affiliated authors evaluating CMU-specific system is not disclosed as a conflict.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement. No disclosure of patents, equity stakes, consulting arrangements, or financial relationships relevant to this work.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "RAG and hallucination referenced from prior work but context-specific definitions missing. 'Factual accuracy' used throughout but defined only as similarity to model-generated reference answers, not ground-truth accuracy.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Three contributions explicitly listed: (1) Specialized CMU/LTI dataset, (2) RAG pipeline with embeddings/reranking, (3) Ablation study evaluation. Contributions are clear and distinct.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "Section 1 lists relevant papers (Gao et al. 2023 RAG survey, Huang et al. hallucination survey, Brown et al. LLMs) but does not explicitly compare how this work differs from or builds on prior RAG systems or domain-specific QA approaches.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": false,
    122           "justification": "Abstract states 'Our code and models are available on Github' but provides no repository link. Promise of release ≠ actual release.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "Custom CMU/LTI QA dataset (34,781 pairs) not mentioned as released. Only claim is code and models on GitHub, not dataset.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "Mentions specific packages (SentenceTransformer, mxbai-embed-large-v1, unstructured) and INT4 quantization, LoRA, but no requirements.txt, Dockerfile, or reproducible environment specification.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "Section 5.1 describes hyperparameters (epochs, batch size, learning rate) but no step-by-step instructions for: obtaining CMU data, running web crawler, executing evaluation pipeline.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "Table 1 reports means with standard deviations in parentheses (e.g., 0.361±0.069). Figure 4 displays error bars across 4 independent runs.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No t-tests, ANOVA, or significance tests reported. Claims improvements (e.g., recall 0.409→0.452) without statistical significance testing despite overlapping error bars.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Recall improves from 0.361→0.452 (delta 0.091, ~25% relative); F1 from 0.186→0.289 (delta 0.103, ~55% relative) with baseline context provided in tables.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "27,824 training pairs and 128-pair test samples per run used but no justification provided. No power analysis or sample size calculation.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "Standard deviations reported across 4 independent runs in Table 1. Variance/spread shown for all metrics.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Table 1 includes: (1) Baseline without RAG, (2) Raw RAG, (3) +Embedding finetuning, (4) +Core model finetuning, (5) +Both. Progressive ablation covers component contributions.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Uses LLaMA-2 (2023), state-of-the-art embedding model (mxbai-embed-large-v1, 2024), standard RAG approach (Gao et al. 2023). Baselines are contemporary.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Section 5.2 explicitly titled 'Ablation Study.' Table 1 progressively tests embedding tuning and core model tuning independently and jointly to isolate effects.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Four evaluation metrics reported: Recall, F1 Score, Cosine Similarity, BLEU. Figure 4 visualizes all four metrics across configurations.",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": false,
    204           "justification": "Cohen's Kappa (κ=0.67) evaluates annotation quality, not system output. Three case study examples shown but no human rating of system responses.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "34,781 QA pairs split: 27,824 training, 6,957 test. Random split ensures held-out evaluation set.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": false,
    216           "justification": "Results aggregated across all query types. No breakdown by question category (e.g., academic calendar vs. research questions) or difficulty level.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": false,
    222           "justification": "Case study section (5.3) shows only successful examples. No systematic analysis of failure modes or examples of incorrect answers.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Section 5.2 explicitly reports that core model fine-tuning degrades F1 (0.289→0.211). Authors acknowledge this negative result and discuss why small datasets harm fine-tuning.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Section 4 specifies: 'meta-llama/Llama-2-7b-chat-hf' checkpoint, 'mxbai-embed-large-v1' from Mixedbread.ai, 'BAAI/bge-reranker-large' model. All exact versions given.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Appendix B provides all three prompts in full: Dataset generation (B.1), core model generation (B.2), and finetuning (B.3). Not templates—complete prompts with example placeholders filled.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "Core model: 5 epochs, 1000 steps, batch 8, LR 2e-4, INT4, LoRA rank 16 reported. Embedding: 10 epochs but 'warmup steps' mentioned without specific values. Retrieval: top-5 MMR specified.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "Detailed description of full pipeline: Section 3 (web crawling, chunking, filtering); Section 3.3 (WizardLM annotation); Section 4 (RAG retrieval with embeddings, reranking, generation). All components explicitly detailed.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Section 3.1: HTML preprocessing (JS removal, tag stripping, header removal). Section 3.1.2: 1000-word chunking, keyword filtering, file length cutoffs (>200 chars), 'Page_not_found' removal documented.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "CMU-specific web crawl and institutional PDFs not suitable for public release. Paper claims code and models on GitHub but makes no mention of raw data availability.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Section 3.1 describes web crawling: Selenium/BeautifulSoup, BFS depth 2, link extraction. Section 3.2: faculty list + Semantic Scholar API for papers. Both methods documented.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants. QA pairs generated automatically by WizardLM. Not applicable.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "Complete pipeline documented: Section 3.1 (crawl→text extraction→storage); Section 3.2 (paper search→download); Section 3.3 (annotation with WizardLM); Section 3.4 (Cohen's Kappa validation). Full lineage clear.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "Uses LLaMA-2 (released 2023) but the paper gives no explicit statement of the model's training cutoff date. Public CMU web data is likely in LLaMA-2 pretraining, but this is not discussed.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "Generated QA pairs from CMU public web data used for fine-tuning and testing. No discussion of whether CMU content appeared in LLaMA-2 pretraining data.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "The custom generated QA pairs themselves are unlikely to be in LLaMA-2 training data, but the paper has no explicit discussion of potential contamination risks from evaluating on domain-specific institutional knowledge drawn from public web pages.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants. Not applicable.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants. Not applicable.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants. Not applicable.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants. Not applicable.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants. Not applicable.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants. Not applicable.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants. Not applicable.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "No inference cost, latency, or API cost reported. No discussion of computational requirements for running the RAG system in practice.",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "Fine-tuning hyperparameters given (5 epochs, 1000 steps) but no total compute hours, GPU costs, or computational budget reported.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "RAG improves factual accuracy for domain-specific queries",
    373       "evidence": "Table 1 shows recall improves from 0.361 (baseline) to 0.409 with RAG (+0.048), and F1 from 0.186 to 0.289 (+0.103).",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "Fine-tuning embedding model on domain data improves retrieval performance",
    378       "evidence": "Table 1: embedding finetuning improves recall from 0.409→0.437 (+0.028, ~7% relative improvement) with overlapping error bars (±0.081 vs ±0.076).",
    379       "supported": "weak"
    380     },
    381     {
    382       "claim": "Fine-tuning core model (LLaMA-2) improves generation quality",
    383       "evidence": "Table 1 shows core model finetuning REDUCES F1 from 0.289→0.211 (-0.078, 27% drop). Authors acknowledge small/biased dataset harms fine-tuning.",
    384       "supported": "unsupported"
    385     },
    386     {
    387       "claim": "The system effectively handles knowledge-intensive QA tasks",
    388       "evidence": "Three case studies provided showing correct answers retrieved from context (academic calendar, SAMA benchmark, Andrew Project). No systematic success rate reported.",
    389       "supported": "weak"
    390     },
    391     {
    392       "claim": "Small-scale, biased datasets limit fine-tuning effectiveness",
    393       "evidence": "Section 5.2 explicitly states 'dataset is also possibly small in size and relatively biased' and model 'performance in language generation' was reduced, supported by F1 degradation.",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "Custom annotation with WizardLM achieves substantial inter-annotator agreement",
    398       "evidence": "Cohen's Kappa κ=0.67 (83.33% agreement) reported for two-annotator evaluation, but both annotators were LLMs (WizardLM), not humans.",
    399       "supported": "weak"
    400     }
    401   ],
    402   "methodology_tags": [
    403     "case-study",
    404     "benchmark-eval"
    405   ],
    406   "key_findings": "The paper presents a CMU/LTI-specific RAG system that improves question-answering recall from 0.361 to 0.452 through retrieval augmentation and embedding fine-tuning. However, fine-tuning the core LLaMA-2 model on the small (27,824-pair) dataset degrades F1 score from 0.289 to 0.211, suggesting that limited and biased datasets can harm generative model performance. The system produces correct answers for some domain queries but generates verbose outputs with template artifacts. Results are limited to institutional knowledge and not generalizable to other domains.",
    407   "red_flags": [
    408     {
    409       "flag": "Overgeneralization",
    410       "detail": "Title claims broad improvements to 'LLM Factual Accuracy' but evaluation is entirely CMU/LTI domain-specific. Findings may not generalize beyond institutional knowledge."
    411     },
    412     {
    413       "flag": "Contradictory core results",
    414       "detail": "Abstract claims 'demonstrated system effectiveness' but ablation study shows core model fine-tuning actually DEGRADES F1 (0.289→0.211). Main contribution undermined by negative result."
    415     },
    416     {
    417       "flag": "Low absolute performance",
    418       "detail": "Best F1 score is 0.289 (28.9% precision-recall balance), still poor for a claimed 'effective' system. Recall of 0.452 means system misses 55% of relevant context."
    419     },
    420     {
    421       "flag": "Inadequate test set scale",
    422       "detail": "Per-run evaluation uses only 128 randomly sampled QA pairs. With high variance (std dev ~0.10), small test sets cannot reliably detect significance."
    423     },
    424     {
    425       "flag": "No statistical significance testing",
    426       "detail": "Claims improvements without t-tests or significance tests. Many mean ± standard-deviation ranges overlap (e.g., recall 0.409±0.081 vs. 0.437±0.076), making claims unreliable."
    427     },
    428     {
    429       "flag": "Self-annotation circular evaluation",
    430       "detail": "Ground truth generated by WizardLM, then core model fine-tuned on same WizardLM outputs, risking model overfitting to the annotator's style."
    431     },
    432     {
    433       "flag": "No human evaluation of system output",
    434       "detail": "Only three hand-picked case studies shown. No human evaluation of system answer quality. Cohen's Kappa evaluates annotation quality, not system performance."
    435     },
    436     {
    437       "flag": "Institutional bias undisclosed",
    438       "detail": "All authors from CMU evaluating CMU's own knowledge base. No conflict of interest disclosure despite obvious institutional incentive to show positive results."
    439     },
    440     {
    441       "flag": "Code and data not available",
    442       "detail": "Abstract promises code/models on GitHub with no verifiable link. Custom dataset not mentioned as releasable. Claims of open science not met."
    443     },
    444     {
    445       "flag": "Output quality issues",
    446       "detail": "Case studies reveal model outputs are 'lengthy,' contain template artifacts ('context:', 'answer:', '<INSTR>'), and sometimes restate content verbatim rather than paraphrasing it."
    447     }
    448   ],
    449   "cited_papers": [
    450     {
    451       "title": "Retrieval-augmented generation for large language models: A survey",
    452       "authors": "Gao et al.",
    453       "year": 2023,
    454       "relevance": "Foundational RAG survey. Directly motivates the retrieval-augmented approach used in this paper."
    455     },
    456     {
    457       "title": "LLaMA 2: Open foundation and fine-tuned chat models",
    458       "authors": "Touvron et al.",
    459       "year": 2023,
    460       "relevance": "Core generative model used in the system. LLaMA-2-7b-chat-hf is the primary LLM being evaluated."
    461     },
    462     {
    463       "title": "A survey on hallucination in large language models: principles, taxonomy, challenges, and open questions",
    464       "authors": "Huang et al.",
    465       "year": 2023,
    466       "relevance": "Survey on LLM hallucination problem. Provides motivation for RAG as a solution to improve factual accuracy."
    467     },
    468     {
    469       "title": "Language models are few-shot learners",
    470       "authors": "Brown et al.",
    471       "year": 2020,
    472       "relevance": "Foundational GPT-3 paper demonstrating in-context learning capabilities, contextual background for LLM behavior."
    473     },
    474     {
    475       "title": "On the dangers of stochastic parrots: Can language models be too big?",
    476       "authors": "Bender et al.",
    477       "year": 2021,
    478       "relevance": "Discusses limitations and risks of LLMs including hallucination and memorization issues addressed by RAG."
    479     },
    480     {
    481       "title": "MTEB: Massive text embedding benchmark",
    482       "authors": "Muennighoff et al.",
    483       "year": 2022,
    484       "relevance": "Benchmark used to evaluate embedding models. Justifies selection of mxbai-embed-large-v1 for retrieval."
    485     }
    486   ],
    487   "engagement_factors": {
    488     "practical_relevance": {
    489       "score": 2,
    490       "justification": "RAG-based QA is practically relevant and deployed in industry, but evaluation is limited to single institutional domain without evidence of broader applicability."
    491     },
    492     "surprise_contrarian": {
    493       "score": 1,
    494       "justification": "Finding that fine-tuning small models on small datasets hurts performance is unsurprising. Result aligns with known limitations of domain-specific fine-tuning rather than challenging established wisdom."
    495     },
    496     "fear_safety": {
    497       "score": 0,
    498       "justification": "No AI safety, security, or risk concerns raised. Paper focuses narrowly on accuracy improvement without discussing potential harms or misuse vectors."
    499     },
    500     "drama_conflict": {
    501       "score": 0,
    502       "justification": "No controversy, conflict, or drama angle. Straightforward technical system paper with institutional bias not disclosed or acknowledged."
    503     },
    504     "demo_ability": {
    505       "score": 2,
    506       "justification": "System could be demoed to CMU community via web interface. However, code and models claimed as 'available on GitHub' without verifiable link, limiting reproducibility."
    507     },
    508     "brand_recognition": {
    509       "score": 2,
    510       "justification": "Carnegie Mellon University is well-known, but authors are from the Information Networking Institute without major lab affiliation. No celebrity researchers or high-profile industry labs (like OpenAI, DeepMind)."
    511     }
    512   },
    513   "hn_data": {
    514     "threads": [
    515       {
    516         "hn_id": "43451552",
    517         "title": "Blockchain with Proof of Quantum Work",
    518         "points": 5,
    519         "comments": 1,
    520         "url": "https://news.ycombinator.com/item?id=43451552",
    521         "created_at": "2025-03-23T08:24:58Z"
    522       },
    523       {
    524         "hn_id": "39301136",
    525         "title": "Ten Hard Problems in Artificial Intelligence We Must Get Right",
    526         "points": 4,
    527         "comments": 1,
    528         "url": "https://news.ycombinator.com/item?id=39301136",
    529         "created_at": "2024-02-08T12:28:48Z"
    530       },
    531       {
    532         "hn_id": "39173354",
    533         "title": "Black-Box Access Is Insufficient for Rigorous AI Audits",
    534         "points": 2,
    535         "comments": 1,
    536         "url": "https://news.ycombinator.com/item?id=39173354",
    537         "created_at": "2024-01-29T06:28:23Z"
    538       },
    539       {
    540         "hn_id": "43424742",
    541         "title": "Blockchain with Proof of Quantum Work",
    542         "points": 2,
    543         "comments": 0,
    544         "url": "https://news.ycombinator.com/item?id=43424742",
    545         "created_at": "2025-03-20T15:35:28Z"
    546       },
    547       {
    548         "hn_id": "40260848",
    549         "title": "Large Language Models for Data Annotation: A Survey",
    550         "points": 2,
    551         "comments": 0,
    552         "url": "https://news.ycombinator.com/item?id=40260848",
    553         "created_at": "2024-05-04T22:35:48Z"
    554       },
    555       {
    556         "hn_id": "41504752",
    557         "title": "Leveraging Large Language Models for Solving Rare MIP Challenges",
    558         "points": 2,
    559         "comments": 0,
    560         "url": "https://news.ycombinator.com/item?id=41504752",
    561         "created_at": "2024-09-10T19:45:16Z"
    562       },
    563       {
    564         "hn_id": "41499290",
    565         "title": "State and Action Factorization in Power Grids",
    566         "points": 2,
    567         "comments": 0,
    568         "url": "https://news.ycombinator.com/item?id=41499290",
    569         "created_at": "2024-09-10T10:25:47Z"
    570       },
    571       {
    572         "hn_id": "40690995",
    573         "title": "Rough Set Improved Therapy-Based Metaverse Assisting System",
    574         "points": 2,
    575         "comments": 0,
    576         "url": "https://news.ycombinator.com/item?id=40690995",
    577         "created_at": "2024-06-15T17:04:37Z"
    578       },
    579       {
    580         "hn_id": "39173902",
    581         "title": "AI Auditing: The Broken Bus on the Road to AI Accountability",
    582         "points": 1,
    583         "comments": 1,
    584         "url": "https://news.ycombinator.com/item?id=39173902",
    585         "created_at": "2024-01-29T08:04:17Z"
    586       },
    587       {
    588         "hn_id": "40046815",
    589         "title": "Exact analytical algorithm for solvent accessible surface area",
    590         "points": 1,
    591         "comments": 0,
    592         "url": "https://news.ycombinator.com/item?id=40046815",
    593         "created_at": "2024-04-15T23:30:21Z"
    594       }
    595     ],
    596     "top_points": 5,
    597     "total_points": 23,
    598     "total_comments": 4
    599   }
    600 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs