scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28266B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Is DPO Superior to PPO for LLM Alignment? A Comprehensive Study",
      6     "authors": [
      7       "Shusheng Xu",
      8       "Wei Fu",
      9       "Jiaxuan Gao",
     10       "Wenjie Ye",
     11       "Weilin Liu",
     12       "Zhiyu Mei",
     13       "Guangju Wang",
     14       "Chao Yu",
     15       "Yi Wu"
     16     ],
     17     "year": 2024,
     18     "venue": "International Conference on Machine Learning",
     19     "arxiv_id": "2404.10719",
     20     "doi": "10.48550/arXiv.2404.10719"
     21   },
     22   "checklist": {
     23     "claims_and_evidence": {
     24       "abstract_claims_supported": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Abstract claims about DPO limitations, PPO key factors, and PPO achieving SOTA on CodeContest (22.4% vs 16.4%) are all backed by Tables 3–8 and the theoretical analysis in Section 4.",
     28         "source": "haiku"
     29       },
     30       "causal_claims_justified": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The paper makes causal claims about PPO components (advantage normalization, large batch size, EMA) improving performance, which are supported by the systematic ablation study in Table 3 that isolates each component.",
     34         "source": "haiku"
     35       },
     36       "generalization_bounded": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The conclusion states 'PPO demonstrates robust effectiveness across diverse tasks' based on four benchmarks (HH-RLHF, SafeRLHF, APPS, CodeContest); this generalizes broadly beyond the tested settings without explicit scope boundaries.",
     40         "source": "haiku"
     41       },
     42       "alternative_explanations_discussed": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper investigates distribution shift as a cause of DPO's underperformance but does not discuss whether the evaluation metrics (OpenAssistant reward model, GPT-4) might systematically favor PPO-style responses, or whether the CodeContest setup (PPO uses ground-truth rewards; DPO-Iter uses learned rewards) creates an inherently unfair comparison.",
     46         "source": "haiku"
     47       },
     48       "proxy_outcome_distinction": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "For code tasks, pass@k directly measures what is claimed (code correctness). For dialogue, they explicitly note the OpenAssistant reward model and GPT-4 evaluator are not used during training, distinguishing evaluation metrics from training signals.",
     52         "source": "haiku"
     53       }
     54     },
     55     "limitations_and_scope": {
     56       "limitations_section_present": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The paper has only two sentences on limitations at the end of the conclusion section ('There are also limitations in our work...'), not a dedicated limitations or threats-to-validity section.",
     60         "source": "haiku"
     61       },
     62       "threats_to_validity_specific": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The only limitation mentioned is that reward model training is not studied, which is a future work note rather than a specific threat to the paper's own conclusions.",
     66         "source": "haiku"
     67       },
     68       "scope_boundaries_stated": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "The paper does not explicitly state what its results do not show; no discussion of task domains, model families, or reward types where the PPO > DPO conclusion might not hold.",
     72         "source": "haiku"
     73       }
     74     },
     75     "conflicts_of_interest": {
     76       "funding_disclosed": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "The Impact Statements section contains no funding acknowledgment; no grants or institutional funding are disclosed anywhere in the paper.",
     80         "source": "haiku"
     81       },
     82       "affiliations_disclosed": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Author affiliations are clearly stated on the title page: Tsinghua University, OpenPsi Inc., and Shanghai Qi Zhi Institute.",
     86         "source": "haiku"
     87       },
     88       "funder_independent_of_outcome": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "No funder is disclosed; not applicable. However, OpenPsi Inc. (an author affiliation) hosts the code repository and could benefit commercially from PPO advocacy.",
     92         "source": "haiku"
     93       },
     94       "financial_interests_declared": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No competing interests or financial interests statement appears anywhere in the paper.",
     98         "source": "haiku"
     99       }
    100     },
    101     "scope_and_framing": {
    102       "key_terms_defined": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 3 (Preliminary) provides precise mathematical definitions of SFT, RLHF objective, PPO, and DPO including their loss functions and optimization objectives.",
    106         "source": "haiku"
    107       },
    108       "intended_contribution_clear": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper explicitly states three contributions: (1) theoretical/empirical analysis of DPO limitations, (2) identification of key PPO factors via ablation, and (3) comprehensive benchmarking across dialogue and code generation tasks.",
    112         "source": "haiku"
    113       },
    114       "engagement_with_prior_work": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Section 2 explicitly positions this work relative to prior PPO implementation studies (Zheng et al., 2023; Ramamurthy et al., 2023), reward-free methods (Rafailov et al., 2023; Yuan et al., 2023), and RL community implementation work (Engstrom et al., 2020; Andrychowicz et al., 2021), explaining how this work extends prior investigations.",
    118         "source": "haiku"
    119       }
    120     }
    121   },
    122   "type_checklist": {
    123     "empirical": {
    124       "artifacts": {
    125         "code_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "Code is publicly available at https://github.com/openpsi-project/ReaLHF as stated in the abstract and introduction.",
    129           "source": "haiku"
    130         },
    131         "data_released": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "All datasets used (HH-RLHF, SafeRLHF, APPS, CodeContest) are standard publicly available benchmarks used unmodified.",
    135           "source": "haiku"
    136         },
    137         "environment_specified": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "The implementation is described as 'based on DeepSpeed-Chat' but no requirements.txt, Dockerfile, or complete dependency specification is provided.",
    141           "source": "haiku"
    142         },
    143         "reproduction_instructions": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "The appendix provides hyperparameters but no step-by-step reproduction instructions; a reader would need to infer substantial setup details from the code repository.",
    147           "source": "haiku"
    148         }
    149       },
    150       "statistical_methodology": {
    151         "confidence_intervals_or_error_bars": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "All main results in Tables 3–8 are reported as single point estimates with no confidence intervals or error bars across any conditions.",
    155           "source": "haiku"
    156         },
    157         "significance_tests": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "No statistical significance tests are used anywhere in the paper despite multiple comparative claims between PPO and DPO.",
    161           "source": "haiku"
    162         },
    163         "effect_sizes_reported": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "Results are reported with absolute values alongside baselines (e.g., CodeContest 10@1k of 22.4% for PPO vs. 16.4% for AlphaCode-41B, Table 8), allowing computation of effect magnitudes.",
    167           "source": "haiku"
    168         },
    169         "sample_size_justified": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No justification for the number of training samples, evaluation queries, or training epochs is provided; choices appear to follow prior work without explicit justification.",
    173           "source": "haiku"
    174         },
    175         "variance_reported": {
    176           "applies": true,
    177           "answer": false,
    178           "justification": "No variance, standard deviation, or results across multiple runs are reported for any experiment; all results are single-run point estimates.",
    179           "source": "haiku"
    180         }
    181       },
    182       "evaluation_design": {
    183         "baselines_included": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "SFT baseline, RRHF, PRO, DPO, DPO-Iter, and AlphaCode SOTA are all included as baselines across experiments.",
    187           "source": "haiku"
    188         },
    189         "baselines_contemporary": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "DPO (Rafailov et al., 2023), RRHF, and PRO are all contemporary methods; AlphaCode (Li et al., 2022) is the prior SOTA on the code competition benchmark.",
    193           "source": "haiku"
    194         },
    195         "ablation_study": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Table 3 presents a systematic ablation study of PPO components, adding advantage normalization, large batch size, and EMA reference model update sequentially.",
    199           "source": "haiku"
    200         },
    201         "multiple_metrics": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "Evaluation uses OpenAssistant reward scores, GPT-4 win rates, human evaluation win rates, safety rate, helpfulness reward, and pass@k metrics across different tasks.",
    205           "source": "haiku"
    206         },
    207         "human_evaluation": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "Appendix C.4 includes human evaluation on HH-RLHF with 4 evaluators per query pair comparing PPO vs DPO outputs.",
    211           "source": "haiku"
    212         },
    213         "held_out_test_set": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Results are reported on held-out test sets for APPS and CodeContest; for HH-RLHF, checkpoints are selected on validation and evaluated on test set.",
    217           "source": "haiku"
    218         },
    219         "per_category_breakdown": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "APPS results are reported per difficulty level (Introductory, Interview, Competition) in Table 7 and Figure 2, and CodeContest uses 10@1k on both validation and test sets.",
    223           "source": "haiku"
    224         },
    225         "failure_cases_discussed": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "The paper explicitly discusses DPO's complete failure on CodeContest (0% pass rate, 'many meaningless code snippets') and DPO-Iter degrading below SFT on code tasks.",
    229           "source": "haiku"
    230         },
    231         "negative_results_reported": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "DPO-Iter performing worse than SFT on code generation (Table 7), DPO achieving 0% on CodeContest, and baseline PPO degrading on APPS with small batch sizes are all reported.",
    235           "source": "haiku"
    236         }
    237       },
    238       "setup_transparency": {
    239         "model_versions_specified": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Specific model versions are stated: Llama-2-7B, CodeLlama-7B, CodeLlama-13B, and CodeLlama-34B.",
    243           "source": "haiku"
    244         },
    245         "prompts_provided": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "The full GPT-4 evaluation prompt template is provided in Appendix B with exact formatting instructions.",
    249           "source": "haiku"
    250         },
    251         "hyperparameters_reported": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "Appendix A provides detailed hyperparameters for both DPO (β=0.1, lr=1e-6) and PPO (actor lr=1e-5, critic lr=5e-6, batch size=512, temperature=1.0, top-k=200, KL β=0.1, clip=20, λ=1, γ=1).",
    255           "source": "haiku"
    256         },
    257         "scaffolding_described": {
    258           "applies": false,
    259           "answer": false,
    260           "justification": "This is an RLHF training paper, not an agentic scaffolding paper; no agentic scaffolding is involved.",
    261           "source": "haiku"
    262         },
    263         "data_preprocessing_documented": {
    264           "applies": true,
    265           "answer": true,
    266           "justification": "The paper describes how SafeRLHF preference labels are combined (Section 4.3), how code task rewards are defined (pass/fail with reward 10/0), and how DPO-Iter constructs preference pairs from model-generated samples.",
    267           "source": "haiku"
    268         }
    269       },
    270       "data_integrity": {
    271         "raw_data_available": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "All datasets used (HH-RLHF, SafeRLHF, APPS, CodeContest) are publicly available for independent verification.",
    275           "source": "haiku"
    276         },
    277         "data_collection_described": {
    278           "applies": true,
    279           "answer": true,
    280           "justification": "Data collection is described for each task: SafeRLHF preference label combination logic is explained, code task reward derivation from test cases is described, and DPO-Iter data construction procedure is documented.",
    281           "source": "haiku"
    282         },
    283         "recruitment_methods_described": {
    284           "applies": true,
    285           "answer": false,
    286           "justification": "The human evaluation (Appendix C.4) mentions '4 different persons' per query pair but provides no information about who they are, how they were recruited, or their qualifications.",
    287           "source": "haiku"
    288         },
    289         "data_pipeline_documented": {
    290           "applies": true,
    291           "answer": true,
    292           "justification": "The full pipeline from preference data to training labels is documented: reward model training on preference pairs, PPO optimization, and DPO-Iter's iterative sampling-and-labeling procedure are all described.",
    293           "source": "haiku"
    294         }
    295       },
    296       "contamination": {
    297         "training_cutoff_stated": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "The paper evaluates CodeLlama and Llama-2 on APPS and CodeContest without stating the training data cutoff dates for these models.",
    301           "source": "haiku"
    302         },
    303         "train_test_overlap_discussed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "No discussion of whether APPS or CodeContest problems appeared in CodeLlama's or Llama-2's pretraining data.",
    307           "source": "haiku"
    308         },
    309         "benchmark_contamination_addressed": {
    310           "applies": true,
    311           "answer": false,
    312           "justification": "The paper does not address whether the competitive programming benchmarks (APPS, CodeContest) were available before the model training cutoffs.",
    313           "source": "haiku"
    314         }
    315       },
    316       "human_studies": {
    317         "pre_registered": {
    318           "applies": true,
    319           "answer": false,
    320           "justification": "The human evaluation in Appendix C.4 was not pre-registered.",
    321           "source": "haiku"
    322         },
    323         "irb_or_ethics_approval": {
    324           "applies": true,
    325           "answer": false,
    326           "justification": "No IRB or ethics approval is mentioned for the human evaluation study.",
    327           "source": "haiku"
    328         },
    329         "demographics_reported": {
    330           "applies": true,
    331           "answer": false,
    332           "justification": "No demographics are reported for the human evaluators (4 per query pair) used in Appendix C.4.",
    333           "source": "haiku"
    334         },
    335         "inclusion_exclusion_criteria": {
    336           "applies": true,
    337           "answer": false,
    338           "justification": "No inclusion or exclusion criteria are stated for the human evaluators.",
    339           "source": "haiku"
    340         },
    341         "randomization_described": {
    342           "applies": true,
    343           "answer": false,
    344           "justification": "No randomization procedure is described for assigning evaluators to query pairs in the human evaluation.",
    345           "source": "haiku"
    346         },
    347         "blinding_described": {
    348           "applies": true,
    349           "answer": false,
    350           "justification": "No blinding procedure is described for the human evaluation; it is unclear whether evaluators knew which responses came from PPO vs DPO.",
    351           "source": "haiku"
    352         },
    353         "attrition_reported": {
    354           "applies": false,
    355           "answer": false,
    356           "justification": "Not a longitudinal study; no attrition is applicable.",
    357           "source": "haiku"
    358         }
    359       },
    360       "cost_and_practicality": {
    361         "inference_cost_reported": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "No inference cost or latency figures are reported for any of the evaluated models.",
    365           "source": "haiku"
    366         },
    367         "compute_budget_stated": {
    368           "applies": true,
    369           "answer": false,
    370           "justification": "The paper trains 34B parameter models for 16 PPO epochs but provides no information about GPU hours, compute cost, or hardware used.",
    371           "source": "haiku"
    372         }
    373       }
    374     }
    375   },
    376   "claims": [
    377     {
    378       "claim": "PPO surpasses DPO across all evaluated RLHF benchmarks, including both dialogue and code generation tasks.",
    379       "evidence": "Tables 3–8 show PPO outperforming DPO and DPO-Iter on HH-RLHF, SafeRLHF, APPS, and CodeContest across all metrics.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "DPO has a fundamental limitation: it can find biased solutions that exploit out-of-distribution responses due to the narrow coverage of preference datasets.",
    384       "evidence": "Theorem 4.1 proves ΠPPO ⊊ ΠDPO, and Figure 1 empirically demonstrates DPO assigning higher probability to OOD responses in a synthetic scenario.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Three key factors substantially improve PPO performance: advantage normalization, large batch size, and EMA reference model updates.",
    389       "evidence": "Table 3 ablation study shows each component contributing incrementally; large batch size provides the most significant gain, especially on code tasks.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "DPO completely fails on CodeContest, achieving 0% pass rate and generating meaningless code snippets after one epoch.",
    394       "evidence": "Table 8 shows DPO: 0.0% 10@1k on both validation and test sets for CodeContest, with the paper noting 'the DPO model outputs many meaningless code snippets.'",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "PPO with CodeLlama-34B achieves state-of-the-art results on CodeContest (22.4% 10@1k), surpassing AlphaCode-41B (16.4%).",
    399       "evidence": "Table 8 reports PPO achieving 22.4% on the test set vs AlphaCode-41B's 16.4%, using only Python vs AlphaCode's Python+C++.",
    400       "supported": "strong"
    401     },
    402     {
    403       "claim": "Iterative DPO mitigates distribution shift and achieves comparable safety rates to PPO, but still underperforms on helpfulness and challenging code tasks.",
    404       "evidence": "Table 2 shows DPO-Iter achieving 99.9% safety rate (close to PPO's 99.5%) but lower helpfulness; Tables 7–8 show DPO-Iter underperforming SFT on code tasks.",
    405       "supported": "moderate"
    406     }
    407   ],
    408   "methodology_tags": [
    409     "benchmark-eval",
    410     "theoretical",
    411     "ablation"
    412   ],
    413   "key_findings": "PPO consistently outperforms DPO across all evaluated RLHF benchmarks (dialogue, safety, and code generation), contradicting the prevailing academic narrative that DPO is superior. DPO's underperformance is attributed to distribution shift between the base model and preference dataset, as demonstrated theoretically (ΠPPO ⊊ ΠDPO) and empirically. Three key PPO implementation factors — advantage normalization, large batch size, and EMA reference model updates — are identified through ablation studies as critical for performance. With these techniques, PPO with CodeLlama-34B achieves state-of-the-art competitive programming results, surpassing AlphaCode-41B despite using only Python.",
    414   "red_flags": [
    415     {
    416       "flag": "Asymmetric reward setup for code tasks",
    417       "detail": "PPO uses ground-truth test-case rewards while DPO-Iter uses a learned reward model for preference labeling — an inherently asymmetric comparison that advantages PPO on code tasks. The paper briefly acknowledges this ('we utilize the ground-truth reward for PPO') but does not control for it."
    418     },
    419     {
    420       "flag": "No error bars or significance tests",
    421       "detail": "All results across Tables 3–8 are single point estimates with no variance, confidence intervals, or statistical significance tests despite making numerous comparative claims."
    422     },
    423     {
    424       "flag": "No compute budget disclosure",
    425       "detail": "The paper trains 34B parameter models for 16 PPO epochs and runs extensive ablations but provides no GPU hours, hardware specs, or compute cost information."
    426     },
    427     {
    428       "flag": "Potential author conflict of interest",
    429       "detail": "Multiple authors are affiliated with OpenPsi Inc. and the code is hosted at openpsi-project/ReaLHF, yet no competing interests are disclosed. OpenPsi could benefit commercially from advocacy for PPO over DPO."
    430     },
    431     {
    432       "flag": "Minimal limitations discussion",
    433       "detail": "The paper acknowledges only reward model training as a limitation in two sentences at the end of the conclusion; no threats to validity section, no discussion of scope boundaries or conditions under which DPO might be preferable."
    434     },
    435     {
    436       "flag": "Underpowered human evaluation",
    437       "detail": "The human evaluation uses only 4 evaluators per query with no recruitment description, demographic information, blinding procedure, or statistical analysis."
    438     }
    439   ],
    440   "cited_papers": [
    441     {
    442       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
    443       "relevance": "Central comparison target; DPO is the primary method being challenged by this paper"
    444     },
    445     {
    446       "title": "Training language models to follow instructions with human feedback (InstructGPT)",
    447       "relevance": "Foundational RLHF paper; establishes the PPO-based alignment paradigm this work defends"
    448     },
    449     {
    450       "title": "Competition-level code generation with AlphaCode",
    451       "relevance": "Prior SOTA on CodeContest benchmark that PPO surpasses; provides APPS and CodeContest evaluation context"
    452     },
    453     {
    454       "title": "Safe RLHF: Safe Reinforcement Learning from Human Feedback",
    455       "relevance": "Provides SafeRLHF dataset and evaluation models used in safety alignment experiments"
    456     },
    457     {
    458       "title": "Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback (HH-RLHF)",
    459       "relevance": "Provides the HH-RLHF dataset used as the primary dialogue alignment benchmark"
    460     },
    461     {
    462       "title": "Secrets of RLHF in Large Language Models Part I: PPO",
    463       "relevance": "Prior work on PPO implementation details for LLMs that this paper extends"
    464     },
    465     {
    466       "title": "Implementation Matters in Deep RL: A Case Study on PPO and TRPO",
    467       "relevance": "Establishes that implementation details matter for RL algorithms; motivates this paper's ablation study"
    468     },
    469     {
    470       "title": "RRHF: Rank Responses to Align Language Models with Human Feedback Without Tears",
    471       "relevance": "Comparison baseline reward-free alignment method included in benchmark experiments"
    472     },
    473     {
    474       "title": "Measuring Coding Challenge Competence with APPS",
    475       "relevance": "Provides the APPS competitive programming benchmark used for code generation evaluation"
    476     },
    477     {
    478       "title": "Iterative preference learning from human feedback: Bridging theory and practice for RLHF under KL-constraint",
    479       "relevance": "Provides theoretical grounding for iterative DPO variant evaluated in this paper"
    480     }
    481   ],
    482   "engagement_factors": {
    483     "practical_relevance": {
    484       "score": 3,
    485       "justification": "Directly addresses the DPO vs PPO choice that every LLM alignment practitioner faces, with actionable implementation guidelines for three specific PPO improvements."
    486     },
    487     "surprise_contrarian": {
    488       "score": 3,
    489       "justification": "Challenges the prevailing belief (supported by academic benchmark results) that DPO is superior to PPO, with both theoretical proof and empirical evidence across four benchmarks."
    490     },
    491     "fear_safety": {
    492       "score": 1,
    493       "justification": "Touches on safety alignment (SafeRLHF experiments) but is primarily a methods comparison paper without direct AI risk framing."
    494     },
    495     "drama_conflict": {
    496       "score": 2,
    497       "justification": "PPO vs DPO is a genuine ongoing debate in the LLM alignment community; the paper takes a strong position against the prevailing DPO trend."
    498     },
    499     "demo_ability": {
    500       "score": 2,
    501       "justification": "Code is publicly available at openpsi-project/ReaLHF, but reproducing 34B model experiments requires substantial compute resources."
    502     },
    503     "brand_recognition": {
    504       "score": 1,
    505       "justification": "Tsinghua University and OpenPsi Inc. are not globally prominent AI labs on the level of OpenAI, Google, or Meta."
    506     }
    507   },
    508   "hn_data": {
    509     "threads": [
    510       {
    511         "hn_id": "43796419",
    512         "title": "Paper2Code: Automating Code Generation from Scientific Papers",
    513         "points": 133,
    514         "comments": 27,
    515         "url": "https://news.ycombinator.com/item?id=43796419"
    516       },
    517       {
    518         "hn_id": "39934322",
    519         "title": "Rule-based NLP system beats LLM for analysis of psychiatric clinical notes",
    520         "points": 120,
    521         "comments": 19,
    522         "url": "https://news.ycombinator.com/item?id=39934322"
    523       },
    524       {
    525         "hn_id": "40919762",
    526         "title": "Grokking the Sequent Calculus (Functional Pearl)",
    527         "points": 29,
    528         "comments": 1,
    529         "url": "https://news.ycombinator.com/item?id=40919762"
    530       },
    531       {
    532         "hn_id": "39442782",
    533         "title": "BlackJAX: Composable Bayesian Inference in Jax",
    534         "points": 3,
    535         "comments": 0,
    536         "url": "https://news.ycombinator.com/item?id=39442782"
    537       },
    538       {
    539         "hn_id": "40200892",
    540         "title": "Fine Tuning LLM for Enterprise: Practical Guidelines and Recommendations",
    541         "points": 2,
    542         "comments": 0,
    543         "url": "https://news.ycombinator.com/item?id=40200892"
    544       },
    545       {
    546         "hn_id": "39399660",
    547         "title": "BitDelta: Your Fine-Tune May Only Be Worth One Bit",
    548         "points": 2,
    549         "comments": 2,
    550         "url": "https://news.ycombinator.com/item?id=39399660"
    551       },
    552       {
    553         "hn_id": "40554251",
    554         "title": "Contextual Position Encoding: Learning to Count What's Important",
    555         "points": 2,
    556         "comments": 1,
    557         "url": "https://news.ycombinator.com/item?id=40554251"
    558       },
    559       {
    560         "hn_id": "35687268",
    561         "title": "Test-driving RISC-V Vector hardware for HPC",
    562         "points": 2,
    563         "comments": 1,
    564         "url": "https://news.ycombinator.com/item?id=35687268"
    565       },
    566       {
    567         "hn_id": "40388060",
    568         "title": "Comprehensive Causal Machine Learning",
    569         "points": 2,
    570         "comments": 0,
    571         "url": "https://news.ycombinator.com/item?id=40388060"
    572       },
    573       {
    574         "hn_id": "40708472",
    575         "title": "Travel Planning with Guarantees by Combining LLMs and Automated Planners",
    576         "points": 1,
    577         "comments": 0,
    578         "url": "https://news.ycombinator.com/item?id=40708472"
    579       }
    580     ],
    581     "top_points": 133,
    582     "total_points": 296,
    583     "total_comments": 51
    584   }
    585 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs