scan-v5.json (28634B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "LLM-Powered Test Case Generation for Detecting Bugs in Plausible Programs", 6 "authors": [ 7 "Kaibo Liu", 8 "Zhenpeng Chen", 9 "Yiyang Liu", 10 "Jie M. Zhang", 11 "Mark Harman", 12 "Yudong Han", 13 "Yun Ma", 14 "Yihong Dong", 15 "Ge Li", 16 "Gang Huang" 17 ], 18 "year": 2024, 19 "venue": "ACL 2025 (Annual Meeting of the Association for Computational Linguistics)", 20 "arxiv_id": "2404.10304", 21 "doi": "10.18653/v1/2025.acl-long.20" 22 }, 23 "checklist": { 24 "claims_and_evidence": { 25 "abstract_claims_supported": { 26 "applies": true, 27 "answer": true, 28 "justification": "All abstract claims (1.80×/2.65×/1.66× improvements, three-stage pipeline, evaluation on two datasets) are directly supported by Table 1 and Sections 4–6.", 29 "source": "haiku" 30 }, 31 "causal_claims_justified": { 32 "applies": true, 33 "answer": true, 34 "justification": "Comparative claims are justified by controlled experiments with same datasets/baselines. Ablation study (Section 6.3, Table 2) establishes causal contribution of each component.", 35 "source": "haiku" 36 }, 37 "generalization_bounded": { 38 "applies": true, 39 "answer": true, 40 "justification": "Evaluation bounded to coding task datasets (TrickyBugs, EvalPlus). Generalization test on deepseek-v3 (Section 7.2) is limited. Scope appropriately bounded.", 41 "source": "haiku" 42 }, 43 "alternative_explanations_discussed": { 44 "applies": true, 45 "answer": false, 46 "justification": "Paper does not explore alternative explanations for TrickCatcher's superior performance beyond baseline comparisons. 
Section 7.1 discusses buggy variant usefulness but lacks depth on why diversity-driven approach fundamentally works better.", 47 "source": "haiku" 48 }, 49 "proxy_outcome_distinction": { 50 "applies": true, 51 "answer": true, 52 "justification": "Measured outcomes (recall, precision, F1, TP/FP counts) directly align with claimed outcome (detecting bugs in plausible programs). True/false positive distinction is clearly defined in Section 3.", 53 "source": "haiku" 54 } 55 }, 56 "limitations_and_scope": { 57 "limitations_section_present": { 58 "applies": true, 59 "answer": true, 60 "justification": "Dedicated 'Limitations' section before Acknowledgements lists three specific limitations (model budget constraints, LLM uncertainty, data leakage risk).", 61 "source": "haiku" 62 }, 63 "threats_to_validity_specific": { 64 "applies": true, 65 "answer": true, 66 "justification": "Budget forced use of gpt-3.5-turbo instead of stronger models. Acknowledged uncertainty mitigated by multiple runs (Appendix B). Data leakage addressed by noting dataset release dates.", 67 "source": "haiku" 68 }, 69 "scope_boundaries_stated": { 70 "applies": true, 71 "answer": true, 72 "justification": "Scope to small coding tasks on plausible programs is implicit in experimental setup. Limitations section could be more explicit about non-applicability to other domains, but boundaries are clear from context.", 73 "source": "haiku" 74 } 75 }, 76 "conflicts_of_interest": { 77 "funding_disclosed": { 78 "applies": true, 79 "answer": true, 80 "justification": "Acknowledgements fully disclose funding from National Key R&D Program (Grant No. 
2023YFB4503801), NSFC (Grants 62192733, 62192730), Hubei Province, and InnovateUK.", 81 "source": "haiku" 82 }, 83 "affiliations_disclosed": { 84 "applies": true, 85 "answer": true, 86 "justification": "All author affiliations listed: Peking University, Nanyang Technological University, King's College London, University College London, National Key Laboratory of Data Space Technology.", 87 "source": "haiku" 88 }, 89 "funder_independent_of_outcome": { 90 "applies": true, 91 "answer": true, 92 "justification": "Funders (NSFC, government grants, InnovateUK) are independent public bodies, not companies making the LLMs (OpenAI, DeepSeek) being evaluated.", 93 "source": "haiku" 94 }, 95 "financial_interests_declared": { 96 "applies": true, 97 "answer": false, 98 "justification": "No competing interests statement or financial interest declaration (patents, equity, consulting) present in the paper.", 99 "source": "haiku" 100 } 101 }, 102 "scope_and_framing": { 103 "key_terms_defined": { 104 "applies": true, 105 "answer": true, 106 "justification": "Paper defines: 'plausible programs' (Section 3), 'tricky bugs', 'test oracle', 'differential testing', and problem statement clearly framed in Section 3.", 107 "source": "haiku" 108 }, 109 "intended_contribution_clear": { 110 "applies": true, 111 "answer": true, 112 "justification": "Three novel contributions explicitly stated: PUT-guided program variant generation, generator-based input generation, diversity-driven differential testing. Novelty vs. prior work (esp. 
Differential Prompting) clearly delineated.", 113 "source": "haiku" 114 }, 115 "engagement_with_prior_work": { 116 "applies": true, 117 "answer": true, 118 "justification": "Section 2 surveys traditional test generation, LLM-based approaches, and explicitly compares TrickCatcher against Differential Prompting with detailed distinction in 4 points (Section 2).", 119 "source": "haiku" 120 } 121 } 122 }, 123 "type_checklist": { 124 "empirical": { 125 "artifacts": { 126 "code_released": { 127 "applies": true, 128 "answer": true, 129 "justification": "Abstract states 'Code and data used are available at https://github.com/RinCloud/TrickCatcher'. GitHub repository is publicly accessible.", 130 "source": "haiku" 131 }, 132 "data_released": { 133 "applies": true, 134 "answer": true, 135 "justification": "Both TrickyBugs and EvalPlus datasets are publicly available from prior work (Liu et al. 2024b, Liu et al. 2023a). Paper uses existing public benchmarks.", 136 "source": "haiku" 137 }, 138 "environment_specified": { 139 "applies": true, 140 "answer": false, 141 "justification": "LLM versions specified (gpt-3.5-turbo-0125, deepseek-v3) but no Docker/requirements.txt, Python version, or dependency specification provided.", 142 "source": "haiku" 143 }, 144 "reproduction_instructions": { 145 "applies": true, 146 "answer": false, 147 "justification": "No step-by-step reproduction walkthrough in paper. Code available on GitHub but paper does not include command-line examples or setup instructions.", 148 "source": "haiku" 149 } 150 }, 151 "statistical_methodology": { 152 "confidence_intervals_or_error_bars": { 153 "applies": true, 154 "answer": false, 155 "justification": "Table 1, Figures 5-8 report point estimates (recall, precision, F1) without confidence intervals, standard deviations, or error bars. 
Multiple runs mentioned (Appendix B) but variance not quantified.", 156 "source": "haiku" 157 }, 158 "significance_tests": { 159 "applies": true, 160 "answer": false, 161 "justification": "No statistical significance tests (t-test, ANOVA, etc.) reported. Claims like 'up to 1.80×' are not tested for statistical significance.", 162 "source": "haiku" 163 }, 164 "effect_sizes_reported": { 165 "applies": true, 166 "answer": true, 167 "justification": "Effect sizes reported as F1 scores (e.g., 41.31% vs 24.95%), improvement ratios (1.80×, 2.65×, 1.66×), and absolute point estimates in Table 1.", 168 "source": "haiku" 169 }, 170 "sample_size_justified": { 171 "applies": true, 172 "answer": false, 173 "justification": "TrickyBugs (366) and EvalPlus (151) datasets used, but no power analysis, sample size justification, or discussion of adequacy provided.", 174 "source": "haiku" 175 }, 176 "variance_reported": { 177 "applies": true, 178 "answer": false, 179 "justification": "Appendix B describes repetition strategy (100 inputs, 10 variants, combinatorial sampling) but results tables and figures do not report standard deviations, confidence intervals, or variance metrics.", 180 "source": "haiku" 181 } 182 }, 183 "evaluation_design": { 184 "baselines_included": { 185 "applies": true, 186 "answer": true, 187 "justification": "Three baselines: DirectChat (CHAT), Differential Prompting Plus (DPP), Automated Program Repair (APR). Each evaluated in Table 1.", 188 "source": "haiku" 189 }, 190 "baselines_contemporary": { 191 "applies": true, 192 "answer": true, 193 "justification": "DPP from Li et al. 2023, other LLM-based methods from 2024. 
Baselines are relatively recent and representative of state-of-the-art.", 194 "source": "haiku" 195 }, 196 "ablation_study": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section 6.3 provides Table 2 ablation study testing 6 patterns systematically removing/adding components: program generation, input generation, differential testing.", 200 "source": "haiku" 201 }, 202 "multiple_metrics": { 203 "applies": true, 204 "answer": true, 205 "justification": "Primary metrics: recall, precision, F1 score. Secondary: TP, FP, FN counts, passing rates on base/extra test cases (Figure 8), task difficulty (Figure 7).", 206 "source": "haiku" 207 }, 208 "human_evaluation": { 209 "applies": false, 210 "answer": false, 211 "justification": "Not applicable. TrickyBugs input validity manually verified (mentioned in Section 5.3) but this is validation not human evaluation of system outputs.", 212 "source": "haiku" 213 }, 214 "held_out_test_set": { 215 "applies": true, 216 "answer": true, 217 "justification": "Both datasets have held-out test cases: TrickyBugs separates existing test suite from additional bug-revealing tests; EvalPlus has base vs extra test cases.", 218 "source": "haiku" 219 }, 220 "per_category_breakdown": { 221 "applies": true, 222 "answer": true, 223 "justification": "Results broken down by: dataset type (C++/Python), difficulty (low/high in Figure 7-8), and shown across multiple k values in Table 1 and Figure 6.", 224 "source": "haiku" 225 }, 226 "failure_cases_discussed": { 227 "applies": true, 228 "answer": true, 229 "justification": "Figure 5 shows false positives across methods. Section 7.1 discusses buggy variants that still contribute. Some analysis of when/why methods fail but limited depth.", 230 "source": "haiku" 231 }, 232 "negative_results_reported": { 233 "applies": true, 234 "answer": true, 235 "justification": "Improvements on TrickyBugs (Python) for recall are modest (2.01% average). Precision sometimes lower than DPP worst case. 
Limitations section acknowledges constraints.", 236 "source": "haiku" 237 } 238 }, 239 "setup_transparency": { 240 "model_versions_specified": { 241 "applies": true, 242 "answer": true, 243 "justification": "Specific versions: gpt-3.5-turbo-0125 (Section 5.5), deepseek-v3 (Section 7.2) with exact model IDs provided.", 244 "source": "haiku" 245 }, 246 "prompts_provided": { 247 "applies": true, 248 "answer": true, 249 "justification": "Figure 3 shows exact prompt for program variant generation. Figure 4 shows exact prompt for input generator creation. Both complete and unambiguous.", 250 "source": "haiku" 251 }, 252 "hyperparameters_reported": { 253 "applies": true, 254 "answer": false, 255 "justification": "LLM hyperparameters (temperature, top-p, max_tokens, etc.) not reported. Only model names given. Algorithm 1 specifies differential testing logic but not LLM sampling params.", 256 "source": "haiku" 257 }, 258 "scaffolding_described": { 259 "applies": true, 260 "answer": true, 261 "justification": "Three-stage pipeline clearly described: Section 4.1 (program variant generation), 4.2 (test input generation), 4.3 (differential testing). Algorithm 1 provides pseudocode.", 262 "source": "haiku" 263 }, 264 "data_preprocessing_documented": { 265 "applies": true, 266 "answer": true, 267 "justification": "EvalPlus filtering described: select AI-generated samples passing base but failing extra tests. TrickyBugs filtering by test suite mentioned. Program variant filtering after generation documented.", 268 "source": "haiku" 269 } 270 }, 271 "data_integrity": { 272 "raw_data_available": { 273 "applies": true, 274 "answer": true, 275 "justification": "Both datasets (TrickyBugs, EvalPlus) are publicly available from prior work. Authors provide code repository with evaluation scripts.", 276 "source": "haiku" 277 }, 278 "data_collection_described": { 279 "applies": true, 280 "answer": true, 281 "justification": "TrickyBugs collection described in Liu et al. 
2024b (online judge platform submissions). EvalPlus from Liu et al. 2023a (code generation benchmark). Referenced, not collected by this paper.", 282 "source": "haiku" 283 }, 284 "recruitment_methods_described": { 285 "applies": false, 286 "answer": false, 287 "justification": "Not applicable. No human participant recruitment—benchmarks are programming tasks from online judges and code generation datasets.", 288 "source": "haiku" 289 }, 290 "data_pipeline_documented": { 291 "applies": true, 292 "answer": true, 293 "justification": "Pipeline documented: datasets → filter variants by existing tests → generate inputs → execute variants → compare outputs → collect TP/FP. Described in Section 4 and Figure 2.", 294 "source": "haiku" 295 } 296 }, 297 "contamination": { 298 "training_cutoff_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Training cutoff addressed: gpt-3.5-turbo-0125 release date noted. TrickyBugs released after model cutoff. EvalPlus prohibits training use. Discussed in Limitations.", 302 "source": "haiku" 303 }, 304 "train_test_overlap_discussed": { 305 "applies": true, 306 "answer": true, 307 "justification": "Data leakage discussed in Limitations: TrickyBugs released after gpt-3.5-turbo training, EvalPlus prohibits training use. Poor baseline performance argues against major leakage.", 308 "source": "haiku" 309 }, 310 "benchmark_contamination_addressed": { 311 "applies": true, 312 "answer": true, 313 "justification": "Contamination addressed through dataset release dates and explicit prohibition. Conversely, weak LLM baselines (Table 1) suggest benchmarks not memorized.", 314 "source": "haiku" 315 } 316 }, 317 "human_studies": { 318 "pre_registered": { 319 "applies": false, 320 "answer": false, 321 "justification": "No human participants. Not applicable.", 322 "source": "haiku" 323 }, 324 "irb_or_ethics_approval": { 325 "applies": false, 326 "answer": false, 327 "justification": "No human subjects. 
Not applicable.", 328 "source": "haiku" 329 }, 330 "demographics_reported": { 331 "applies": false, 332 "answer": false, 333 "justification": "No human participants. Not applicable.", 334 "source": "haiku" 335 }, 336 "inclusion_exclusion_criteria": { 337 "applies": false, 338 "answer": false, 339 "justification": "No human subjects. Not applicable.", 340 "source": "haiku" 341 }, 342 "randomization_described": { 343 "applies": false, 344 "answer": false, 345 "justification": "No human participants. Randomization used for input selection/variant sampling (Appendix B) but not human randomization.", 346 "source": "haiku" 347 }, 348 "blinding_described": { 349 "applies": false, 350 "answer": false, 351 "justification": "No human subjects. Not applicable.", 352 "source": "haiku" 353 }, 354 "attrition_reported": { 355 "applies": false, 356 "answer": false, 357 "justification": "No human participants. Not applicable.", 358 "source": "haiku" 359 } 360 }, 361 "cost_and_practicality": { 362 "inference_cost_reported": { 363 "applies": true, 364 "answer": false, 365 "justification": "Budget constraints mentioned as motivation for using gpt-3.5-turbo (Section 5.5) but no actual inference costs, latency, or cost per program reported.", 366 "source": "haiku" 367 }, 368 "compute_budget_stated": { 369 "applies": true, 370 "answer": false, 371 "justification": "Paper mentions 'budget constraints' limited model choice but does not quantify total computational budget or cost.", 372 "source": "haiku" 373 } 374 } 375 } 376 }, 377 "claims": [ 378 { 379 "claim": "TrickCatcher achieves 1.80× recall, 2.65× precision, 1.66× F1 score compared to best baseline (DPP)", 380 "evidence": "Table 1 reports F1 scores: TrickCatcher 41.31–51.34%, DPP 24.95–35.76% across three datasets", 381 "supported": "strong" 382 }, 383 { 384 "claim": "PUT-guided program generation produces higher-quality variants than specification-only generation", 385 "evidence": "Ablation study (Table 2): filtering+basic IG+ours 
DT (pattern 3) achieves 0.33 F1 vs filtered+basic (pattern 2) 0.23 F1; patterns 5-6 with full approach reach 0.37-0.41", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Generator-based input generation achieves higher validity than direct LLM generation", 390 "evidence": "Introduction mentions 40.10% invalid inputs from direct generation. Figure 5 shows TrickCatcher produces zero false positives from invalid inputs whereas DPP has many", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "Diversity-driven differential testing outperforms majority voting for bug detection", 395 "evidence": "Table 2 ablation: filtered+basic+basic (pattern 2) vs filtered+basic+ours DT (pattern 3) improves F1 from 0.23 to 0.33 on TrickyBugs C++", 396 "supported": "strong" 397 }, 398 { 399 "claim": "TrickCatcher generates up to 16× fewer false positives than baselines on correct programs", 400 "evidence": "Figure 5 shows TrickCatcher max 5 FPs vs CHAT/DPP 26+ FPs. Ratio: 26/5 ≈ 5× minimum, up to 16× for specific configurations", 401 "supported": "strong" 402 }, 403 { 404 "claim": "Performance remains stable across different numbers of program variants (k=2 to k=10)", 405 "evidence": "Figure 6 shows TrickCatcher F1 stable 0.40-0.41 and precision 0.69-0.70 across k. DPP fluctuates 0.23-0.25 F1", 406 "supported": "strong" 407 }, 408 { 409 "claim": "TrickCatcher shows greater improvement on high-difficulty tasks than low-difficulty", 410 "evidence": "Figure 7-8: TrickCatcher median difficulty ~3500 for high-difficulty successes; DPP success median ~2200. Figure 8 shows TrickCatcher higher pass rate on hard tasks (base/extra)", 411 "supported": "moderate" 412 }, 413 { 414 "claim": "Buggy program variants can contribute meaningfully to bug detection", 415 "evidence": "Section 7.1: 23.2% (TrickyBugs) and 15.0% (EvalPlus) of useful variants are themselves buggy. 
TrickCatcher outperforms APR (repair-only) confirming non-repair detection", 416 "supported": "moderate" 417 } 418 ], 419 "methodology_tags": [ 420 "empirical", 421 "benchmark-eval", 422 "comparative" 423 ], 424 "key_findings": "TrickCatcher, an LLM-powered test generation method combining PUT-guided program variant generation, generator-based input generation, and diversity-driven differential testing, achieves up to 1.80× the recall, 2.65× the precision, and 1.66× the F1 score of the best state-of-the-art baseline on two datasets (TrickyBugs: 366 human-written programs; EvalPlus: 151 AI-generated programs). The approach generates up to 16× fewer false positives than baselines while maintaining stable performance across variant counts and showing larger gains on high-difficulty tasks. Ablation studies confirm each component contributes meaningfully, and the method generalizes to another LLM (deepseek-v3).", 425 "red_flags": [ 426 { 427 "flag": "No statistical significance testing", 428 "detail": "Reported improvements (1.80×, 2.65×) lack p-values or confidence intervals; results could reflect random variation rather than systematic advantage" 429 }, 430 { 431 "flag": "Variance not quantified", 432 "detail": "Appendix B describes repetition strategy (100 inputs, C(10,k) combinations) but results tables show only point estimates without standard deviations or confidence bounds" 433 }, 434 { 435 "flag": "Sample size not justified", 436 "detail": "No power analysis or justification provided for 366+151 programs; adequacy unclear for detecting true effect sizes" 437 }, 438 { 439 "flag": "LLM hyperparameters underspecified", 440 "detail": "Temperature, top-p, max_tokens, and other sampling parameters not reported; reproducibility compromised" 441 }, 442 { 443 "flag": "Limited generalization scope", 444 "detail": "Evaluation restricted to coding tasks on two benchmarks. 
Deepseek-v3 generalization test (Table 3) limited to one alternative model" 445 }, 446 { 447 "flag": "Data leakage not fully ruled out", 448 "detail": "While paper argues TrickyBugs post-dated training cutoff and EvalPlus prohibits training use, timing and enforcement not independently verified" 449 }, 450 { 451 "flag": "Manual validation required for TrickyBugs input validity", 452 "detail": "Input validity assessed by hand for TrickyBugs but automated for EvalPlus; introduces subjectivity and inconsistency" 453 }, 454 { 455 "flag": "Limitations section somewhat boilerplate", 456 "detail": "Budget constraint limiting model choice, LLM uncertainty mitigated by averaging, data leakage addressed by release dates—could be more concrete about residual threats" 457 }, 458 { 459 "flag": "Alternative explanations not explored", 460 "detail": "Paper compares against baselines but does not investigate why diversity-driven approach fundamentally works beyond empirical results" 461 }, 462 { 463 "flag": "No reproduction walkthrough in paper", 464 "detail": "Code available on GitHub but paper lacks step-by-step setup/execution instructions; readers must reverse-engineer from code" 465 } 466 ], 467 "cited_papers": [ 468 { 469 "title": "Nuances are the key: Unlocking ChatGPT to find failure-inducing tests with differential prompting", 470 "relevance": "Core prior work (Differential Prompting); paper explicitly builds on and improves this approach with three key modifications" 471 }, 472 { 473 "title": "TrickyBugs: A dataset of corner-case bugs in plausible programs", 474 "relevance": "Primary evaluation dataset; paper defines the 'plausible programs' and 'tricky bugs' problem" 475 }, 476 { 477 "title": "Is your code generated by ChatGPT really correct? 
Rigorous evaluation of large language models for code generation", 478 "relevance": "EvalPlus benchmark used for AI-generated code evaluation; defines base/extra test case split" 479 }, 480 { 481 "title": "Large language model-based agents for software engineering: A survey", 482 "relevance": "Surveys LLM applications in code generation and testing; contextualizes contribution in broader agent/code space" 483 }, 484 { 485 "title": "Who judges the judge: An empirical study on online judge tests", 486 "relevance": "Foundational work identifying prevalence of tricky bugs (3,440 bugs in online judge); motivates problem" 487 }, 488 { 489 "title": "Evaluating large language models trained on code", 490 "relevance": "Foundational LLM code generation benchmark; shows that LLMs struggle with complex tasks, motivating TrickCatcher's two-step input generation" 491 }, 492 { 493 "title": "EvoSuite: Automated unit test generation for object-oriented software", 494 "relevance": "Traditional search-based test generation baseline; contrasts with LLM-powered approach" 495 }, 496 { 497 "title": "KLEE: unassisted automatic generation of high-coverage tests", 498 "relevance": "Symbolic execution test generation tool; represents pre-LLM approach to test generation" 499 } 500 ], 501 "engagement_factors": { 502 "practical_relevance": { 503 "score": 3, 504 "justification": "TrickCatcher is immediately usable for finding real bugs in code; released on GitHub with full implementation; directly applicable to developer workflows" 505 }, 506 "surprise_contrarian": { 507 "score": 2, 508 "justification": "Diversity-driven testing over majority voting is counterintuitive; using buggy variants as oracle contributors is creative; but core idea (LLM-based test generation) not novel" 509 }, 510 "fear_safety": { 511 "score": 0, 512 "justification": "Bug detection is positive for code safety but paper is tool-focused, not a safety-risk paper; no AI alignment or security concerns raised" 513 }, 514 
"drama_conflict": { 515 "score": 1, 516 "justification": "Addresses tension between LLM-generated code quality and testing rigor; somewhat timely given AI code generation boom, but not a high-conflict angle" 517 }, 518 "demo_ability": { 519 "score": 2, 520 "justification": "Code is open-source and runnable; requires having target programs to test; not immediately demoable to a broad audience but doable for developers" 521 }, 522 "brand_recognition": { 523 "score": 2, 524 "justification": "Multiple top-tier institutions (Peking, NTU, KCL, UCL); published at ACL (prestigious NLP venue); not industry giants but credible academic pedigree" 525 } 526 }, 527 "hn_data": { 528 "threads": [ 529 { 530 "hn_id": "41319553", 531 "title": "First open source Legal AI retrieval benchmark for RAG finally released", 532 "points": 9, 533 "comments": 0, 534 "url": "https://news.ycombinator.com/item?id=41319553" 535 }, 536 { 537 "hn_id": "41663273", 538 "title": "Unsafe Impedance: Safe Languages and Safe by Design Software", 539 "points": 7, 540 "comments": 1, 541 "url": "https://news.ycombinator.com/item?id=41663273" 542 }, 543 { 544 "hn_id": "40209981", 545 "title": "Long-form music generation with latent diffusion", 546 "points": 3, 547 "comments": 0, 548 "url": "https://news.ycombinator.com/item?id=40209981" 549 }, 550 { 551 "hn_id": "39807740", 552 "title": "Perl: Parameter Efficient Reinforcement Learning from Human Feedback", 553 "points": 3, 554 "comments": 0, 555 "url": "https://news.ycombinator.com/item?id=39807740" 556 }, 557 { 558 "hn_id": "35687845", 559 "title": "Backporting RISC-V Vector assembly", 560 "points": 3, 561 "comments": 0, 562 "url": "https://news.ycombinator.com/item?id=35687845" 563 }, 564 { 565 "hn_id": "40122867", 566 "title": "Decentralized Trustless Bridge for Ethereum Full Node", 567 "points": 2, 568 "comments": 0, 569 "url": "https://news.ycombinator.com/item?id=40122867" 570 }, 571 { 572 "hn_id": "35676768", 573 "title": "The Law of Activity Delays", 
574 "points": 2, 575 "comments": 0, 576 "url": "https://news.ycombinator.com/item?id=35676768" 577 }, 578 { 579 "hn_id": "39180109", 580 "title": "Personality Inference via Mobile Phone Sensors: A Machine Learning Approach", 581 "points": 2, 582 "comments": 1, 583 "url": "https://news.ycombinator.com/item?id=39180109" 584 }, 585 { 586 "hn_id": "39202163", 587 "title": "Using LLM Such as ChatGPT for Designing and Implementing a RISC Processor", 588 "points": 2, 589 "comments": 0, 590 "url": "https://news.ycombinator.com/item?id=39202163" 591 }, 592 { 593 "hn_id": "40061342", 594 "title": "Long-form music generation with latent diffusion", 595 "points": 1, 596 "comments": 0, 597 "url": "https://news.ycombinator.com/item?id=40061342" 598 } 599 ], 600 "top_points": 9, 601 "total_points": 34, 602 "total_comments": 2 603 } 604 }