scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27156B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "LLM-Powered Test Case Generation for Detecting Bugs in Plausible Programs",
      6     "authors": [
      7       "Kaibo Liu",
      8       "Zhenpeng Chen",
      9       "Yiyang Liu",
     10       "Jie M. Zhang",
     11       "Mark Harman",
     12       "Yudong Han",
     13       "Yun Ma",
     14       "Yihong Dong",
     15       "Ge Li",
     16       "Gang Huang"
     17     ],
     18     "year": 2024,
     19     "venue": "Unknown",
     20     "arxiv_id": "2404.10304",
     21     "doi": "10.48550/arXiv.2404.10304"
     22   },
     23   "checklist": {
     24     "claims_and_evidence": {
     25       "abstract_claims_supported": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Abstract claims (1.80-2.65× performance improvements) are supported by Table 1, though multipliers cherry-pick best comparisons across datasets and k values. For C++, the best baseline result is at k=8 (F1=24.95%) and TrickCatcher's best is at k=10 (F1=41.31%), yielding the stated 1.66× improvement.",
     29         "source": "haiku"
     30       },
     31       "causal_claims_justified": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Empirical comparisons on fixed test sets with ablation study (Section 6.3, Table 2) decomposing component contributions. However, no true causal design (RCT); improvements are observational but well-supported by systematic ablation.",
     35         "source": "haiku"
     36       },
     37       "generalization_bounded": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Scope explicitly bounded to 'detecting bugs in plausible programs' on competitive programming and code generation benchmarks. Limited discussion of generalization beyond C++/Python or non-competitive domains.",
     41         "source": "haiku"
     42       },
     43       "alternative_explanations_discussed": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Ablation study systematically decomposes contributions of three components (Section 6.3). Section 7.1 discusses why buggy variants remain useful. However, broader competing hypotheses (e.g., LLM capability vs algorithmic innovations) are not discussed.",
     47         "source": "haiku"
     48       },
     49       "proxy_outcome_distinction": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Measurements directly align with claims: TP/FP/FN counts for bug detection map to Recall/Precision/F1. Section 3 and Appendix A formally define test case categories (Tc, Tr, Tw, Terr), matching measurement to claims.",
     53         "source": "haiku"
     54       }
     55     },
     56     "limitations_and_scope": {
     57       "limitations_section_present": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Section 8 'Limitations' explicitly lists three limitations: budget constraints on model choice, LLM uncertainty (mitigated by repetition), and data leakage risk.",
     61         "source": "haiku"
     62       },
     63       "threats_to_validity_specific": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Limitations address specific threats (TrickyBugs released after model cutoff, EvalPlus prohibits training use, poor baseline performance suggests no leakage). However, domain generalization (competitive programming → production code) and language specificity are unaddressed.",
     67         "source": "haiku"
     68       },
     69       "scope_boundaries_stated": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Scope bounded to 'plausible programs' on two specific benchmarks. Explicit boundaries on LOC, program types, or production applicability are not stated; scope is implicit through datasets.",
     73         "source": "haiku"
     74       }
     75     },
     76     "conflicts_of_interest": {
     77       "funding_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Funding sources explicitly listed: National Key R&D Program, NSF China, Hubei Province, InnovateUK. Jie M. Zhang's funding separately noted.",
     81         "source": "haiku"
     82       },
     83       "affiliations_disclosed": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "All author affiliations listed (Peking Univ, NTU, KCL, UCL). No apparent financial interest in evaluated tools (gpt-3.5-turbo, deepseek-v3, or baselines).",
     87         "source": "haiku"
     88       },
     89       "funder_independent_of_outcome": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Funders are government research agencies (National Key R&D Program, NSF China, Hubei Province, InnovateUK), independent of commercial LLM providers or tools evaluated.",
     93         "source": "haiku"
     94       },
     95       "financial_interests_declared": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No explicit competing interests statement provided. No apparent financial stake in outcomes (neither TrickCatcher commercialized nor authors have equity in compared tools).",
     99         "source": "haiku"
    100       }
    101     },
    102     "scope_and_framing": {
    103       "key_terms_defined": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Key terms formally defined in Section 3: 'plausible programs' (pass all test cases), 'tricky bugs' (escape existing tests), 'test oracle' (expected output), 'differential testing' (Section 4.3). Domain-specific terms (program variant, false positive) also defined.",
    107         "source": "haiku"
    108       },
    109       "intended_contribution_clear": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Three contributions explicitly stated in abstract and Section 4: (1) PUT-guided program variant generation, (2) generator-based test input generation, (3) diversity-driven differential testing. Each step explained with rationale.",
    113         "source": "haiku"
    114       },
    115       "engagement_with_prior_work": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Section 2 systematically reviews traditional test generation (EvoSuite, KLEE) and LLM-based approaches (ChatTester, TestPilot). Differential Prompting (DP) compared directly with 4 specific differences outlined, showing incremental novelty.",
    119         "source": "haiku"
    120       }
    121     }
    122   },
    123   "type_checklist": {
    124     "empirical": {
    125       "artifacts": {
    126         "code_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Abstract states 'Code and data used are available at https://github.com/RinCloud/TrickCatcher'. Code is publicly released under a repository.",
    130           "source": "haiku"
    131         },
    132         "data_released": {
    133           "applies": true,
    134           "answer": true,
    135           "justification": "Uses two public datasets (TrickyBugs, EvalPlus) with stated licenses (MIT, Apache 2.0). GitHub repository includes references/links to data sources.",
    136           "source": "haiku"
    137         },
    138         "environment_specified": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Model versions (gpt-3.5-turbo-0125, deepseek-v3) are specified but no Python version, dependency list (requirements.txt), or Docker environment provided in the paper.",
    142           "source": "haiku"
    143         },
    144         "reproduction_instructions": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "No step-by-step reproduction instructions in paper. GitHub link is provided but paper lacks detailed commands/workflows to replicate results.",
    148           "source": "haiku"
    149         }
    150       },
    151       "statistical_methodology": {
    152         "confidence_intervals_or_error_bars": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "Tables 1-3 report only point estimates (average precision/recall/F1). Appendix B describes repetition methodology but no variance/CIs are shown in results tables.",
    156           "source": "haiku"
    157         },
    158         "significance_tests": {
    159           "applies": true,
    160           "answer": false,
    161           "justification": "No statistical significance tests (t-tests, paired comparisons, bootstrap) reported between TrickCatcher and baselines despite multiple runs.",
    162           "source": "haiku"
    163         },
    164         "effect_sizes_reported": {
    165           "applies": true,
    166           "answer": true,
    167           "justification": "Effect sizes reported as multipliers (1.80×, 1.66×) and percentage improvements in Table 1. Absolute improvements also shown (F1: 41.31% vs 24.95%).",
    168           "source": "haiku"
    169         },
    170         "sample_size_justified": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "Sample sizes (366 + 151 = 517 programs) are stated but not justified. No power analysis or sample size rationale provided.",
    174           "source": "haiku"
    175         },
    176         "variance_reported": {
    177           "applies": true,
    178           "answer": false,
    179           "justification": "Results show only averages across multiple runs; standard deviations, ranges, or confidence intervals not reported in main tables.",
    180           "source": "haiku"
    181         }
    182       },
    183       "evaluation_design": {
    184         "baselines_included": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Three baselines compared: DirectChat (CHAT), Automated Program Repair (APR), and Differential Prompting Plus (DPP). All evaluated on same datasets with same metrics.",
    188           "source": "haiku"
    189         },
    190         "baselines_contemporary": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "DPP based on Li et al. (2023) ICSE work. DirectChat is custom baseline. APR is their own implementation. Baselines are appropriate for 2024 publication.",
    194           "source": "haiku"
    195         },
    196         "ablation_study": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Table 2 (RQ3) systematically ablates three components (PUT-guided generation, generator-based input, diversity-driven testing) across 6 patterns, showing contribution of each.",
    200           "source": "haiku"
    201         },
    202         "multiple_metrics": {
    203           "applies": true,
    204           "answer": true,
    205           "justification": "Recall, Precision, and F1 score reported. RQ2 also breaks FPs by type (invalid inputs vs incorrect oracles). Multiple evaluation perspectives provided.",
    206           "source": "haiku"
    207         },
    208         "human_evaluation": {
    209           "applies": false,
    210           "answer": false,
    211           "justification": "No human evaluation of system outputs. Manual verification of input validity (TrickyBugs) is mentioned (Section 5.3) but inter-rater reliability not reported.",
    212           "source": "haiku"
    213         },
    214         "held_out_test_set": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Evaluation on separate programs from TrickyBugs and EvalPlus. Each program is a held-out test case; approach is evaluated on new programs not seen during training.",
    218           "source": "haiku"
    219         },
    220         "per_category_breakdown": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Results broken down by C++/Python (TrickyBugs) vs EvalPlus. RQ5 provides difficulty-based breakdown (low vs high). Limited per-bug-type or per-feature breakdown.",
    224           "source": "haiku"
    225         },
    226         "failure_cases_discussed": {
    227           "applies": true,
    228           "answer": false,
    229           "justification": "Negative results included (e.g., precision lower on EvalPlus vs DPP in Table 1). No systematic failure case analysis or examples of when TrickCatcher underperforms.",
    230           "source": "haiku"
    231         },
    232         "negative_results_reported": {
    233           "applies": true,
    234           "answer": true,
    235           "justification": "Table 1 shows cases where TrickCatcher underperforms: EvalPlus precision 83.14% vs DPP 90.36% (7.22 percentage points, i.e. 7.99% relative ↓). Negative results transparently included.",
    236           "source": "haiku"
    237         }
    238       },
    239       "setup_transparency": {
    240         "model_versions_specified": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Section 5.5 specifies 'gpt-3.5-turbo-0125' with version date. Table 3 uses 'deepseek-v3'. Exact model snapshots are provided.",
    244           "source": "haiku"
    245         },
    246         "prompts_provided": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Figures 3 and 4 show complete prompts used for program variant and input generator generation. Prompts are actual instructions, not templates.",
    250           "source": "haiku"
    251         },
    252         "hyperparameters_reported": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "Primary hyperparameter k (number of program variants) is systematically varied (2-10) in experiments. LLM temperature/top-p not reported; CYaRon library mentioned but no library-specific parameters detailed.",
    256           "source": "haiku"
    257         },
    258         "scaffolding_described": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Section 4 explains three-step pipeline with prompts and library usage (CYaRon) documented. Few-shot examples provided to LLM as described in Figure 4.",
    262           "source": "haiku"
    263         },
    264         "data_preprocessing_documented": {
    265           "applies": true,
    266           "answer": true,
    267           "justification": "Filtering steps documented: variants filtered by existing test suite (Section 4.1), EvalPlus programs filtered to retain buggy plausible ones (Section 5.2). Process could be more detailed.",
    268           "source": "haiku"
    269         }
    270       },
    271       "data_integrity": {
    272         "raw_data_available": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "TrickyBugs and EvalPlus are public datasets; GitHub repository should provide access. Canonical programs and test cases available through dataset releases.",
    276           "source": "haiku"
    277         },
    278         "data_collection_described": {
    279           "applies": true,
    280           "answer": true,
    281           "justification": "TrickyBugs sourced from online judge platform with real participant submissions. EvalPlus derived from code generation benchmark. Collection methods described at high level; detailed methodology sparse.",
    282           "source": "haiku"
    283         },
    284         "recruitment_methods_described": {
    285           "applies": false,
    286           "answer": false,
    287           "justification": "Not applicable — no human participants. Evaluation uses existing program datasets, not human-recruited data.",
    288           "source": "haiku"
    289         },
    290         "data_pipeline_documented": {
    291           "applies": true,
    292           "answer": true,
    293           "justification": "Selection process documented: 251 C++, 115 Python from TrickyBugs; 151 buggy plausible programs filtered from EvalPlus (pass base, fail extra test cases). Full pipeline detail could be more thorough.",
    294           "source": "haiku"
    295         }
    296       },
    297       "contamination": {
    298         "training_cutoff_stated": {
    299           "applies": true,
    300           "answer": true,
    301           "justification": "Model version gpt-3.5-turbo-0125 specified with implicit cutoff. Limitations section explicitly states TrickyBugs released *after* this model's training cutoff.",
    302           "source": "haiku"
    303         },
    304         "train_test_overlap_discussed": {
    305           "applies": true,
    306           "answer": true,
    307           "justification": "EvalPlus explicitly prohibits training use (Limitations). TrickyBugs has no contamination risk as it was released after the model's training cutoff. Overlap thoroughly addressed.",
    308           "source": "haiku"
    309         },
    310         "benchmark_contamination_addressed": {
    311           "applies": true,
    312           "answer": true,
    313           "justification": "Both datasets cleared for use: TrickyBugs released after gpt-3.5-turbo-0125; EvalPlus prohibits training. Paper adds: poor baseline performance further argues against leakage.",
    314           "source": "haiku"
    315         }
    316       },
    317       "human_studies": {
    318         "applies": false
    319       },
    320       "cost_and_practicality": {
    321         "inference_cost_reported": {
    322           "applies": true,
    323           "answer": false,
    324           "justification": "No inference cost, latency, or API pricing reported. Paper mentions balancing 'performance and cost' for model selection but provides no actual figures.",
    325           "source": "haiku"
    326         },
    327         "compute_budget_stated": {
    328           "applies": true,
    329           "answer": false,
    330           "justification": "No total computational budget provided (e.g., number of API calls, GPU hours, total cost). Large-scale experiments on 517 programs suggest significant cost but not quantified.",
    331           "source": "haiku"
    332         }
    333       }
    334     }
    335   },
    336   "claims": [
    337     {
    338       "claim": "TrickCatcher achieves 1.80× recall, 2.65× precision, and 1.66× F1 compared to state-of-the-art baseline DPP",
    339       "evidence": "Table 1 detailed comparisons on TrickyBugs (C++, Python) and EvalPlus. Best vs. Best row shows multipliers across datasets (1.80× for C++ recall at k=10 vs DPP k=8).",
    340       "supported": "strong"
    341     },
    342     {
    343       "claim": "Generator-based input generation eliminates false positives from invalid inputs",
    344       "evidence": "Section 4.2 method description; RQ2 results (Figure 5) show TrickCatcher produces 0 FPs from invalid inputs vs 26+ for baselines on EvalPlus.",
    345       "supported": "strong"
    346     },
    347     {
    348       "claim": "PUT-guided program variant generation produces higher-quality variants than specification-only generation",
    349       "evidence": "Section 4.1 rationale; ablation study Table 2 Pattern 3 (Filtered baseline) vs Pattern 5 (Ours generator): 38% vs 38% F1 (no improvement shown for just filtering; improvement comes with input generation).",
    350       "supported": "moderate"
    351     },
    352     {
    353       "claim": "Diversity-driven differential testing outperforms majority voting",
    354       "evidence": "Section 4.3 algorithm; ablation Table 2 Pattern 2 (Basic DT) vs Pattern 3 (Our DT): 25% vs 33% F1 on C++ dataset.",
    355       "supported": "moderate"
    356     },
    357     {
    358       "claim": "TrickCatcher generalizes across different LLMs (gpt-3.5-turbo and deepseek-v3)",
    359       "evidence": "Table 3 shows deepseek-v3 achieves 59.54% F1 vs gpt-3.5-turbo's 51.34% F1 on EvalPlus; both perform well relative to baselines.",
    360       "supported": "moderate"
    361     },
    362     {
    363       "claim": "Buggy program variants can be useful for test generation despite containing errors",
    364       "evidence": "Section 7.1 reports 23.2% (TrickyBugs) and 15.0% (EvalPlus) of useful variants are buggy; Table 1 shows TC outperforms APR despite APR's focus on correct repairs.",
    365       "supported": "moderate"
    366     },
    367     {
    368       "claim": "TrickCatcher is more effective on harder coding tasks than DPP",
    369       "evidence": "RQ5 (Figures 7-8) shows TC outperforms DPP more dramatically on high-difficulty tasks; variant quality (Figure 8 right) is notably better for TC on harder problems.",
    370       "supported": "moderate"
    371     }
    372   ],
    373   "methodology_tags": [
    374     "benchmark-eval",
    375     "empirical"
    376   ],
    377   "key_findings": "TrickCatcher, an LLM-powered approach combining three components (PUT-guided variant generation, generator-based input generation, and diversity-driven differential testing), effectively detects bugs in programs that pass existing test suites. Evaluated on 517 programs (TrickyBugs: 366 human-written, EvalPlus: 151 AI-generated), TrickCatcher achieves F1 scores of 41.31-51.34% versus 24.95-35.76% for the best baseline, with up to 16× fewer false positives. The method scales robustly across different numbers of program variants and different LLMs, with particular advantages on harder programming tasks.",
    378   "red_flags": [
    379     {
    380       "flag": "No statistical significance testing",
    381       "detail": "Results report point estimates only; no confidence intervals, standard deviations, or significance tests despite multiple runs. Unclear whether observed improvements are statistically reliable."
    382     },
    383     {
    384       "flag": "Precision trade-off on EvalPlus",
    385       "detail": "TrickCatcher precision 83.14% vs DPP 90.36% on EvalPlus (7.22 percentage points, i.e. 7.99% relative, lower), undermining practical advantage despite better F1. High false positive rates could reduce adoption."
    386     },
    387     {
    388       "flag": "Domain generalization limited",
    389       "detail": "Evaluation restricted to competitive programming (TrickyBugs) and code generation benchmarks (EvalPlus). Generalization to production code, real-world bugs, or different programming paradigms unexamined."
    390     },
    391     {
    392       "flag": "Computational cost not quantified",
    393       "detail": "No reporting of API costs, model latency, or efficiency analysis. Large-scale experiments (517 programs × variants × inputs) suggest significant cost but unspecified."
    394     },
    395     {
    396       "flag": "Manual verification methodology sparse",
    397       "detail": "TrickyBugs input validity 'manually verified' but inter-rater reliability, verification protocol, and number of verifiers not reported; potential for inconsistency."
    398     },
    399     {
    400       "flag": "Failure mode analysis absent",
    401       "detail": "While negative results shown in tables, no systematic analysis of failure cases, error patterns, or when approach underperforms relative to baselines."
    402     },
    403     {
    404       "flag": "LLM hyperparameters underspecified",
    405       "detail": "Temperature, top-p, max_tokens not reported for LLM calls; CYaRon library usage mentioned but no parameter details. Reproducibility of exact outputs limited."
    406     }
    407   ],
    408   "cited_papers": [
    409     {
    410       "title": "Evaluating large language models trained on code",
    411       "relevance": "Foundational work establishing LLM code evaluation methodology; Chen et al. 2021 establishes baseline for code generation capability assessment."
    412     },
    413     {
    414       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    415       "relevance": "Introduces EvalPlus benchmark; Liu et al. 2023 addresses rigorous evaluation of AI-generated code correctness, central to TrickCatcher's evaluation strategy."
    416     },
    417     {
    418       "title": "TrickyBugs: A dataset of corner-case bugs in plausible programs",
    419       "relevance": "Creates the TrickyBugs dataset used for main evaluation; Liu et al. 2024 defines the problem domain (plausible programs with escaping bugs)."
    420     },
    421     {
    422       "title": "Nuances are the key: Unlocking ChatGPT to find failure-inducing tests with differential prompting",
    423       "relevance": "State-of-the-art baseline method (Differential Prompting); Li et al. 2023 ICSE represents prior work on LLM-based bug detection via test generation."
    424     },
    425     {
    426       "title": "Large language model-based agents for software engineering: A survey",
    427       "relevance": "Surveys LLM agents for SE tasks including testing; Liu et al. 2024 provides broader context for LLM testing applications."
    428     },
    429     {
    430       "title": "Who judges the judge: An empirical study on online judge tests",
    431       "relevance": "Empirical study of test suite quality on online judge platforms (source of TrickyBugs); Liu et al. 2023 establishes prevalence of plausible programs."
    432     },
    433     {
    434       "title": "The counterfeit conundrum: Can code language models grasp the nuances of their incorrect generations?",
    435       "relevance": "Studies whether LLMs understand their own generated code errors; Gu et al. 2024 relevant to understanding LLM capability in test case generation."
    436     }
    437   ],
    438   "engagement_factors": {
    439     "practical_relevance": {
    440       "score": 2,
    441       "justification": "Tool is released with code/data, but requires LLM API access (gpt-3.5-turbo) and is demonstrated only on competitive programming. Production code applicability unclear."
    442     },
    443     "surprise_contrarian": {
    444       "score": 2,
    445       "justification": "Finding that buggy variants remain useful (Section 7.1) is interesting but not deeply surprising. LLM-based testing with differential testing is predictable evolution of prior work."
    446     },
    447     "fear_safety": {
    448       "score": 1,
    449       "justification": "Paper addresses AI safety (finding bugs in AI-generated code) rather than raising concerns. Limited novelty in safety implications; framed as defensive tool."
    450     },
    451     "demo_ability": {
    452       "score": 2,
    453       "justification": "Code released on GitHub with public datasets (TrickyBugs, EvalPlus). Requires API access to gpt-3.5-turbo. Reproducibility possible but non-trivial; live demo unavailable."
    454     },
    455     "brand_recognition": {
    456       "score": 2,
    457       "justification": "Authors from well-known institutions (Peking, NTU, KCL, UCL) including Mark Harman (prominent in SE). Venue not explicitly stated; appears to be a strong conference."
    458     },
    459     "drama_conflict": {
    460       "score": 1,
    461       "justification": "Solid technical contribution with positive results but no controversial findings, dramatic claims, or conflict angle. Straightforward methodological advance."
    462     }
    463   },
    464   "hn_data": {
    465     "threads": [
    466       {
    467         "hn_id": "41319553",
    468         "title": "First open source Legal AI retrieval benchmark for RAG finally released",
    469         "points": 9,
    470         "comments": 0,
    471         "url": "https://news.ycombinator.com/item?id=41319553"
    472       },
    473       {
    474         "hn_id": "41663273",
    475         "title": "Unsafe Impedance: Safe Languages and Safe by Design Software",
    476         "points": 7,
    477         "comments": 1,
    478         "url": "https://news.ycombinator.com/item?id=41663273"
    479       },
    480       {
    481         "hn_id": "40209981",
    482         "title": "Long-form music generation with latent diffusion",
    483         "points": 3,
    484         "comments": 0,
    485         "url": "https://news.ycombinator.com/item?id=40209981"
    486       },
    487       {
    488         "hn_id": "39807740",
    489         "title": "Perl: Parameter Efficient Reinforcement Learning from Human Feedback",
    490         "points": 3,
    491         "comments": 0,
    492         "url": "https://news.ycombinator.com/item?id=39807740"
    493       },
    494       {
    495         "hn_id": "35687845",
    496         "title": "Backporting RISC-V Vector assembly",
    497         "points": 3,
    498         "comments": 0,
    499         "url": "https://news.ycombinator.com/item?id=35687845"
    500       },
    501       {
    502         "hn_id": "40122867",
    503         "title": "Decentralized Trustless Bridge for Ethereum Full Node",
    504         "points": 2,
    505         "comments": 0,
    506         "url": "https://news.ycombinator.com/item?id=40122867"
    507       },
    508       {
    509         "hn_id": "35676768",
    510         "title": "The Law of Activity Delays",
    511         "points": 2,
    512         "comments": 0,
    513         "url": "https://news.ycombinator.com/item?id=35676768"
    514       },
    515       {
    516         "hn_id": "39180109",
    517         "title": "Personality Inference via Mobile Phone Sensors: A Machine Learning Approach",
    518         "points": 2,
    519         "comments": 1,
    520         "url": "https://news.ycombinator.com/item?id=39180109"
    521       },
    522       {
    523         "hn_id": "39202163",
    524         "title": "Using LLM Such as ChatGPT for Designing and Implementing a RISC Processor",
    525         "points": 2,
    526         "comments": 0,
    527         "url": "https://news.ycombinator.com/item?id=39202163"
    528       },
    529       {
    530         "hn_id": "40061342",
    531         "title": "Long-form music generation with latent diffusion",
    532         "points": 1,
    533         "comments": 0,
    534         "url": "https://news.ycombinator.com/item?id=40061342"
    535       }
    536     ],
    537     "top_points": 9,
    538     "total_points": 34,
    539     "total_comments": 2
    540   }
    541 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs