scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28634B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "LLM-Powered Test Case Generation for Detecting Bugs in Plausible Programs",
      6     "authors": [
      7       "Kaibo Liu",
      8       "Zhenpeng Chen",
      9       "Yiyang Liu",
     10       "Jie M. Zhang",
     11       "Mark Harman",
     12       "Yudong Han",
     13       "Yun Ma",
     14       "Yihong Dong",
     15       "Ge Li",
     16       "Gang Huang"
     17     ],
     18     "year": 2024,
     19     "venue": "ACL 2025 (Annual Meeting of the Association for Computational Linguistics)",
     20     "arxiv_id": "2404.10304",
     21     "doi": "10.18653/v1/2025.acl-long.20"
     22   },
     23   "checklist": {
     24     "claims_and_evidence": {
     25       "abstract_claims_supported": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "All abstract claims (1.80×/2.65×/1.66× improvements, three-stage pipeline, evaluation on two datasets) are directly supported by Table 1 and Sections 4–6.",
     29         "source": "haiku"
     30       },
     31       "causal_claims_justified": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Comparative claims are justified by controlled experiments with same datasets/baselines. Ablation study (Section 6.3, Table 2) establishes causal contribution of each component.",
     35         "source": "haiku"
     36       },
     37       "generalization_bounded": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Evaluation bounded to coding task datasets (TrickyBugs, EvalPlus). Generalization test on deepseek-v3 (Section 7.2) is limited. Scope appropriately bounded.",
     41         "source": "haiku"
     42       },
     43       "alternative_explanations_discussed": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Paper does not explore alternative explanations for TrickCatcher's superior performance beyond baseline comparisons. Section 7.1 discusses buggy variant usefulness but lacks depth on why diversity-driven approach fundamentally works better.",
     47         "source": "haiku"
     48       },
     49       "proxy_outcome_distinction": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Measured outcomes (recall, precision, F1, TP/FP counts) directly align with claimed outcome (detecting bugs in plausible programs). True/false positive distinction is clearly defined in Section 3.",
     53         "source": "haiku"
     54       }
     55     },
     56     "limitations_and_scope": {
     57       "limitations_section_present": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Dedicated 'Limitations' section before Acknowledgements lists three specific limitations (model budget constraints, LLM uncertainty, data leakage risk).",
     61         "source": "haiku"
     62       },
     63       "threats_to_validity_specific": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Budget forced use of gpt-3.5-turbo instead of stronger models. Acknowledged uncertainty mitigated by multiple runs (Appendix B). Data leakage addressed by noting dataset release dates.",
     67         "source": "haiku"
     68       },
     69       "scope_boundaries_stated": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Scope to small coding tasks on plausible programs is implicit in experimental setup. Limitations section could be more explicit about non-applicability to other domains, but boundaries are clear from context.",
     73         "source": "haiku"
     74       }
     75     },
     76     "conflicts_of_interest": {
     77       "funding_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Acknowledgements fully disclose funding from National Key R&D Program (Grant No. 2023YFB4503801), NSFC (Grants 62192733, 62192730), Hubei Province, and InnovateUK.",
     81         "source": "haiku"
     82       },
     83       "affiliations_disclosed": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "All author affiliations listed: Peking University, Nanyang Technological University, King's College London, University College London, National Key Laboratory of Data Space Technology.",
     87         "source": "haiku"
     88       },
     89       "funder_independent_of_outcome": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Funders (NSFC, government grants, InnovateUK) are independent public bodies, not companies making the LLMs (OpenAI, DeepSeek) being evaluated.",
     93         "source": "haiku"
     94       },
     95       "financial_interests_declared": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No competing interests statement or financial interest declaration (patents, equity, consulting) present in the paper.",
     99         "source": "haiku"
    100       }
    101     },
    102     "scope_and_framing": {
    103       "key_terms_defined": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Paper defines: 'plausible programs' (Section 3), 'tricky bugs', 'test oracle', 'differential testing', and problem statement clearly framed in Section 3.",
    107         "source": "haiku"
    108       },
    109       "intended_contribution_clear": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Three novel contributions explicitly stated: PUT-guided program variant generation, generator-based input generation, diversity-driven differential testing. Novelty vs. prior work (esp. Differential Prompting) clearly delineated.",
    113         "source": "haiku"
    114       },
    115       "engagement_with_prior_work": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Section 2 surveys traditional test generation, LLM-based approaches, and explicitly compares TrickCatcher against Differential Prompting with detailed distinction in 4 points (Section 2).",
    119         "source": "haiku"
    120       }
    121     }
    122   },
    123   "type_checklist": {
    124     "empirical": {
    125       "artifacts": {
    126         "code_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Abstract states 'Code and data used are available at https://github.com/RinCloud/TrickCatcher'. GitHub repository is publicly accessible.",
    130           "source": "haiku"
    131         },
    132         "data_released": {
    133           "applies": true,
    134           "answer": true,
    135           "justification": "Both TrickyBugs and EvalPlus datasets are publicly available from prior work (Liu et al. 2024b, Liu et al. 2023a). Paper uses existing public benchmarks.",
    136           "source": "haiku"
    137         },
    138         "environment_specified": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "LLM versions specified (gpt-3.5-turbo-0125, deepseek-v3) but no Docker/requirements.txt, Python version, or dependency specification provided.",
    142           "source": "haiku"
    143         },
    144         "reproduction_instructions": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "No step-by-step reproduction walkthrough in paper. Code available on GitHub but paper does not include command-line examples or setup instructions.",
    148           "source": "haiku"
    149         }
    150       },
    151       "statistical_methodology": {
    152         "confidence_intervals_or_error_bars": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "Table 1, Figures 5-8 report point estimates (recall, precision, F1) without confidence intervals, standard deviations, or error bars. Multiple runs mentioned (Appendix B) but variance not quantified.",
    156           "source": "haiku"
    157         },
    158         "significance_tests": {
    159           "applies": true,
    160           "answer": false,
    161           "justification": "No statistical significance tests (t-test, ANOVA, etc.) reported. Claims like 'up to 1.80×' are not tested for statistical significance.",
    162           "source": "haiku"
    163         },
    164         "effect_sizes_reported": {
    165           "applies": true,
    166           "answer": true,
    167           "justification": "Effect sizes reported as F1 scores (e.g., 41.31% vs 24.95%), improvement ratios (1.80×, 2.65×, 1.66×), and absolute point estimates in Table 1.",
    168           "source": "haiku"
    169         },
    170         "sample_size_justified": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "TrickyBugs (366) and EvalPlus (151) datasets used, but no power analysis, sample size justification, or discussion of adequacy provided.",
    174           "source": "haiku"
    175         },
    176         "variance_reported": {
    177           "applies": true,
    178           "answer": false,
    179           "justification": "Appendix B describes repetition strategy (100 inputs, 10 variants, combinatorial sampling) but results tables and figures do not report standard deviations, confidence intervals, or variance metrics.",
    180           "source": "haiku"
    181         }
    182       },
    183       "evaluation_design": {
    184         "baselines_included": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Three baselines: DirectChat (CHAT), Differential Prompting Plus (DPP), Automated Program Repair (APR). Each evaluated in Table 1.",
    188           "source": "haiku"
    189         },
    190         "baselines_contemporary": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "DPP from Li et al. 2023, other LLM-based methods from 2024. Baselines are relatively recent and representative of state-of-the-art.",
    194           "source": "haiku"
    195         },
    196         "ablation_study": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Section 6.3 provides Table 2 ablation study testing 6 patterns systematically removing/adding components: program generation, input generation, differential testing.",
    200           "source": "haiku"
    201         },
    202         "multiple_metrics": {
    203           "applies": true,
    204           "answer": true,
    205           "justification": "Primary metrics: recall, precision, F1 score. Secondary: TP, FP, FN counts, passing rates on base/extra test cases (Figure 8), task difficulty (Figure 7).",
    206           "source": "haiku"
    207         },
    208         "human_evaluation": {
    209           "applies": false,
    210           "answer": false,
    211           "justification": "Not applicable. TrickyBugs input validity manually verified (mentioned in Section 5.3) but this is validation not human evaluation of system outputs.",
    212           "source": "haiku"
    213         },
    214         "held_out_test_set": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Both datasets have held-out test cases: TrickyBugs separates existing test suite from additional bug-revealing tests; EvalPlus has base vs extra test cases.",
    218           "source": "haiku"
    219         },
    220         "per_category_breakdown": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Results broken down by: dataset type (C++/Python), difficulty (low/high in Figure 7-8), and shown across multiple k values in Table 1 and Figure 6.",
    224           "source": "haiku"
    225         },
    226         "failure_cases_discussed": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Figure 5 shows false positives across methods. Section 7.1 discusses buggy variants that still contribute. Some analysis of when/why methods fail but limited depth.",
    230           "source": "haiku"
    231         },
    232         "negative_results_reported": {
    233           "applies": true,
    234           "answer": true,
    235           "justification": "Improvements on TrickyBugs (Python) for recall are modest (2.01% average). Precision sometimes lower than DPP worst case. Limitations section acknowledges constraints.",
    236           "source": "haiku"
    237         }
    238       },
    239       "setup_transparency": {
    240         "model_versions_specified": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Specific versions: gpt-3.5-turbo-0125 (Section 5.5), deepseek-v3 (Section 7.2) with exact model IDs provided.",
    244           "source": "haiku"
    245         },
    246         "prompts_provided": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Figure 3 shows exact prompt for program variant generation. Figure 4 shows exact prompt for input generator creation. Both complete and unambiguous.",
    250           "source": "haiku"
    251         },
    252         "hyperparameters_reported": {
    253           "applies": true,
    254           "answer": false,
    255           "justification": "LLM hyperparameters (temperature, top-p, max_tokens, etc.) not reported. Only model names given. Algorithm 1 specifies differential testing logic but not LLM sampling params.",
    256           "source": "haiku"
    257         },
    258         "scaffolding_described": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Three-stage pipeline clearly described: Section 4.1 (program variant generation), 4.2 (test input generation), 4.3 (differential testing). Algorithm 1 provides pseudocode.",
    262           "source": "haiku"
    263         },
    264         "data_preprocessing_documented": {
    265           "applies": true,
    266           "answer": true,
    267           "justification": "EvalPlus filtering described: select AI-generated samples passing base but failing extra tests. TrickyBugs filtering by test suite mentioned. Program variant filtering after generation documented.",
    268           "source": "haiku"
    269         }
    270       },
    271       "data_integrity": {
    272         "raw_data_available": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Both datasets (TrickyBugs, EvalPlus) are publicly available from prior work. Authors provide code repository with evaluation scripts.",
    276           "source": "haiku"
    277         },
    278         "data_collection_described": {
    279           "applies": true,
    280           "answer": true,
    281           "justification": "TrickyBugs collection described in Liu et al. 2024b (online judge platform submissions). EvalPlus from Liu et al. 2023a (code generation benchmark). Referenced, not collected by this paper.",
    282           "source": "haiku"
    283         },
    284         "recruitment_methods_described": {
    285           "applies": false,
    286           "answer": false,
    287           "justification": "Not applicable. No human participant recruitment—benchmarks are programming tasks from online judges and code generation datasets.",
    288           "source": "haiku"
    289         },
    290         "data_pipeline_documented": {
    291           "applies": true,
    292           "answer": true,
    293           "justification": "Pipeline documented: datasets → filter variants by existing tests → generate inputs → execute variants → compare outputs → collect TP/FP. Described in Section 4 and Figure 2.",
    294           "source": "haiku"
    295         }
    296       },
    297       "contamination": {
    298         "training_cutoff_stated": {
    299           "applies": true,
    300           "answer": true,
    301           "justification": "Training cutoff addressed: gpt-3.5-turbo-0125 release date noted. TrickyBugs released after model cutoff. EvalPlus prohibits training use. Discussed in Limitations.",
    302           "source": "haiku"
    303         },
    304         "train_test_overlap_discussed": {
    305           "applies": true,
    306           "answer": true,
    307           "justification": "Data leakage discussed in Limitations: TrickyBugs released after gpt-3.5-turbo training, EvalPlus prohibits training use. Poor baseline performance argues against major leakage.",
    308           "source": "haiku"
    309         },
    310         "benchmark_contamination_addressed": {
    311           "applies": true,
    312           "answer": true,
    313           "justification": "Contamination addressed through dataset release dates and explicit prohibition. Conversely, weak LLM baselines (Table 1) suggest benchmarks not memorized.",
    314           "source": "haiku"
    315         }
    316       },
    317       "human_studies": {
    318         "pre_registered": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants. Not applicable.",
    322           "source": "haiku"
    323         },
    324         "irb_or_ethics_approval": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human subjects. Not applicable.",
    328           "source": "haiku"
    329         },
    330         "demographics_reported": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants. Not applicable.",
    334           "source": "haiku"
    335         },
    336         "inclusion_exclusion_criteria": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human subjects. Not applicable.",
    340           "source": "haiku"
    341         },
    342         "randomization_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants. Randomization used for input selection/variant sampling (Appendix B) but not human randomization.",
    346           "source": "haiku"
    347         },
    348         "blinding_described": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human subjects. Not applicable.",
    352           "source": "haiku"
    353         },
    354         "attrition_reported": {
    355           "applies": false,
    356           "answer": false,
    357           "justification": "No human participants. Not applicable.",
    358           "source": "haiku"
    359         }
    360       },
    361       "cost_and_practicality": {
    362         "inference_cost_reported": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Budget constraints mentioned as motivation for using gpt-3.5-turbo (Section 5.5) but no actual inference costs, latency, or cost per program reported.",
    366           "source": "haiku"
    367         },
    368         "compute_budget_stated": {
    369           "applies": true,
    370           "answer": false,
    371           "justification": "Paper mentions 'budget constraints' limited model choice but does not quantify total computational budget or cost.",
    372           "source": "haiku"
    373         }
    374       }
    375     }
    376   },
    377   "claims": [
    378     {
    379       "claim": "TrickCatcher achieves 1.80× recall, 2.65× precision, 1.66× F1 score compared to best baseline (DPP)",
    380       "evidence": "Table 1 reports F1 scores: TrickCatcher 41.31–51.34%, DPP 24.95–35.76% across three dataset configurations (TrickyBugs C++, TrickyBugs Python, EvalPlus)",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "PUT-guided program generation produces higher-quality variants than specification-only generation",
    385       "evidence": "Ablation study (Table 2): filtering+basic IG+ours DT (pattern 3) achieves 0.33 F1 vs filtered+basic (pattern 2) 0.23 F1; patterns 5-6 with full approach reach 0.37-0.41",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Generator-based input generation achieves higher validity than direct LLM generation",
    390       "evidence": "Introduction mentions 40.10% invalid inputs from direct generation. Figure 5 shows TrickCatcher produces zero false positives from invalid inputs whereas DPP has many",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Diversity-driven differential testing outperforms majority voting for bug detection",
    395       "evidence": "Table 2 ablation: filtered+basic+basic (pattern 2) vs filtered+basic+ours DT (pattern 3) improves F1 from 0.23 to 0.33 on TrickyBugs C++",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "TrickCatcher generates up to 16× fewer false positives than baselines on correct programs",
    400       "evidence": "Figure 5 shows TrickCatcher max 5 FPs vs CHAT/DPP 26+ FPs. Ratio: 26/5 ≈ 5× minimum, up to 16× for specific configurations",
    401       "supported": "strong"
    402     },
    403     {
    404       "claim": "Performance remains stable across different numbers of program variants (k=2 to k=10)",
    405       "evidence": "Figure 6 shows TrickCatcher F1 stable 0.40-0.41 and precision 0.69-0.70 across k. DPP fluctuates 0.23-0.25 F1",
    406       "supported": "strong"
    407     },
    408     {
    409       "claim": "TrickCatcher shows greater improvement on high-difficulty tasks than low-difficulty",
    410       "evidence": "Figure 7-8: TrickCatcher median difficulty ~3500 for high-difficulty successes; DPP success median ~2200. Figure 8 shows TrickCatcher higher pass rate on hard tasks (base/extra)",
    411       "supported": "moderate"
    412     },
    413     {
    414       "claim": "Buggy program variants can contribute meaningfully to bug detection",
    415       "evidence": "Section 7.1: 23.2% (TrickyBugs) and 15.0% (EvalPlus) of useful variants are themselves buggy. TrickCatcher outperforms APR (repair-only) confirming non-repair detection",
    416       "supported": "moderate"
    417     }
    418   ],
    419   "methodology_tags": [
    420     "empirical",
    421     "benchmark-eval",
    422     "comparative"
    423   ],
    424   "key_findings": "TrickCatcher, an LLM-powered test generation method combining PUT-guided program variant generation, generator-based input generation, and diversity-driven differential testing, achieves up to 1.80× recall, 2.65× precision, and 1.66× F1 score relative to state-of-the-art baselines on two datasets (TrickyBugs: 366 human-written programs; EvalPlus: 151 AI-generated programs). The approach generates up to 16× fewer false positives than baselines while maintaining stable performance across variant counts and showing larger gains on high-difficulty tasks. Ablation studies confirm each component contributes meaningfully, and the method generalizes to one additional LLM (deepseek-v3).",
    425   "red_flags": [
    426     {
    427       "flag": "No statistical significance testing",
    428       "detail": "Reported improvements (1.80×, 2.65×) lack p-values or confidence intervals; results could reflect random variation rather than systematic advantage"
    429     },
    430     {
    431       "flag": "Variance not quantified",
    432       "detail": "Appendix B describes repetition strategy (100 inputs, C(10,k) combinations) but results tables show only point estimates without standard deviations or confidence bounds"
    433     },
    434     {
    435       "flag": "Sample size not justified",
    436       "detail": "No power analysis or justification provided for 366+151 programs; adequacy unclear for detecting true effect sizes"
    437     },
    438     {
    439       "flag": "LLM hyperparameters underspecified",
    440       "detail": "Temperature, top-p, max_tokens, and other sampling parameters not reported; reproducibility compromised"
    441     },
    442     {
    443       "flag": "Limited generalization scope",
    444       "detail": "Evaluation restricted to coding tasks on two benchmarks. Deepseek-v3 generalization test (Table 3) limited to one alternative model"
    445     },
    446     {
    447       "flag": "Data leakage not fully ruled out",
    448       "detail": "While paper argues TrickyBugs post-dated training cutoff and EvalPlus prohibits training use, timing and enforcement not independently verified"
    449     },
    450     {
    451       "flag": "Manual validation required for TrickyBugs input validity",
    452       "detail": "Input validity assessed by hand for TrickyBugs but automated for EvalPlus; introduces subjectivity and inconsistency"
    453     },
    454     {
    455       "flag": "Limitations section somewhat boilerplate",
    456       "detail": "Budget constraint limiting model choice, LLM uncertainty mitigated by averaging, data leakage addressed by release dates—could be more concrete about residual threats"
    457     },
    458     {
    459       "flag": "Alternative explanations not explored",
    460       "detail": "Paper compares against baselines but does not investigate why diversity-driven approach fundamentally works beyond empirical results"
    461     },
    462     {
    463       "flag": "No reproduction walkthrough in paper",
    464       "detail": "Code available on GitHub but paper lacks step-by-step setup/execution instructions; readers must reverse-engineer from code"
    465     }
    466   ],
    467   "cited_papers": [
    468     {
    469       "title": "Nuances are the key: Unlocking ChatGPT to find failure-inducing tests with differential prompting",
    470       "relevance": "Core prior work (Differential Prompting); paper explicitly builds on and improves this approach with three key modifications"
    471     },
    472     {
    473       "title": "TrickyBugs: A dataset of corner-case bugs in plausible programs",
    474       "relevance": "Primary evaluation dataset; paper defines 'plausible programs' and 'tricky bugs' problem"
    475     },
    476     {
    477       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    478       "relevance": "EvalPlus benchmark used for AI-generated code evaluation; defines base/extra test case split"
    479     },
    480     {
    481       "title": "Large language model-based agents for software engineering: A survey",
    482       "relevance": "Surveys LLM applications in code generation and testing; contextualizes contribution in broader agent/code space"
    483     },
    484     {
    485       "title": "Who judges the judge: An empirical study on online judge tests",
    486       "relevance": "Foundational work identifying prevalence of tricky bugs (3,440 bugs in online judge); motivates problem"
    487     },
    488     {
    489       "title": "Evaluating large language models trained on code",
    490       "relevance": "Foundational LLM code generation benchmark; shows LLM struggle with complex tasks, motivating TrickCatcher's two-step input generation"
    491     },
    492     {
    493       "title": "EvoSuite: Automated unit test generation for object-oriented software",
    494       "relevance": "Traditional search-based test generation baseline; contrasts with LLM-powered approach"
    495     },
    496     {
    497       "title": "KLEE: unassisted automatic generation of high-coverage tests",
    498       "relevance": "Symbolic execution test generation tool; represents pre-LLM approach to test generation"
    499     }
    500   ],
    501   "engagement_factors": {
    502     "practical_relevance": {
    503       "score": 3,
    504       "justification": "TrickCatcher is immediately usable for finding real bugs in code; released on GitHub with full implementation; directly applicable to developer workflows"
    505     },
    506     "surprise_contrarian": {
    507       "score": 2,
    508       "justification": "Diversity-driven testing over majority voting is counterintuitive; using buggy variants as oracle contributors is creative; but core idea (LLM-based test generation) not novel"
    509     },
    510     "fear_safety": {
    511       "score": 0,
    512       "justification": "Bug detection is positive for code safety but paper is tool-focused, not a safety-risk paper; no AI alignment or security concerns raised"
    513     },
    514     "drama_conflict": {
    515       "score": 1,
    516       "justification": "Addresses tension between LLM-generated code quality and testing rigor; somewhat timely given AI code generation boom, but not high-conflict angle"
    517     },
    518     "demo_ability": {
    519       "score": 2,
    520       "justification": "Code is open-source and runnable; requires having target programs to test; not immediately demoable to a broad audience but doable for developers"
    521     },
    522     "brand_recognition": {
    523       "score": 2,
    524       "justification": "Multiple top-tier institutions (Peking, NTU, KCL, UCL); published at ACL (prestigious NLP venue); not industry giants but credible academic pedigree"
    525     }
    526   },
    527   "hn_data": {
    528     "threads": [
    529       {
    530         "hn_id": "41319553",
    531         "title": "First open source Legal AI retrieval benchmark for RAG finally released",
    532         "points": 9,
    533         "comments": 0,
    534         "url": "https://news.ycombinator.com/item?id=41319553"
    535       },
    536       {
    537         "hn_id": "41663273",
    538         "title": "Unsafe Impedance: Safe Languages and Safe by Design Software",
    539         "points": 7,
    540         "comments": 1,
    541         "url": "https://news.ycombinator.com/item?id=41663273"
    542       },
    543       {
    544         "hn_id": "40209981",
    545         "title": "Long-form music generation with latent diffusion",
    546         "points": 3,
    547         "comments": 0,
    548         "url": "https://news.ycombinator.com/item?id=40209981"
    549       },
    550       {
    551         "hn_id": "39807740",
    552         "title": "Perl: Parameter Efficient Reinforcement Learning from Human Feedback",
    553         "points": 3,
    554         "comments": 0,
    555         "url": "https://news.ycombinator.com/item?id=39807740"
    556       },
    557       {
    558         "hn_id": "35687845",
    559         "title": "Backporting RISC-V Vector assembly",
    560         "points": 3,
    561         "comments": 0,
    562         "url": "https://news.ycombinator.com/item?id=35687845"
    563       },
    564       {
    565         "hn_id": "40122867",
    566         "title": "Decentralized Trustless Bridge for Ethereum Full Node",
    567         "points": 2,
    568         "comments": 0,
    569         "url": "https://news.ycombinator.com/item?id=40122867"
    570       },
    571       {
    572         "hn_id": "35676768",
    573         "title": "The Law of Activity Delays",
    574         "points": 2,
    575         "comments": 0,
    576         "url": "https://news.ycombinator.com/item?id=35676768"
    577       },
    578       {
    579         "hn_id": "39180109",
    580         "title": "Personality Inference via Mobile Phone Sensors: A Machine Learning Approach",
    581         "points": 2,
    582         "comments": 1,
    583         "url": "https://news.ycombinator.com/item?id=39180109"
    584       },
    585       {
    586         "hn_id": "39202163",
    587         "title": "Using LLM Such as ChatGPT for Designing and Implementing a RISC Processor",
    588         "points": 2,
    589         "comments": 0,
    590         "url": "https://news.ycombinator.com/item?id=39202163"
    591       },
    592       {
    593         "hn_id": "40061342",
    594         "title": "Long-form music generation with latent diffusion",
    595         "points": 1,
    596         "comments": 0,
    597         "url": "https://news.ycombinator.com/item?id=40061342"
    598       }
    599     ],
    600     "top_points": 9,
    601     "total_points": 34,
    602     "total_comments": 2
    603   }
    604 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs