scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28769B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "LLM Test Generation via Iterative Hybrid Program Analysis",
      6     "authors": [
      7       "Sijia Gu",
      8       "Noor Nashid",
      9       "Ali Mesbah"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2503.13580",
     14     "doi": "10.1145/3744916.3764553"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The abstract claims 26% higher line coverage and 23% higher branch coverage; Table 2 reports 26.3% and 22.7% respectively vs. SymPrompt, consistent within rounding. All other abstract claims about iterative feedback and hybrid analysis are supported by Algorithms 1–3 and evaluation results.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper makes causal claims that hybrid program analysis improves coverage, supported by a four-variant ablation study (baseline, Panta_basic, Panta_cov, Panta) in Table 3 that isolates contributions of the iterative framework, coverage feedback, and path selection separately.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Section 5 explicitly bounds scope to 'Java SE projects from the Defects4J benchmark' with CYC > 10, acknowledges limitations with inheritance-heavy classes, and flags Java EE as out of scope for future work.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The paper discusses cases where SymPrompt outperforms Panta (large utility classes like MapUtils with 71 methods) and attributes this to prompt length issues, providing an alternative mechanism rather than dismissing anomalies.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper explicitly notes 'coverage alone does not ensure fault detection if test assertions are non-existent or weak' and includes mutation score alongside coverage metrics to distinguish between coverage proxy and actual fault detection.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 5 contains a dedicated 'Limitations' subsection and Section 6 is titled 'Threats to Validity,' covering multiple specific concerns beyond a single sentence.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Section 6 names specific threats: LLM capability variance (mitigated by testing 4 models), coverage-effectiveness gap (addressed with mutation score), Comex CFG tool bugs (mitigated by path verification), and Defects4J training contamination (mitigated by relative comparisons).",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The paper explicitly states Panta is 'most applicable to Java SE projects' with complex control flow, excludes inheritance-heavy and extreme-switch classes, and identifies Java EE and multi-language support as out-of-scope future work.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding source is mentioned anywhere in the paper. There is no acknowledgments section or grant disclosure.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors list University of British Columbia as their affiliation in the paper header, with individual email addresses.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No funding is disclosed, so independence cannot be assessed.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement, patent declaration, or financial disclosure appears anywhere in the paper.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Cyclomatic complexity is formally defined (CYC = #edges − #nodes + 2), coverage deficiency score is defined as missed statements + missed branches per path, and key algorithmic terms are precisely defined in Algorithms 1–3.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 1 explicitly lists four numbered contributions: the iterative feedback technique, the path selection strategy, the Panta tool itself, and the empirical evaluation on Defects4J.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 7 systematically positions Panta against EvoSuite, HITS, SymPrompt, CoverUp, CodaMosa, ChatUniTest, and ASTER, explaining specific technical differences (e.g., static vs. dynamic, method-level vs. class-level, augmentation support).",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "Panta is described as 'publicly available at [25]' with a GitHub URL (https://github.com/PANTA-TestAutomation/Panta) cited with an access date of August 20, 2025.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "Evaluation uses Defects4J v2.0.1, a publicly available benchmark. The paper also states the dataset is publicly available in their repository.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "Specific tool versions are named in the paper (Maven 3.6.3, JUnit 4.13.2, JaCoCo 0.8.11, Pitest 1.17.0, Java 8+), but no Dockerfile, requirements file, or environment specification file is included in the paper itself.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": true,
    140           "justification": "Section 6 explicitly states 'we provide detailed instructions for replicating our experimental results' as part of the public GitHub release, which is a positive claim of provision rather than absence of evidence.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "No confidence intervals or error bars appear in any result table. The authors explicitly perform a single run per class due to low temperature, so no variance data is collected.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "A Paired T-test is used only to compare Panta_cov vs. Panta_basic in the ablation study. The primary comparison between Panta and SymPrompt (Table 2) — the main claim of the paper — has no significance testing.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Effect sizes are reported as absolute percentage-point differences alongside the baseline values: '26.3% higher line coverage and 22.7% higher branch coverage', with full numerical tables providing baseline context.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The 130 classes are selected based on availability and complexity criteria (CYC > 10), but no power analysis or justification that this sample size is sufficient for statistical inference is provided.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No standard deviation, variance, or confidence ranges are reported for any metric. The paper explicitly states 'we perform a single run per class' with no cross-run variance.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "SymPrompt serves as the primary state-of-the-art baseline in Table 2, and a zero-shot LLM baseline is included in the ablation study (Table 3).",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "SymPrompt (FSE 2024) and HITS (ASE 2024) are both 2024 papers, contemporary with the 2025 submission. The authors explain why HITS cannot be compared directly (no public implementation).",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Table 3 presents a thorough ablation with four variants: baseline (no iteration), Panta_basic (iteration only), Panta_cov (iteration + coverage), and full Panta (iteration + hybrid analysis), isolating each component's contribution.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Five metrics are used: line coverage, branch coverage, pass rate, mutation score, and High Coverage Count (HCC). Mutation score is explicitly included to address coverage-adequacy limitations.",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": false,
    203           "answer": false,
    204           "justification": "Human evaluation is not relevant for an automated test generation tool evaluated on objective coverage and mutation metrics.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": false,
    209           "answer": false,
    210           "justification": "This is a coverage optimization task, not a prediction task; held-out test set evaluation is not applicable.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Results are broken down by all 14 Defects4J projects in Tables 2–4, and Table 3 additionally segments by cyclomatic complexity (CYC≤20 vs. CYC>20) to assess performance on complex code.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Section 4.2 discusses cases where SymPrompt outperforms Panta (Collections pass rate, JDatabind/Lang mutation score) with specific attribution to large utility classes; Section 5 discusses LLM inability to fix runtime AssertionErrors.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "The paper reports SymPrompt outperforming Panta in Collections pass rate and JDatabind/Lang mutation score, and notes average mutation scores below 50% across all LLMs, including Panta.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "All four models are named with version identifiers: Meta Llama 3.3 70B, Mistral Large 2, GPT-4o Mini, and Claude 3.5 Haiku, each with an access date of March 4, 2025.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": false,
    242           "justification": "Figure 4 shows a prompt template with placeholders ({source_file_numbered}, {test_file}, {test_dependencies}, {method_name_N}, {selected_path_for_method_N}). Per criterion, templates with unfilled placeholders do not satisfy this requirement.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Section 4.1 explicitly states 'max_tokens set to 4096 for output generation and temperature set to 0.2' applied consistently across all four LLMs.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "The iterative scaffolding is described in detail across three algorithms (CFG path extraction, path selection, iterative framework) with explicit stopping conditions (maxCYC, maxNoIncreaseLimit, 100% coverage).",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Class selection criteria are documented: public, non-abstract classes with at least one MUT with CYC > 10, with manual exclusions for extreme-switch classes (CYC > 40) and inheritance-heavy classes explained.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "Section 6 states 'we have made Panta's implementation and dataset publicly available [25]' with a GitHub URL, and 'More details about individual classes are publicly available in our repository.'",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Data collection is described: Defects4J v2.0.1, latest fixed version of each project, excluding Chart/Mockito/Closure (deprecated), selecting classes with CYC > 10, with manual outlier exclusion criteria explained.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants; evaluation uses a standard benchmark (Defects4J).",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The full pipeline is documented: class selection → CFG extraction → path selection → prompt construction → test generation → validation → repair → coverage measurement, with algorithms and tool versions specified.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "Section 6 acknowledges data contamination as a concern but does not state training data cutoffs for any of the four evaluated LLMs.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": true,
    300           "justification": "Section 6 explicitly discusses 'the possibility that LLMs may have encountered parts of the Defects4J codebase during pretraining' as a named threat to internal validity.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": true,
    306           "justification": "The paper addresses contamination by reporting relative improvements between prompting strategies using the same LLM on the same projects, rather than absolute performance, explicitly stating this mitigation strategy.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants in this study.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants in this study.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants in this study.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants in this study.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants in this study.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants in this study.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants in this study.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": true,
    358           "justification": "Section 5 reports average runtime of 2.3 minutes per method (53 minutes per class), with project-specific breakdowns ranging from 0.8 min/method (Cli) to 9.4 min/method (Collections).",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "No total API cost, GPU hours, or compute budget is reported. Runtime is given but not the total monetary or compute cost of running the full evaluation.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Panta achieves 26% higher line coverage and 23% higher branch coverage compared to SymPrompt (state-of-the-art).",
    373       "evidence": "Table 2 reports 70.18% vs. 43.91% line coverage and 60.83% vs. 38.17% branch coverage across 130 classes from 14 Defects4J projects.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "The iterative feedback framework alone (Panta_basic) improves coverage by ~28pp line and ~26pp branch over a single-pass baseline.",
    378       "evidence": "Table 3 shows baseline at 32.69%/24.87% vs. Panta_basic at 60.34%/50.64% for line/branch coverage averaged across all 130 classes.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Hybrid program analysis (static CFG + dynamic coverage) adds ~10pp coverage improvement beyond the iterative framework alone.",
    383       "evidence": "Table 3: Panta_basic 60.34%/50.64% vs. full Panta 70.18%/60.83% for line/branch. Paired T-test shows Panta_cov vs. Panta_basic difference is not statistically significant, so path selection drives the gain.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Panta maintains consistent performance across complexity levels (only 1pp difference), unlike baseline variants that degrade 6–9pp for high-CYC classes.",
    388       "evidence": "Table 3 CYC groups: Panta shows 70.37% vs. 69.42% (line coverage) for CYC≤20 vs. CYC>20, while baseline shows 34.01% vs. 27.38%.",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Claude 3.5 Haiku is the best-performing model for code coverage metrics; Llama 3.3 70B is the most cost-effective alternative.",
    393       "evidence": "Table 4: Claude achieves 73.2% line / 66.5% branch / 62 HCC vs. Llama's 70.2% / 60.8% / 45 HCC; Claude is proprietary with higher API cost.",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "Average mutation score across all LLMs consistently falls below 50%, indicating weak fault detection capability in generated tests.",
    398       "evidence": "Table 4 mutation score averages: Llama 43.8%, Mistral 36.6%, GPT 34.9%, Claude 47.9% — all below 50%.",
    399       "supported": "strong"
    400     }
    401   ],
    402   "methodology_tags": [
    403     "benchmark-eval"
    404   ],
    405   "key_findings": "Panta combines static control flow graph analysis with dynamic coverage feedback in an iterative, repair-enabled loop to guide LLM test generation toward uncovered branches. Evaluated on 130 complex Java classes from Defects4J, it achieves 26pp and 23pp higher line and branch coverage, respectively, than SymPrompt, with 45 vs. 16 classes exceeding 75% branch coverage. The iterative framework contributes the largest share of the improvement (~28pp); hybrid path selection adds ~10pp more and is the only variant that maintains consistent performance across high- and low-complexity code. Despite the coverage gains, average mutation scores across all four evaluated LLMs remain below 50%, highlighting that coverage improvements do not translate proportionally into fault detection capability.",
    406   "red_flags": [
    407     {
    408       "flag": "Single run per class",
    409       "detail": "All main results are from a single run per class at temperature 0.2. No variance, standard deviation, or confidence intervals are reported for any metric, making it impossible to assess result stability or run significance tests on the primary comparison."
    410     },
    411     {
    412       "flag": "SymPrompt re-implemented by authors",
    413       "detail": "The main baseline (SymPrompt) was reimplemented by the Panta authors because 'neither HITS nor SymPrompt has publicly available implementations.' The reimplementation fidelity cannot be independently verified, introducing potential comparison bias."
    414     },
    415     {
    416       "flag": "No significance testing on primary comparison",
    417       "detail": "The main claim that Panta outperforms SymPrompt (Table 2) has no statistical significance test. A Paired T-test is only applied to one internal ablation comparison, not the headline result."
    418     },
    419     {
    420       "flag": "Selective evaluation sample",
    421       "detail": "Only classes containing at least one method with cyclomatic complexity > 10 are evaluated (130 classes in total), and two outlier types (extreme-switch and inheritance-heavy classes) are manually excluded. As a result, the findings are not representative of typical Java codebases and cannot be generalized to average-complexity code."
    422     },
    423     {
    424       "flag": "No funding disclosed",
    425       "detail": "No funding source is mentioned anywhere in the paper, making it impossible to assess potential conflicts of interest or institutional bias."
    426     }
    427   ],
    428   "cited_papers": [
    429     {
    430       "title": "Code-Aware Prompting: A Study of Coverage-Guided Test Generation in Regression Setting using LLM (SymPrompt)",
    431       "relevance": "Primary baseline compared against; uses symbolic execution path constraints for LLM-guided test generation"
    432     },
    433     {
    434       "title": "HITS: High-coverage LLM-based Unit Test Generation via Method Slicing",
    435       "relevance": "Contemporary LLM test generation technique using method decomposition; discussed but not empirically compared due to no public implementation"
    436     },
    437     {
    438       "title": "CoverUp: Coverage-guided LLM-based test generation",
    439       "relevance": "Related approach using coverage information without path-level guidance; used as motivation for Panta's more structured approach"
    440     },
    441     {
    442       "title": "EvoSuite: Automatic Test Suite Generation for Object-Oriented Software",
    443       "relevance": "Dominant search-based baseline for Java unit test generation; referenced as the prior state of the art that LLM methods have advanced beyond"
    444     },
    445     {
    446       "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs",
    447       "relevance": "Benchmark dataset used for all experiments; standard in Java testing research"
    448     },
    449     {
    450       "title": "CodaMosa: Escaping Coverage Plateaus in Test Generation with Pre-Trained Large Language Models",
    451       "relevance": "Hybrid LLM + search-based approach that motivated combining LLMs with existing test generation infrastructure"
    452     },
    453     {
    454       "title": "ChatUniTest: A framework for LLM-based test generation",
    455       "relevance": "Uses generation-validation-repair mechanism similar to Panta's test repair component"
    456     },
    457     {
    458       "title": "Assertions are strongly correlated with test suite effectiveness",
    459       "relevance": "Justifies use of mutation score alongside coverage metrics to measure test effectiveness"
    460     }
    461   ],
    462   "engagement_factors": {
    463     "practical_relevance": {
    464       "score": 3,
    465       "justification": "Panta is a complete, publicly available Java tool that directly addresses test coverage gaps in real projects, applicable to any Maven-based Java SE project."
    466     },
    467     "surprise_contrarian": {
    468       "score": 1,
    469       "justification": "The combination of static CFG and dynamic coverage is intuitive and incremental; no findings challenge conventional wisdom in a striking way."
    470     },
    471     "fear_safety": {
    472       "score": 0,
    473       "justification": "Paper is about test generation quality with no AI safety or risk implications."
    474     },
    475     "drama_conflict": {
    476       "score": 0,
    477       "justification": "Standard empirical evaluation with no controversy or contested claims."
    478     },
    479     "demo_ability": {
    480       "score": 2,
    481       "justification": "GitHub repo is publicly available with reproduction instructions; practitioners can run Panta on their own Java projects."
    482     },
    483     "brand_recognition": {
    484       "score": 1,
    485       "justification": "Authors are from UBC, a known research institution, but not a famous industry AI lab; no major brand association."
    486     }
    487   },
    488   "hn_data": {
    489     "threads": [
    490       {
    491         "hn_id": "35390153",
    492         "title": "HuggingGPT: Solving AI tasks with ChatGPT and its friends in HuggingFace",
    493         "points": 243,
    494         "comments": 267,
    495         "url": "https://news.ycombinator.com/item?id=35390153",
    496         "created_at": "2023-03-31T17:22:19Z"
    497       },
    498       {
    499         "hn_id": "45284766",
    500         "title": "Towards a Physics Foundation Model",
    501         "points": 117,
    502         "comments": 30,
    503         "url": "https://news.ycombinator.com/item?id=45284766",
    504         "created_at": "2025-09-18T03:06:08Z"
    505       },
    506       {
    507         "hn_id": "30807022",
    508         "title": "Mounting evidence for a 95 GeV Higgs boson",
    509         "points": 88,
    510         "comments": 26,
    511         "url": "https://news.ycombinator.com/item?id=30807022",
    512         "created_at": "2022-03-25T21:02:38Z"
    513       },
    514       {
    515         "hn_id": "45482380",
    516         "title": "Acoustic Eavesdropping via Mouse Sensors",
    517         "points": 4,
    518         "comments": 0,
    519         "url": "https://news.ycombinator.com/item?id=45482380",
    520         "created_at": "2025-10-05T15:40:37Z"
    521       },
    522       {
    523         "hn_id": "39892053",
    524         "title": "Systematic ordering of foods in the Soup-Salad-Sandwich phase space",
    525         "points": 3,
    526         "comments": 0,
    527         "url": "https://news.ycombinator.com/item?id=39892053",
    528         "created_at": "2024-04-01T09:02:55Z"
    529       },
    530       {
    531         "hn_id": "22785832",
    532         "title": "Mastering Mahjong with Deep Reinforcement Learning",
    533         "points": 3,
    534         "comments": 0,
    535         "url": "https://news.ycombinator.com/item?id=22785832",
    536         "created_at": "2020-04-05T14:45:14Z"
    537       },
    538       {
    539         "hn_id": "44066363",
    540         "title": "MMaDA: Multimodal Large Diffusion Language Models",
    541         "points": 2,
    542         "comments": 0,
    543         "url": "https://news.ycombinator.com/item?id=44066363",
    544         "created_at": "2025-05-22T20:13:21Z"
    545       },
    546       {
    547         "hn_id": "45503848",
    548         "title": "Invisible Ears at Your Fingertips: Acoustic Eavesdropping via Mouse Sensors",
    549         "points": 1,
    550         "comments": 0,
    551         "url": "https://news.ycombinator.com/item?id=45503848",
    552         "created_at": "2025-10-07T14:54:15Z"
    553       },
    554       {
    555         "hn_id": "44120397",
    556         "title": "MMaDA: Multimodal Large Diffusion Language Models",
    557         "points": 1,
    558         "comments": 0,
    559         "url": "https://news.ycombinator.com/item?id=44120397",
    560         "created_at": "2025-05-28T20:33:16Z"
    561       },
    562       {
    563         "hn_id": "35282416",
    564         "title": "Efficient multi-stage inference on tabular data",
    565         "points": 1,
    566         "comments": 0,
    567         "url": "https://news.ycombinator.com/item?id=35282416",
    568         "created_at": "2023-03-23T22:38:28Z"
    569       }
    570     ],
    571     "top_points": 243,
    572     "total_points": 463,
    573     "total_comments": 323
    574   }
    575 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs