scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27329B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Hybrid Automated Program Repair by Combining Large Language Models and Program Analysis",
      6     "authors": [
      7       "Fengjie Li",
      8       "Jiajun Jiang",
      9       "Jiajun Sun",
     10       "Hongyu Zhang"
     11     ],
     12     "year": 2024,
     13     "venue": "ACM Transactions on Software Engineering and Methodology",
     14     "arxiv_id": "2406.00992",
     15     "doi": "10.1145/3715004"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "All quantitative claims in the abstract (27.78% and 23.40% average improvement on Defects4J v1.2 and v2.0; at least 42 and 7 more bugs fixed under perfect and automated fault localization, respectively) are directly supported by Tables III, IV, and V.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Controlled comparison against 22 baselines on a fixed benchmark provides adequate grounds for causal improvement claims; however, the component analysis in RQ3 uses frequency counting rather than proper ablation, weakening claims about which components drive gains.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The limitations section explicitly restricts claims to Java and four specific LLMs, and external threats acknowledge results may not generalize to other datasets or languages.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Data leakage is addressed in Section VI.B, but the paper does not discuss whether improvement stems from the skeleton approach itself versus the larger candidate patch budget (500 per skeleton vs 200 LLM patches), a meaningful confound.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper explicitly distinguishes 'plausible patches' (pass test suite) from 'correct patches' (semantically equivalent to developer patch via manual inspection), and reports both.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section VI contains dedicated 'C. Limitation' and 'D. Threats to validity' subsections with substantive content beyond a single sentence.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Internal threats name specific issues: manual patch review subjectivity and LLM training data overlap analyzed concretely for StarCoder; external threats cite evaluation dataset scope as a concrete constraint.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Limitations explicitly bound results to Java and four LLMs, and note that LLM patch generation time is excluded from the 5-hour repair budget.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No acknowledgments or funding disclosure section is present anywhere in the paper.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations (Tianjin University and Chongqing University) are clearly disclosed in the author block.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "Funding source not disclosed; independence cannot be assessed.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial interests statement appears in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms are defined precisely: 'plausible patch' vs 'correct patch' distinction is explicit, 'patch skeleton' is introduced and explained with formal rules, and APR approach categories are enumerated.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Four explicit contributions are enumerated at the end of the introduction: the GIANTREPAIR technique, the patch skeleton method, comprehensive two-scenario evaluation, and open-source release.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The related work section differentiates GIANTREPAIR from FitRepair, Repilot, GAMMA, AlphaRepair, and older deep-learning/template approaches, explaining mechanistic differences rather than just listing citations.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "Source code is openly released on GitHub (https://github.com/Feng-Jay/GiantRepair) as explicitly stated in the paper.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Experiments use Defects4J and GrowingBugs (both public benchmarks); all experimental data including correct and plausible patches is stated to be on the GitHub repository.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Hardware is described (dual Xeon 6388, 512GB RAM, 4x A800, Ubuntu 20.04.6 LTS) but no dependency specification files (requirements.txt, Dockerfile, build scripts) are mentioned in the paper.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "The paper points to a GitHub repository but provides no step-by-step reproduction instructions within the paper itself.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "All results are reported as raw counts of correctly fixed bugs with no confidence intervals or error bars.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No statistical significance tests are applied to any comparative claims; improvement percentages are reported without p-values.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Relative improvement percentages are reported (27.78% and 23.40% average improvement) alongside absolute counts and baseline values.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The paper uses all available single-function bugs from Defects4J (255 in v1.2 and 228 in v2.0) but provides no power analysis or sample size justification.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "LLM patch generation uses temperature=0.8 (stochastic), but no variance across multiple generation runs is reported; all results reflect single-run outcomes.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "22 baseline APR tools are included, spanning template-based, heuristic, deep-learning, and LLM-based approaches from 2016–2023.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Baselines include 2023 state-of-the-art tools (Tare, KNOD, FitRepair, GAMMA, Repilot) published in top venues (ICSE, ASE, FSE).",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": false,
    193           "justification": "RQ3 analyzes rule contribution through frequency counting of which abstraction rules appear in correct fixes, but does not remove components and remeasure performance — this is descriptive, not a proper ablation.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Both number of correctly repaired bugs and patch precision (ratio of correct to plausible patches) are reported across scenarios.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": true,
    205           "justification": "Manual inspection is used to determine semantic equivalence of plausible patches with developer patches, following standard APR practice.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "Defects4J v1.2 and v2.0 serve as fixed evaluation benchmarks; GrowingBugs (filtered subset) is used as an additional unseen validation set.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Table IV breaks results down by project (Chart, Closure, Lang, Math, Time, Mockito) for Defects4J v1.2.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": false,
    223           "justification": "The paper mentions that low patch precision relates to weak test suites (Section V.B) but does not systematically categorize or analyze failure cases.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Table V shows GIANTREPAIR achieves lower patch precision (57.66%) than Hanabi (80.95%) and SimFix (67.50%), and the paper attributes this to weak test suites.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Exact model versions are specified: StarCoderBase (15.5B parameters), CodeLlama-7B, Llama-2-13B, GPT-3.5-turbo-0301.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": false,
    243           "justification": "The paper states it reuses the prompt from Xia et al. [19] but does not reproduce the actual prompt text.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Top-p (0.95), temperature (0.8), max patches per LLM per bug (200), max candidate patches per skeleton (500), and 5-hour time budget per bug are all reported.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "The full two-component pipeline (skeleton construction via AST differencing with Algorithm 1 pseudocode, and patch instantiation via static analysis with Table I abstraction rules) is described in algorithmic detail.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Filtering criteria are documented: single-function bugs only, removing cross-function modifications, resulting in 255 (v1.2) and 228 (v2.0) bugs from the full Defects4J sets.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "The paper states all correct and plausible patches are published on the GitHub repository for independent verification.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Defects4J and GrowingBugs are cited with original publications; the GrowingBugs subset selection (removing StarCoder-trained projects, filtering cross-function bugs) is documented step by step.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No participant recruitment — the study uses existing software defect benchmarks only.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The pipeline from LLM patch generation through skeleton construction, instantiation, ranking, and validation is documented in Sections III and IV with pseudocode and formal rules.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "Training data cutoffs are not stated for any of the four LLMs; StarCoder's training data composition is referenced but without a temporal cutoff date.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": true,
    301           "justification": "Section VI.B explicitly analyzes train/test overlap for StarCoder, finding 23 of 109 correct patches appeared in training data, and conducts additional experiments on GrowingBugs to address this.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": true,
    307           "justification": "The paper checks StarCoder's published training data against correct patches and validates on GrowingBugs (34 projects confirmed outside StarCoder training data) to demonstrate results hold without contamination.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants in this study.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants in this study.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants in this study.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants in this study.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants in this study.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants in this study.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants in this study.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "Hardware is listed but no inference cost in dollars, GPU-hours, or wall-clock time for LLM patch generation is reported; the paper explicitly notes LLM time is excluded from the 5-hour budget.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Hardware specifications are provided but no total compute budget (GPU-hours, total cost) for running all experiments is stated.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "GIANTREPAIR improves correct bug fixes over direct LLM patch use by an average of 27.78% on Defects4J v1.2 and 23.40% on Defects4J v2.0",
    374       "evidence": "Table III shows per-LLM improvement: GPT-3.5 43→53, StarCoder 42→55, CodeLlama 40→51, Llama-2 19→25 (v1.2); and 45→53, 44→54, 34→43, 18→24 (v2.0)",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "GIANTREPAIR outperforms all 22 SOTA APRs under perfect fault localization, fixing at least 42 more bugs than the best baseline (FitRepair)",
    379       "evidence": "Table IV: GIANTREPAIR 87+84=171 total vs FitRepair 85+44=129 total on Defects4J v1.2 and v2.0",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Under automated fault localization, GIANTREPAIR repairs at least 7 more bugs than the best SOTA (64 vs Tare's 57 on Defects4J v1.2)",
    384       "evidence": "Table V shows GIANTREPAIR repairs 64 bugs vs Tare 57, TBar 31, SimFix 27, Hanabi 17",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "GIANTREPAIR fixes 21 unique bugs not repaired by any of 22 baseline APR tools",
    389       "evidence": "Figure 3 (right panel) shows GIANTREPAIR contributing 21 unique fixes compared against all 22 baselines on Defects4J v1.2",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Data leakage from training data accounts for only a minority (23/109 = 21%) of GIANTREPAIR+StarCoder correct patches",
    394       "evidence": "Section VI.B manual analysis of StarCoder's published training data against 109 correct patches; supplemented by GrowingBugs experiment (10/51 bugs fixed on contamination-filtered set)",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "GIANTREPAIR is more general than baseline APRs, performing consistently across both Defects4J v1.2 and v2.0",
    399       "evidence": "Table IV shows baselines achieve much better performance on v1.2 than v2.0 (e.g., FitRepair 85 vs 44), while GIANTREPAIR shows smaller drop (87 vs 84)",
    400       "supported": "moderate"
    401     }
    402   ],
    403   "methodology_tags": [
    404     "benchmark-eval"
    405   ],
    406   "key_findings": "GIANTREPAIR introduces a two-phase hybrid APR approach: LLM-generated patches are abstracted into structural 'patch skeletons' via AST differencing, then instantiated with context-appropriate program elements via static analysis, achieving 27.78% and 23.40% average improvement over direct LLM use on Defects4J v1.2 and v2.0 respectively. The system outperforms 22 state-of-the-art APR tools, fixing at least 42 more bugs than the best baseline under perfect fault localization and 7 more under automated fault localization. A key methodological contribution is evaluating under automated fault localization — where GIANTREPAIR achieves 64 correct repairs, approaching several LLM-based methods that require the unrealistic assumption of perfect localization. Data leakage analysis shows 86/109 StarCoder-based correct patches were not present in training data, partially validating the approach's genuine capabilities.",
    407   "red_flags": [
    408     {
    409       "flag": "No proper ablation study",
    410       "detail": "RQ3 analyzes component contribution through frequency counting of which AST abstraction rules appear in correct fixes, not by removing components and remeasuring performance — this does not establish causal importance of individual rules."
    411     },
    412     {
    413       "flag": "No statistical significance tests",
    414       "detail": "All comparative claims report raw bug counts and percentages without confidence intervals, p-values, or significance tests, making it impossible to assess whether differences are statistically meaningful given benchmark variance."
    415     },
    416     {
    417       "flag": "LLM stochasticity unreported",
    418       "detail": "LLMs use temperature=0.8 (stochastic output), but no variance across multiple generation runs is reported; all results reflect single-run outcomes with no error estimates."
    419     },
    420     {
    421       "flag": "Search budget confound unaddressed",
    422       "detail": "GIANTREPAIR generates up to 500 candidate patches per skeleton while baseline LLMs generate 200 patches; increased search budget may partially explain improvements independent of the skeleton approach."
    423     },
    424     {
    425       "flag": "GPT-4 comparison cherry-picked",
    426       "detail": "The GPT-4 comparison (Section VI.A) tests only 10 bugs selected specifically as ones GIANTREPAIR fixes but direct LLMs cannot — not a representative head-to-head evaluation."
    427     },
    428     {
    429       "flag": "Prompt not reproduced",
    430       "detail": "The paper reuses Xia et al.'s prompt without including the actual text, reducing full reproducibility."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "The plastic surgery hypothesis in the era of large language models (FitRepair)",
    436       "relevance": "Most closely related work; also leverages LLMs for APR with the plastic surgery hypothesis — direct predecessor that GIANTREPAIR extends and outperforms"
    437     },
    438     {
    439       "title": "Copiloting the copilots: Fusing large language models with completion engines for automated program repair (Repilot)",
    440       "relevance": "LLM+program-analysis hybrid APR baseline that fuses CodeT5 with completion engines — related hybrid approach"
    441     },
    442     {
    443       "title": "GAMMA: Revisiting template-based automated program repair via mask prediction",
    444       "relevance": "2023 SOTA template-based APR combining CodeBERT/UniXcoder with masked fix patterns — key baseline"
    445     },
    446     {
    447       "title": "Less training, more repairing: revisiting automated program repair via zero-shot learning (AlphaRepair)",
    448       "relevance": "Zero-shot LLM-based APR using CodeBERT for masked token replacement — important baseline in the LLM-APR space"
    449     },
    450     {
    451       "title": "Automated program repair in the era of large pre-trained language models (Xia et al., ICSE 2023)",
    452       "relevance": "Foundational LLM-based APR paper whose prompt and evaluation setup GIANTREPAIR directly reuses"
    453     },
    454     {
    455       "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs",
    456       "relevance": "Primary evaluation benchmark — the standard Java APR benchmark used throughout"
    457     },
    458     {
    459       "title": "Tare: Type-aware neural program repair",
    460       "relevance": "2023 deep-learning APR baseline with strongest performance under automated fault localization; direct comparison point in Table V"
    461     },
    462     {
    463       "title": "Shaping program repair space with existing patches and similar code (SimFix)",
    464       "relevance": "Heuristic APR baseline using existing patches to guide repair — predecessor to the 'skeleton' concept and comparison point"
    465     },
    466     {
    467       "title": "Tbar: Revisiting template-based automated program repair",
    468       "relevance": "Canonical template-based APR baseline used across multiple comparison scenarios"
    469     }
    470   ],
    471   "engagement_factors": {
    472     "practical_relevance": {
    473       "score": 2,
    474       "justification": "Open-sourced tool that measurably improves bug repair rates on real Java defects, directly applicable to developers using LLMs for APR workflows."
    475     },
    476     "surprise_contrarian": {
    477       "score": 1,
    478       "justification": "The insight that structurally incorrect LLM patches still provide useful guidance is mildly counterintuitive, but incremental within the APR field."
    479     },
    480     "fear_safety": {
    481       "score": 0,
    482       "justification": "No AI safety or risk concerns; this is a software engineering productivity tool."
    483     },
    484     "drama_conflict": {
    485       "score": 0,
    486       "justification": "No controversy or conflict angle; standard benchmark comparison paper."
    487     },
    488     "demo_ability": {
    489       "score": 2,
    490       "justification": "Code is on GitHub and Defects4J is publicly available, making the tool runnable by practitioners."
    491     },
    492     "brand_recognition": {
    493       "score": 0,
    494       "justification": "Authors from Tianjin and Chongqing universities; no famous lab, industry affiliation, or well-known product involved."
    495     }
    496   },
    497   "hn_data": {
    498     "threads": [
    499       {
    500         "hn_id": "38853706",
    501         "title": "Possible Meissner effect near room temperature: copper-substituted lead apatite",
    502         "points": 729,
    503         "comments": 318,
    504         "url": "https://news.ycombinator.com/item?id=38853706"
    505       },
    506       {
    507         "hn_id": "38850232",
    508         "title": "LK99: Possible Meissner effect near room temperature",
    509         "points": 6,
    510         "comments": 2,
    511         "url": "https://news.ycombinator.com/item?id=38850232"
    512       },
    513       {
    514         "hn_id": "39399420",
    515         "title": "A Comprehensive Survey of 400 Activation Functions",
    516         "points": 5,
    517         "comments": 0,
    518         "url": "https://news.ycombinator.com/item?id=39399420"
    519       },
    520       {
    521         "hn_id": "40053439",
    522         "title": "GPT-4 Vision Can Estimate Building Age Epoch",
    523         "points": 4,
    524         "comments": 1,
    525         "url": "https://news.ycombinator.com/item?id=40053439"
    526       },
    527       {
    528         "hn_id": "39656735",
    529         "title": "Survey of 400 Activation Functions for Neural Networks",
    530         "points": 3,
    531         "comments": 0,
    532         "url": "https://news.ycombinator.com/item?id=39656735"
    533       },
    534       {
    535         "hn_id": "31603582",
    536         "title": "DPM-Solver: A Fast Ode Solver for Diffusion Model Sampling in Around 10 Steps",
    537         "points": 3,
    538         "comments": 0,
    539         "url": "https://news.ycombinator.com/item?id=31603582"
    540       },
    541       {
    542         "hn_id": "44984327",
    543         "title": "How Random Is Random? Evaluating Randomness and Humaness of LLM Coin Flip (2024)",
    544         "points": 2,
    545         "comments": 0,
    546         "url": "https://news.ycombinator.com/item?id=44984327"
    547       },
    548       {
    549         "hn_id": "44254682",
    550         "title": "LLMs Can Write Efficient CUDA Kernels",
    551         "points": 2,
    552         "comments": 0,
    553         "url": "https://news.ycombinator.com/item?id=44254682"
    554       },
    555       {
    556         "hn_id": "36396055",
    557         "title": "Exploring the Implications of Large Language Models on the Science System",
    558         "points": 2,
    559         "comments": 0,
    560         "url": "https://news.ycombinator.com/item?id=36396055"
    561       },
    562       {
    563         "hn_id": "40444248",
    564         "title": "Contextual Emotion Recognition Using Large Vision Language Models",
    565         "points": 1,
    566         "comments": 0,
    567         "url": "https://news.ycombinator.com/item?id=40444248"
    568       }
    569     ],
    570     "top_points": 729,
    571     "total_points": 757,
    572     "total_comments": 321
    573   }
    574 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs