scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27890B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Less Training, More Repairing Please: Revisiting Automated Program Repair via Zero-shot Learning",
      6     "authors": [
      7       "Chunqiu Steven Xia",
      8       "Lingming Zhang"
      9     ],
     10     "year": 2022,
     11     "venue": "ESEC/FSE",
     12     "arxiv_id": "2207.08281",
     13     "doi": "10.1145/3540250.3549101"
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The abstract claims of outperforming state-of-the-art APR and achieving 3.3X more fixes on Defects4J 2.0 are directly supported by Tables 1 and 4. The multilingual capability claim is supported by Table 5 on QuixBugs.",
     21         "source": "haiku"
     22       },
     23       "causal_claims_justified": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The ablation study (Table 3) adds components incrementally and measures the contribution of each mask strategy and comment encoding separately, providing adequate support for causal claims about which components drive performance.",
     27         "source": "haiku"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The paper explicitly scopes to single-line bugs and tests generalization on Defects4J 2.0 and QuixBugs; multilingual claims are bounded to the two languages tested (Java and Python); the paper uses 'can potentially' language appropriately.",
     33         "source": "haiku"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper does not discuss whether performance gains could be attributed to CodeBERT's larger scale compared to baselines rather than the zero-shot cloze formulation itself; no alternative explanations for the superiority over fine-tuned models are considered.",
     39         "source": "haiku"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "The paper explicitly distinguishes between plausible patches (pass test suite) and correct patches (semantically equivalent to developer patch via manual inspection), clearly separating the proxy metric from the true outcome.",
     45         "source": "haiku"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section 6 is a dedicated 'Threats to Validity' section with subsections for Internal and External threats, going well beyond a single sentence in the conclusion.",
     53         "source": "haiku"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Specific threats are quantified: 16.6% of Defects4J 1.2 bugs and 11.0% of Defects4J 2.0 bugs overlap with CodeBERT training data; these are addressed with targeted perturbation experiments on the 15 overlapping fixed bugs.",
     59         "source": "haiku"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper explicitly states 'we focus on single line patches in this work' and evaluates only the 82 single-line bugs in Defects4J 2.0, making the single-line scope boundary clear throughout.",
     65         "source": "haiku"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No acknowledgments or funding disclosure appears anywhere in the paper text provided.",
     73         "source": "haiku"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Both authors list University of Illinois Urbana-Champaign with email addresses in the author block.",
     79         "source": "haiku"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "No funding is disclosed, so independence cannot be assessed.",
     85         "source": "haiku"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests statement or declaration of financial interests appears in the paper.",
     91         "source": "haiku"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Key terms are defined: 'zero-shot learning' is cited and explained in the APR context, 'cloze task' is defined with reference, 'perfect fault localization' is explained as oracle knowledge of bug location, and 'plausible' vs 'correct' patches are distinguished.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper lists three explicit numbered contributions: (1) the cloze-style APR direction, (2) the AlphaRepair tool built on CodeBERT, and (3) extensive empirical evaluation on Defects4J and QuixBugs.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 2 provides detailed background on learning-based APR (NMT approaches) and large pre-trained code models; the evaluation compares against 18 baseline APR tools with explicit discussion of how AlphaRepair addresses limitations of each category.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "empirical": {
    117       "artifacts": {
    118         "code_released": {
    119           "applies": true,
    120           "answer": true,
    121           "justification": "The paper cites Zenodo record [2] (https://zenodo.org/record/6819444) containing 'all correct patches for public evaluation along with the code to reproduce our experiments.'",
    122           "source": "haiku"
    123         },
    124         "data_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "Defects4J and QuixBugs are standard publicly available benchmarks used unmodified; no proprietary datasets are involved.",
    128           "source": "haiku"
    129         },
    130         "environment_specified": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "Hardware (Intel i7 10700KF, RTX 3080 Ti, Ubuntu 20.04.3, Java 1.8.0_312) and PyTorch are mentioned, but no requirements.txt, Dockerfile, or formal dependency specification file is provided.",
    134           "source": "haiku"
    135         },
    136         "reproduction_instructions": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "The paper references the Zenodo artifact for reproduction but provides no step-by-step instructions within the paper text itself; the artifact may contain instructions but cannot be verified from the paper alone.",
    140           "source": "haiku"
    141         }
    142       },
    143       "statistical_methodology": {
    144         "confidence_intervals_or_error_bars": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "All results are reported as raw counts of bugs fixed with no confidence intervals or error bars; the deterministic nature of test-suite validation does not excuse the lack of uncertainty quantification across runs.",
    148           "source": "haiku"
    149         },
    150         "significance_tests": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "No statistical significance tests are applied to any comparative claims despite comparing counts across 18 baselines on benchmarks with 40-391 bugs.",
    154           "source": "haiku"
    155         },
    156         "effect_sizes_reported": {
    157           "applies": true,
    158           "answer": true,
    159           "justification": "Effect sizes are reported as ratios with baseline context: '3.3X more bugs than best baseline on Defects4J 2.0' (36 vs 11) and absolute improvements like '74 vs 68' correct patches.",
    160           "source": "haiku"
    161         },
    162         "sample_size_justified": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "Benchmark sizes (391 bugs in Defects4J 1.2, 82 single-line bugs in Defects4J 2.0, 40 in QuixBugs) are inherited from prior work without any power analysis or justification for adequacy.",
    166           "source": "haiku"
    167         },
    168         "variance_reported": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "Results are single-point counts with no variance reported across runs; no information on whether experiments were repeated or how stochastic the patch generation process is.",
    172           "source": "haiku"
    173         }
    174       },
    175       "evaluation_design": {
    176         "baselines_included": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "18 baseline APR tools are compared, comprising 6 learning-based (Recoder, CURE, CoCoNuT, DLFix, SequenceR, DeepDebug) and 12 traditional tools (TBar, PraPR, AVATAR, SimFix, etc.).",
    180           "source": "haiku"
    181         },
    182         "baselines_contemporary": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "The primary learning-based baselines (Recoder 2021, CURE 2021, CoCoNuT 2020, DLFix 2020) are recent and competitive; the paper uses Recoder's updated results from the authors' GitHub.",
    186           "source": "haiku"
    187         },
    188         "ablation_study": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Table 3 provides an ablation study showing the incremental contribution of each component: complete mask (+20), partial begin (+13), partial end (+15), template (+21), and comment buggy line encoding (+5).",
    192           "source": "haiku"
    193         },
    194         "multiple_metrics": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "The paper reports both correct patches and plausible patches as primary metrics, plus patch ranking position (avg 612th → 418th with re-ranking) and unique bug fixes as secondary metrics.",
    198           "source": "haiku"
    199         },
    200         "human_evaluation": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "Patch correctness is determined by 'manually inspecting each plausible patch for semantic equivalency' — the authors performed human evaluation of all plausible patches.",
    204           "source": "haiku"
    205         },
    206         "held_out_test_set": {
    207           "applies": true,
    208           "answer": true,
    209           "justification": "Defects4J 2.0 explicitly serves as a held-out generalization test (RQ3), as AlphaRepair uses no training data and the newer Defects4J 2.0 projects are separate from the primary evaluation set.",
    210           "source": "haiku"
    211         },
    212         "per_category_breakdown": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "Table 1 and Table 4 break down results per Java project (Chart, Closure, Lang, Math, Mockito, Time for Defects4J 1.2; Cli, Codec, Collections, etc. for Defects4J 2.0).",
    216           "source": "haiku"
    217         },
    218         "failure_cases_discussed": {
    219           "applies": true,
    220           "answer": false,
    221           "justification": "The paper focuses on bugs AlphaRepair uniquely fixes, providing positive examples; no discussion of bugs AlphaRepair fails to fix or categories where it underperforms.",
    222           "source": "haiku"
    223         },
    224         "negative_results_reported": {
    225           "applies": true,
    226           "answer": false,
    227           "justification": "The ablation shows component contributions but the paper doesn't report negative results, cases where AlphaRepair generates incorrect plausible patches, or bug types where the approach fails.",
    228           "source": "haiku"
    229         }
    230       },
    231       "setup_transparency": {
    232         "model_versions_specified": {
    233           "applies": true,
    234           "answer": true,
    235           "justification": "The paper specifies CodeBERT (cited as [23] with full paper reference) and uses 'the pre-trained CodeBERT model' with 'directly reuse the model parameters of the pre-trained CodeBERT model.'",
    236           "source": "haiku"
    237         },
    238         "prompts_provided": {
    239           "applies": true,
    240           "answer": true,
    241           "justification": "The full input structure is described with formulas and Figures 3-5 showing exact tokenization structure, mask line templates (complete, partial, template strategies), and the comment encoding of the buggy line.",
    242           "source": "haiku"
    243         },
    244         "hyperparameters_reported": {
    245           "applies": true,
    246           "answer": true,
    247           "justification": "Beam width (25 for perfect FL, 5 for not perfect FL), max patches (5,000), max token length (512), and 5-hour timeout per bug are all explicitly stated in Section 4.2.",
    248           "source": "haiku"
    249         },
    250         "scaffolding_described": {
    251           "applies": false,
    252           "answer": false,
    253           "justification": "NA — this paper evaluates a direct MLM-based approach without agentic scaffolding.",
    254           "source": "haiku"
    255         },
    256         "data_preprocessing_documented": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "Section 3.1 documents input processing in detail: BBPE tokenization, context window sizing (expanding from buggy line to max 512 tokens), comment transformation of buggy line, and mask line generation strategies.",
    260           "source": "haiku"
    261         }
    262       },
    263       "data_integrity": {
    264         "raw_data_available": {
    265           "applies": true,
    266           "answer": true,
    267           "justification": "The Zenodo artifact [2] contains patches and data; Defects4J and QuixBugs are publicly available standard benchmarks with full raw data.",
    268           "source": "haiku"
    269         },
    270         "data_collection_described": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "The benchmarks (Defects4J, QuixBugs) are standard and well-described in their original papers; the paper follows prior work's setup with 391 bugs (removing 4 deprecated) and 82 single-line bugs from Defects4J 2.0.",
    274           "source": "haiku"
    275         },
    276         "recruitment_methods_described": {
    277           "applies": false,
    278           "answer": false,
    279           "justification": "NA — standard benchmark evaluation with no participant recruitment.",
    280           "source": "haiku"
    281         },
    282         "data_pipeline_documented": {
    283           "applies": true,
    284           "answer": true,
    285           "justification": "The full pipeline from buggy input → fault localization → mask generation → patch generation → re-ranking → validation is documented in detail across Sections 3.1–3.5 with formulas and figures.",
    286           "source": "haiku"
    287         }
    288       },
    289       "contamination": {
    290         "training_cutoff_stated": {
    291           "applies": true,
    292           "answer": false,
    293           "justification": "CodeBERT's training data is described as 'over 6 million code functions from open-source projects' but no training data collection cutoff date is stated for CodeBERT.",
    294           "source": "haiku"
    295         },
    296         "train_test_overlap_discussed": {
    297           "applies": true,
    298           "answer": true,
    299           "justification": "Section 6 explicitly quantifies overlap: 65/391 (16.6%) Defects4J 1.2 bugs and 9/82 (11.0%) Defects4J 2.0 bugs appear in CodeBERT training data; perturbation experiments are conducted on all 15 overlapping fixed bugs.",
    300           "source": "haiku"
    301         },
    302         "benchmark_contamination_addressed": {
    303           "applies": true,
    304           "answer": true,
    305           "justification": "For the 15 bugs where developer patches are in CodeBERT training data, the authors 'manually perturb the buggy code (change variable names, add empty while, if statements)' and confirm AlphaRepair still generates correct fixes, adequately addressing memorization concerns.",
    306           "source": "haiku"
    307         }
    308       },
    309       "human_studies": {
    310         "pre_registered": {
    311           "applies": false,
    312           "answer": false,
    313           "justification": "NA — no human participant study.",
    314           "source": "haiku"
    315         },
    316         "irb_or_ethics_approval": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "NA — no human participant study.",
    320           "source": "haiku"
    321         },
    322         "demographics_reported": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "NA — no human participant study.",
    326           "source": "haiku"
    327         },
    328         "inclusion_exclusion_criteria": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "NA — no human participant study.",
    332           "source": "haiku"
    333         },
    334         "randomization_described": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "NA — no human participant study.",
    338           "source": "haiku"
    339         },
    340         "blinding_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "NA — no human participant study.",
    344           "source": "haiku"
    345         },
    346         "attrition_reported": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "NA — no human participant study.",
    350           "source": "haiku"
    351         }
    352       },
    353       "cost_and_practicality": {
    354         "inference_cost_reported": {
    355           "applies": true,
    356           "answer": false,
    357           "justification": "Hardware specs are provided but no per-bug or total inference cost/latency breakdown is reported; only a 5-hour maximum timeout per bug is stated.",
    358           "source": "haiku"
    359         },
    360         "compute_budget_stated": {
    361           "applies": true,
    362           "answer": false,
    363           "justification": "The machine configuration is specified but total compute hours for the full evaluation are not reported.",
    364           "source": "haiku"
    365         }
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "AlphaRepair fixes 74 bugs on Defects4J 1.2 (perfect FL), outperforming all 18 baselines including TBar (68) and Recoder (65).",
    372       "evidence": "Table 1 shows AlphaRepair at 74/109 correct/plausible patches versus TBar 68/95 and Recoder 65/112.",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "AlphaRepair fixes 3.3X more bugs than the best baseline on Defects4J 2.0 (36 vs 11 by Recoder), demonstrating generalization beyond Defects4J 1.2.",
    377       "evidence": "Table 4 shows AlphaRepair at 36/50 correct/plausible versus Recoder 11/23 and TBar 8/25 on 82 single-line bugs.",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "Zero-shot APR avoids the dataset-overfitting issue of learning-based techniques that train on historical bug fixes.",
    382       "evidence": "Section 5.3.1 argues that the 3.3X improvement on Defects4J 2.0 (harder, newer bugs) shows AlphaRepair generalizes better than trained baselines; also perturbation experiments on overlapping bugs confirm no simple memorization.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Patch re-ranking with joint scores improves average correct patch rank from 612th to 418th (31.7% reduction).",
    387       "evidence": "Figure 8 and Section 5.2 report these averages; 61 of 74 correct patches are ranked higher after re-ranking.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "AlphaRepair achieves state-of-the-art results on QuixBugs in both Java (28 correct) and Python (27 correct).",
    392       "evidence": "Table 5 shows AlphaRepair outperforms CURE (26 Java), DeepDebug (21 Python), Recoder (17 Java), and CoCoNuT (13 Java, 19 Python).",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "Template mask contributes the highest single component gain (+21 correct patches) in the ablation study.",
    397       "evidence": "Table 3 shows template mask adds the most correct patches of all components; it targets conditional and method invocation patterns which are common bug types.",
    398       "supported": "strong"
    399     }
    400   ],
    401   "methodology_tags": [
    402     "benchmark-eval"
    403   ],
    404   "key_findings": "AlphaRepair introduces a cloze-style formulation for automated program repair that uses CodeBERT's masked language modeling objective directly, without any fine-tuning on bug-fixing datasets, achieving state-of-the-art results on Defects4J 1.2 (74 correct fixes), Defects4J 2.0 (3.3X over best baseline), and QuixBugs Java and Python. The zero-shot approach avoids three key limitations of learning-based APR: noisy training data, limited training data quantity, and poor context representation. The paper establishes that pre-trained code models can serve as general-purpose repair engines when repurposed as cloze tasks, with a probabilistic patch re-ranking strategy that improves average correct patch position by 31.7%.",
    405   "red_flags": [
    406     {
    407       "flag": "Single-line fix only",
    408       "detail": "AlphaRepair is restricted to single-line bug fixes, evaluating only 82 of 438 bugs in Defects4J 2.0 — this significantly limits real-world applicability and cherry-picks the easiest benchmark subset."
    409     },
    410     {
    411       "flag": "No statistical testing",
    412       "detail": "All comparative claims are based on raw bug counts with no significance tests, confidence intervals, or effect size uncertainty quantification across 18 baselines."
    413     },
    414     {
    415       "flag": "Baselines from prior papers",
    416       "detail": "Most baseline results are taken from prior publications rather than re-run under identical hardware and timeout conditions; acknowledged as a threat but not resolved."
    417     },
    418     {
    419       "flag": "No failure case analysis",
    420       "detail": "The paper only presents bugs AlphaRepair uniquely fixes; there is no systematic analysis of failure modes, bug types that are difficult for the approach, or plausible-but-incorrect patch rates."
    421     },
    422     {
    423       "flag": "Scale confound",
    424       "detail": "AlphaRepair uses CodeBERT trained on 6M+ code functions versus baselines trained on smaller bug-fix datasets; the paper does not disentangle whether gains come from zero-shot formulation or simply from training data scale."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "A Syntax-Guided Edit Decoder for Neural Program Repair (Recoder)",
    430       "relevance": "Primary learning-based APR baseline; outperformed by AlphaRepair on all benchmarks"
    431     },
    432     {
    433       "title": "CURE: Code-Aware Neural Machine Translation for Automatic Program Repair",
    434       "relevance": "Key learning-based baseline that pre-trains on code before fine-tuning on bug fixes; directly compared"
    435     },
    436     {
    437       "title": "CoCoNuT: Combining Context-Aware Neural Translation Models Using Ensemble for Program Repair",
    438       "relevance": "Learning-based baseline for both Java and Python repair; compared on QuixBugs"
    439     },
    440     {
    441       "title": "TBar: Revisiting Template-Based Automated Program Repair",
    442       "relevance": "Best-performing traditional baseline on Defects4J 1.2 (68 fixes); directly re-run on Defects4J 2.0"
    443     },
    444     {
    445       "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages",
    446       "relevance": "Foundation model underlying AlphaRepair; the MLM pre-training objective is repurposed for zero-shot APR"
    447     },
    448     {
    449       "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs",
    450       "relevance": "Primary evaluation benchmark providing reproducible Java bugs across 6 projects"
    451     },
    452     {
    453       "title": "DLFix: Context-Based Code Transformation Learning for Automated Program Repair",
    454       "relevance": "Learning-based APR baseline using tree-based encoding; compared on Defects4J 1.2"
    455     },
    456     {
    457       "title": "QuixBugs: A Multi-Lingual Program Repair Benchmark Set Based on the Quixey Challenge",
    458       "relevance": "Multilingual benchmark used to demonstrate Java and Python repair capability"
    459     },
    460     {
    461       "title": "Practical Program Repair via Bytecode Mutation (PraPR)",
    462       "relevance": "Template-based baseline; shows mutation-based traditional APR performance on Defects4J 1.2"
    463     },
    464     {
    465       "title": "SequenceR: Sequence-to-Sequence Learning for End-to-End Program Repair",
    466       "relevance": "Early learning-based APR baseline using LSTM encoder-decoder; 0 unique fixes on Defects4J 1.2"
    467     }
    468   ],
    469   "engagement_factors": {
    470     "practical_relevance": {
    471       "score": 3,
    472       "justification": "APR tools directly save developer debugging time; AlphaRepair is released on Zenodo and works on standard Java/Python benchmarks without retraining."
    473     },
    474     "surprise_contrarian": {
    475       "score": 2,
    476       "justification": "Challenges the prevailing assumption that APR requires fine-tuning on bug-fixing datasets; demonstrates that zero-shot outperforms trained models on harder evaluation sets."
    477     },
    478     "fear_safety": {
    479       "score": 0,
    480       "justification": "No AI safety or risk implications; purely a software engineering tool evaluation."
    481     },
    482     "drama_conflict": {
    483       "score": 1,
    484       "justification": "Implicitly challenges the work of learning-based APR researchers who spent significant effort collecting bug-fix training data, but the tone is constructive rather than adversarial."
    485     },
    486     "demo_ability": {
    487       "score": 2,
    488       "justification": "Tool is released on Zenodo with code; users can run it on Defects4J bugs with the standard setup, though the environment setup is non-trivial."
    489     },
    490     "brand_recognition": {
    491       "score": 1,
    492       "justification": "UIUC is a strong CS program and Lingming Zhang is a known APR researcher, but neither the authors nor the institution is a household name among AI labs."
    493     }
    494   },
    495   "hn_data": {
    496     "threads": [
    497       {
    498         "hn_id": "32333565",
    499         "title": "Why do tree-based models still outperform deep learning on tabular data?",
    500         "points": 315,
    501         "comments": 139,
    502         "url": "https://news.ycombinator.com/item?id=32333565"
    503       },
    504       {
    505         "hn_id": "39601710",
    506         "title": "Why do tree-based models still outperform deep learning on tabular data? (2022)",
    507         "points": 212,
    508         "comments": 111,
    509         "url": "https://news.ycombinator.com/item?id=39601710"
    510       },
    511       {
    512         "hn_id": "32213240",
    513         "title": "Why do tree-based models still outperform deep learning on tabular data?",
    514         "points": 3,
    515         "comments": 0,
    516         "url": "https://news.ycombinator.com/item?id=32213240"
    517       },
    518       {
    519         "hn_id": "38594085",
    520         "title": "Forward Laplacian: Computational Framework for NN-Based Variational Monte Carlo",
    521         "points": 3,
    522         "comments": 0,
    523         "url": "https://news.ycombinator.com/item?id=38594085"
    524       },
    525       {
    526         "hn_id": "31627937",
    527         "title": "Bankrupting DoS Attackers Despite Uncertainty",
    528         "points": 3,
    529         "comments": 0,
    530         "url": "https://news.ycombinator.com/item?id=31627937"
    531       },
    532       {
    533         "hn_id": "44596069",
    534         "title": "Fluid dynamics of a liquid mirror space telescope",
    535         "points": 2,
    536         "comments": 0,
    537         "url": "https://news.ycombinator.com/item?id=44596069"
    538       },
    539       {
    540         "hn_id": "32357646",
    541         "title": "On-the-Fly Syntax Highlighting Using Neural Networks",
    542         "points": 1,
    543         "comments": 1,
    544         "url": "https://news.ycombinator.com/item?id=32357646"
    545       },
    546       {
    547         "hn_id": "47328039",
    548         "title": "Inverse Occam's Razor",
    549         "points": 1,
    550         "comments": 0,
    551         "url": "https://news.ycombinator.com/item?id=47328039"
    552       },
    553       {
    554         "hn_id": "31261728",
    555         "title": "Designing Word Filter Tools for Creator-Led Comment Moderation",
    556         "points": 1,
    557         "comments": 0,
    558         "url": "https://news.ycombinator.com/item?id=31261728"
    559       }
    560     ],
    561     "top_points": 315,
    562     "total_points": 541,
    563     "total_comments": 251
    564   }
    565 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs