scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27991B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Evaluating Diverse Large Language Models for Automatic and General Bug Reproduction",
      6     "authors": [
      7       "Sungmin Kang",
      8       "Juyeon Yoon",
      9       "Nargiz Askarbekkyzy",
     10       "Shin Yoo"
     11     ],
     12     "year": 2023,
     13     "venue": "IEEE Transactions on Software Engineering",
     14     "arxiv_id": "2311.04532",
     15     "doi": "10.1109/TSE.2024.3450837"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "All abstract claims are verified in the body: 33.5% Defects4J reproduction (Table 5), StarCoder at 70% Codex performance (RQ4-1), 90% on GHRB holdout (RQ4-2), and size-scaling trend (Section 6.4.4).",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Claims that natural-language fine-tuning hurts performance and that model size improves reproduction are tested by comparing same-family models (StarCoder vs StarCoderPlus, Incoder-1B vs 6B, CodeGen2 family), isolating the variable of interest.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The title claims 'General Bug Reproduction' but all experiments are on Java projects (Defects4J, GHRB); the paper notes Checkstyle limitations but never explicitly bounds claims to Java or acknowledges the language restriction in conclusions.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "The ChatGPT behavior-change finding explicitly considers alternative explanations (model degradation vs. prompt format change, Section 6.4.3); data leakage concerns are addressed via the GHRB holdout design.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "BRT is precisely defined as a test that fails on the buggy version and passes on the fixed version; the paper uses this metric consistently and does not conflate it with broader notions of test quality.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "There is no dedicated limitations or threats-to-validity section; limitations are scattered across Section 3.1 (data leakage), Section 6.4.3 (reproducibility of OpenAI models), and Section 7.1 (failure case discussion).",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Threats are discussed informally (e.g., Defects4J contamination, Checkstyle external-file dependency) but never in a structured threats-to-validity format; no discussion of construct validity or external validity beyond Java projects.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper never explicitly states that results apply only to Java or only to projects with self-contained test suites; scope constraints are implied by the experimental setup but not formally bounded in conclusions.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding acknowledgment appears anywhere in the paper; the authors are affiliated with KAIST but no grant or sponsor is disclosed.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All four authors are identified as affiliated with KAIST (Korea Advanced Institute of Science and Technology) in the author block.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funding is disclosed, so independence cannot be assessed.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) appears in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "BRT (Bug Reproducing Test) is precisely defined in Section 4.2 (fails on buggy version, passes on fixed); FIB (Fail In the Buggy program) is defined in Section 3.4; LIBRO acronym is introduced in Section 3.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section 1 explicitly enumerates the new contributions of this extension over the prior ICSE paper: large-scale LLM comparison, GPU-memory tradeoff analysis, model-size analysis, ChatGPT behavior change, and temperature study.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Sections 8.1 and 8.2 provide detailed related work on test generation and code synthesis, contextualizing LIBRO relative to EvoCrash, Yakusu, ReCDroid, AdbGPT, CODAMOSA, and others; baseline comparison is implemented rather than only cited.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "Two GitHub repositories are provided: https://github.com/coinse/libro (tool) and https://github.com/coinse/libro-journal-artifact (replication package), both mentioned in Sections 4.3 and 5.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Defects4J v2.0 is a public benchmark; the GHRB dataset is released in the artifact repository; the replication package is explicitly stated to be publicly available.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Environment is described in prose (Ubuntu 18.04/20.04, specific CPU/GPU specs, Python 3.9, javalang library) but no requirements.txt, Dockerfile, or equivalent machine-readable spec is mentioned.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "The paper references the artifact repository but provides no step-by-step reproduction instructions within the paper itself; reproducing the 8-month GPU run would require considerable inference from the artifact.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Figure 3 shows confidence intervals for the simulation of generation attempts, but main comparison tables (Tables 4, 5, 7, 9, 10) report single point estimates with no CIs or error bars.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No statistical significance tests are used for any comparative claims (e.g., LIBRO vs EvoCrash, Codex vs StarCoder); differences are reported as raw counts without p-values or non-parametric tests.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Effect sizes are reported as percentage of Codex performance (e.g., StarCoder at 70% on Defects4J, 90% on GHRB) with baseline context provided, enabling practical interpretation.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The 750-bug Defects4J sample and 31-bug GHRB sample are used as-is from available benchmarks without power analysis or justification for statistical adequacy, particularly for the small GHRB dataset.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "Main reproduction counts in Tables 4, 5, 6, 8, 9 are single values; variance across LLM sampling runs is not reported for the primary results, though Figure 3 shows distribution for one specific analysis.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Two baselines are included: EvoCrash (state-of-the-art crash reproduction) and a Copy&Paste baseline that directly uses code snippets from bug reports.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "EvoCrash is described as state-of-the-art; the paper acknowledges it only handles crash bugs (a known limitation), making the comparison fair given the different scope.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Table 4 systematically ablates prompt components (no example, one example, within-project examples, constructor info, stack traces, number of examples, n samples), isolating each contribution's effect.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Multiple metrics are used: bugs reproduced (absolute and proportion), ROC-AUC for selection, acc@n and wef@n for ranking, and precision for selection threshold analysis.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "Human evaluation is not relevant; the BRT definition (fail on buggy, pass on fixed) provides an objective, automatic oracle that is appropriate for this task.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "GHRB is a held-out dataset constructed from GitHub PRs created after the Codex training data cutoff, explicitly designed to test generalization beyond training data.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Table 5 provides per-project breakdown across 17 Defects4J projects; Table 8 provides per-project breakdown for GHRB; RQ4 provides per-LLM breakdown.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Section 7.1 provides a detailed failure case analysis (Checkstyle Issue #11365, Listing 5) explaining why LIBRO failed and what future work could address.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Multiple negative results are reported: within-project examples hurt performance (Table 4); natural language fine-tuning degrades performance (StarCoderPlus, BloomZ); ChatGPT-0613 initially failed due to output format change.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Table 3 lists all 15 LLMs with exact version names (gpt-3.5-turbo-0301, gpt-3.5-turbo-0613, code-davinci-002, StarCoder-15B, etc.) and their parameters, release years, and accessibility.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Listing 1 shows the exact prompt format with a concrete example (MATH-370 bug report); the template structure including the 'public void test' suffix is described in detail in Section 3.1.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Temperature (0.7 default, varied in RQ4-5), maximum tokens (256), and number of samples (n=10 or 50) are all reported; temperature sweep covers 0.0, 0.2, 0.4, 0.6, 0.7, 0.8, 1.0.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "The LIBRO pipeline is described in full detail across Sections 3.1–3.4 with Figure 1 overview, Algorithm 1 (test postprocessing), and Algorithm 2 (selection and ranking with precise pseudocode).",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Defects4J filtering criteria are documented (750 from 814, excluding 58 poorly mapped and 6 with directory issues); GHRB construction is documented step-by-step (970→550→300→435→84→31 bugs).",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "The paper states 'We make our experimental data and analysis scripts publicly available' with a link to the artifact repository in Section 1.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "GHRB collection procedure is described in detail: 17 manually chosen GitHub repositories, PR filtering criteria (post-cutoff, test-adding, merged, single-issue), and BRT verification (fail pre-merge, pass post-merge).",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants; standard benchmark and GitHub repository mining used.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The full pipeline from bug report → LIBRO prompt → LLM output → test injection (Algorithm 1) → selection/ranking (Algorithm 2) → evaluation is documented with pseudocode and example outputs.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": true,
    295           "justification": "Codex training cutoff is referenced (GHRB PRs collected after July 2022 cutoff); StarCoder's training dataset (Stack) is identified and its membership test tool is cited.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": true,
    301           "justification": "Section 4.1 explicitly discusses that Defects4J is 'likely in most code-based LLM training data' citing Lee et al., and that StarCoder's pretraining included Defects4J reproducing tests specifically.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": true,
    307           "justification": "Contamination is directly addressed by constructing GHRB with post-cutoff PRs and verifying via StarCoder's dataset membership test that GHRB tests are not in the Stack training dataset.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": true,
    359           "justification": "Table 6 reports API query time (5.85s/query), processing time, and total time (444s for 50-test run); Section 1 reports the full study required 8+ months of GPU time and 7 months of CPU time.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": true,
    365           "justification": "Section 1 explicitly states 'more than eight months of GPU time and seven months of CPU time'; Figure 7 reports GPU memory consumption per model for practitioner guidance.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "LIBRO with code-davinci-002 reproduces 33.5% (251/750) of bugs in Defects4J using 50 test generation attempts.",
    374       "evidence": "Table 5 and Section 6.1.1 with per-project breakdown across 17 projects.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "StarCoder achieves 70% of Codex performance on Defects4J and 90% on the GHRB holdout dataset.",
    379       "evidence": "Section 6.4.1 (Figure 6a: 125 vs 173 bugs) and Section 6.4.2 (Figure 6b with 50-test evaluation on GHRB).",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "LIBRO generalizes beyond training data, achieving 32.2% reproduction on GHRB (post-cutoff bugs not in training data).",
    384       "evidence": "Section 6.3.1, Table 8; GHRB verified not in StarCoder Stack dataset via membership test.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Bug reproduction performance increases logarithmically with number of test generation attempts with no plateau.",
    389       "evidence": "Figure 3 based on 1,000-run simulation resampling from 50 generated tests per bug.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Fine-tuning code LLMs on natural language hurts bug reproduction performance.",
    394       "evidence": "StarCoderPlus (natural-language fine-tuned) substantially underperforms StarCoder; BloomZ underperforms Bloom (Section 6.4.1, Figure 6a).",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "LIBRO's self-consistency selection achieves ROC-AUC of 0.82, placing a BRT first for 43% of selected bugs.",
    399       "evidence": "Figure 4 (ROC curve) and Table 7 (acc@1 = 149 out of 350 selected bugs = 43%).",
    400       "supported": "strong"
    401     },
    402     {
    403       "claim": "The ChatGPT behavior change observed by Chen et al. was due to prompt format change, not model degradation.",
    404       "evidence": "Section 6.4.3 and Table 9: GPT-0613 recovered to 168 bugs with modified prompt (vs. 72 with original), matching GPT-0301's 164.",
    405       "supported": "moderate"
    406     }
    407   ],
    408   "methodology_tags": [
    409     "benchmark-eval",
    410     "empirical"
    411   ],
    412   "key_findings": "LIBRO reproduces 33.5% of Defects4J Java bugs by prompting LLMs and using self-consistency-based selection/ranking, substantially outperforming EvoCrash (crash-only baseline). Open-source StarCoder achieves 70–90% of Codex performance depending on dataset, demonstrating open-source LLMs are viable alternatives. Performance scales logarithmically with number of generation attempts and positively with model size, with a potential emergent jump in the CodeGen2 family at 7B parameters. The ChatGPT 'performance degradation' observed in prior work is shown to be a prompt-format artifact rather than genuine model decline.",
    413   "red_flags": [
    414     {
    415       "flag": "No statistical significance testing",
    416       "detail": "All comparative claims (LIBRO vs. baselines, LLM comparisons) are reported as raw counts without p-values or non-parametric tests, making it impossible to distinguish meaningful differences from noise."
    417     },
    418     {
    419       "flag": "GHRB holdout is very small",
    420       "detail": "Only 31 bugs total, 10 reproduced — results reported for individual projects (e.g., 0/2 for Jackson, 0/13 for Checkstyle) are statistically meaningless at this granularity."
    421     },
    422     {
    423       "flag": "Java-only generalization gap",
    424       "detail": "All experiments use Java projects (Defects4J, GHRB); the paper's title and conclusions claim 'general' bug reproduction without acknowledging the language restriction."
    425     },
    426     {
    427       "flag": "No variance on main results",
    428       "detail": "Main reproduction counts are single-run values; given LLM sampling stochasticity, the same run repeated with different random seeds could yield different totals, but no variance is reported."
    429     },
    430     {
    431       "flag": "Codex inaccessible at publication",
    432       "detail": "The best-performing model (code-davinci-002) was discontinued by OpenAI before the journal extension was published, limiting the reproducibility of the headline result."
    433     }
    434   ],
    435   "cited_papers": [
    436     {
    437       "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs",
    438       "relevance": "Primary benchmark used for evaluation; ground truth for bug reproduction"
    439     },
    440     {
    441       "title": "Large language models are few-shot testers: Exploring LLM-based general bug reproduction",
    442       "relevance": "Prior conference paper that this work extends; provides Codex baseline"
    443     },
    444     {
    445       "title": "StarCoder: may the source be with you!",
    446       "relevance": "Best-performing open-source LLM in the evaluation; training data details critical for contamination analysis"
    447     },
    448     {
    449       "title": "Evaluating large language models trained on code (Codex)",
    450       "relevance": "Introduces code-davinci-002, the best-performing LLM in experiments"
    451     },
    452     {
    453       "title": "Self-consistency improves chain of thought reasoning in language models",
    454       "relevance": "Theoretical basis for LIBRO's selection mechanism using output cluster agreement"
    455     },
    456     {
    457       "title": "The GitHub Recent Bugs Dataset for evaluating LLM-based debugging applications",
    458       "relevance": "Introduces GHRB holdout dataset; provides evidence that Defects4J tests are in StarCoder training data"
    459     },
    460     {
    461       "title": "Single-objective versus multi-objectivized optimization for evolutionary crash reproduction (EvoCrash)",
    462       "relevance": "Primary baseline for comparison; state-of-the-art crash reproduction technique"
    463     },
    464     {
    465       "title": "How is ChatGPT's behavior changing over time?",
    466       "relevance": "Prior work whose conclusions are challenged; motivates ChatGPT temporal analysis in RQ4-3"
    467     }
    468   ],
    469   "engagement_factors": {
    470     "practical_relevance": {
    471       "score": 3,
    472       "justification": "Open-source tool immediately usable by developers; GPU-memory tradeoff chart directly guides practitioner LLM selection decisions."
    473     },
    474     "surprise_contrarian": {
    475       "score": 2,
    476       "justification": "Counterintuitively shows natural-language fine-tuning hurts code LLMs; challenges Chen et al.'s ChatGPT degradation narrative by attributing it to prompt format."
    477     },
    478     "fear_safety": {
    479       "score": 0,
    480       "justification": "No AI safety or risk concerns; the paper addresses software testing automation."
    481     },
    482     "drama_conflict": {
    483       "score": 1,
    484       "justification": "Mild controversy in rebutting Chen et al.'s ChatGPT degradation claim; highlights reproducibility risks of building on closed-source API models."
    485     },
    486     "demo_ability": {
    487       "score": 3,
    488       "justification": "Tool is publicly available at github.com/coinse/libro and can be applied to any Java project with bug reports immediately."
    489     },
    490     "brand_recognition": {
    491       "score": 1,
    492       "justification": "KAIST is a respected institution but not a top-tier AI lab; no involvement from OpenAI, Google, Meta, or similar recognized AI brands."
    493     }
    494   },
    495   "hn_data": {
    496     "threads": [
    497       {
    498         "hn_id": "38283398",
    499         "title": "API-Driven Program Synthesis for Testing Static Typing Implementations",
    500         "points": 35,
    501         "comments": 1,
    502         "url": "https://news.ycombinator.com/item?id=38283398",
    503         "created_at": "2023-11-15T22:19:08Z"
    504       },
    505       {
    506         "hn_id": "42158451",
    507         "title": "Convolutional Differentiable Logic Gate Networks",
    508         "points": 26,
    509         "comments": 4,
    510         "url": "https://news.ycombinator.com/item?id=42158451",
    511         "created_at": "2024-11-16T19:10:54Z"
    512       },
    513       {
    514         "hn_id": "39967245",
    515         "title": "Formal Aspects of Language Modeling",
    516         "points": 4,
    517         "comments": 0,
    518         "url": "https://news.ycombinator.com/item?id=39967245",
    519         "created_at": "2024-04-08T07:47:56Z"
    520       },
    521       {
    522         "hn_id": "42115169",
    523         "title": "Convolutional Differentiable Logic Gate Networks",
    524         "points": 3,
    525         "comments": 0,
    526         "url": "https://news.ycombinator.com/item?id=42115169",
    527         "created_at": "2024-11-12T13:04:29Z"
    528       },
    529       {
    530         "hn_id": "34101211",
    531         "title": "Will we run out of data?",
    532         "points": 3,
    533         "comments": 0,
    534         "url": "https://news.ycombinator.com/item?id=34101211",
    535         "created_at": "2022-12-23T01:17:13Z"
    536       },
    537       {
    538         "hn_id": "25056202",
    539         "title": "Learning Autocompletion from Real-World Datasets",
    540         "points": 3,
    541         "comments": 0,
    542         "url": "https://news.ycombinator.com/item?id=25056202",
    543         "created_at": "2020-11-11T07:17:33Z"
    544       },
    545       {
    546         "hn_id": "40939773",
    547         "title": "Formal Aspects of Language Modeling",
    548         "points": 2,
    549         "comments": 0,
    550         "url": "https://news.ycombinator.com/item?id=40939773",
    551         "created_at": "2024-07-11T19:30:45Z"
    552       },
    553       {
    554         "hn_id": "42258010",
    555         "title": "Gradient Boosting Trees and LLMs for Tabular Data Few-Shot Learning",
    556         "points": 2,
    557         "comments": 0,
    558         "url": "https://news.ycombinator.com/item?id=42258010",
    559         "created_at": "2024-11-27T17:46:47Z"
    560       },
    561       {
    562         "hn_id": "36985212",
    563         "title": "Will we run out of data to train LLMs?",
    564         "points": 2,
    565         "comments": 0,
    566         "url": "https://news.ycombinator.com/item?id=36985212",
    567         "created_at": "2023-08-03T12:53:23Z"
    568       },
    569       {
    570         "hn_id": "40610622",
    571         "title": "Will we run out of data? Limits of LLM scaling based on human-generated data",
    572         "points": 1,
    573         "comments": 1,
    574         "url": "https://news.ycombinator.com/item?id=40610622",
    575         "created_at": "2024-06-07T17:08:29Z"
    576       }
    577     ],
    578     "top_points": 35,
    579     "total_points": 81,
    580     "total_comments": 6
    581   }
    582 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs