scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28155B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "How Efficient is LLM-Generated Code? A Rigorous & High-Standard Benchmark",
      6     "authors": [
      7       "Ruizhong Qiu",
      8       "Weiliang Will Zeng",
      9       "James Ezick",
     10       "Christopher Lott",
     11       "Hanghang Tong"
     12     ],
     13     "year": 2024,
     14     "venue": "International Conference on Learning Representations",
     15     "arxiv_id": "2406.06647",
     16     "doi": "10.48550/arXiv.2406.06647"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All abstract claims are backed by the paper: the eff@k metric is formally derived (§2), Rao-Blackwellization is proven in Theorem 1, expert reference solutions are described in §3.2, and evaluation of 30 LLMs with GPT-4 eff@1=0.454 is shown in Table 3.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper claims LLMs underperform 'because' they struggle with algorithm design and implementation optimization, but this is inferred from subset performance differences rather than controlled experiments isolating the causal mechanism.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The paper scopes conclusions to 142 HumanEval-derived Python problems and the 30 evaluated LLMs; limitations in Appendix D explicitly note the benchmark covers standalone problems, not complex multi-file software development.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper explores prompting as an alternative explanation (Appendix C.8) but does not seriously consider whether benchmark design choices (problem set, time limits, single expert) could explain observed LLM deficiencies.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper clearly distinguishes eff@k (execution time efficiency against expert reference) from pass@k (functional correctness) and explicitly discusses what the metric measures vs. what is claimed, including limitations of measuring only time efficiency.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Appendix D.2 provides a dedicated 'Other Limitations & Future Work' section listing multiple specific limitations including standalone problem scope, unverifiable reference optimality, time-only efficiency focus, and lack of advanced prompting techniques.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats are named: reference solutions may not be provably optimal (efficiency score can exceed 1.0), only standalone problems are studied (not real-world multi-file code), and single expert involvement is acknowledged as a scalability concern.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly states it does NOT address space efficiency, complex software projects, or automatic time complexity measurement, and Appendix D.1 explains why crowd-sourcing and online judge solutions were excluded.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Acknowledgments disclose NSF (2134079), NIFA (2020-67021-32799), and IBM-Illinois Discovery Accelerator Institute as funding sources.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations with UIUC and Qualcomm AI Research are disclosed on the title page; the Qualcomm footnote clarifies that datasets were evaluated at UIUC.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "NSF and NIFA are government agencies independent of LLM outcomes; IBM funds the lab but does not make models evaluated in the paper; Qualcomm authors work on the benchmark methodology, not on evaluated models.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is present in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are formally defined: eff@k is defined in §2.3 (Eq. 5-6), 'right-censored execution time' is explained with reference to statistics literature, and 'algorithm design' vs. 'implementation optimization' are distinguished with concrete examples.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The introduction lists four explicit contributions: (1) eff@k metric with unbiased estimator, (2) expert reference solutions, (3) strong test case generators, and (4) evaluation of 30 LLMs.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper identifies five specific methodological gaps (C1-C5) in prior work (Niu et al., Huang et al., Du et al.) and shows empirically how prior metrics fail (Table 7 speedup comparison, Table 8 random test case comparison).",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The abstract explicitly states 'Our benchmark is publicly available at https://github.com/q-rz/enamel.'",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The benchmark problems (derived from HumanEval/HumanEval+), expert reference solutions, and test case generators are released on GitHub; base HumanEval and HumanEval+ are standard public benchmarks.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Appendix C.1 mentions Ubuntu 20.04.6 LTS, Intel Xeon CPU @ 2.20GHz, Python 3.10.12, and 8 NVIDIA A100 80GB GPUs, but no requirements.txt, Dockerfile, or formal dependency specification is provided.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The paper does not include step-by-step instructions for reproducing results; the GitHub repository may contain them but they are not described in the paper text.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Main results tables (Table 3, Table 4) report only point estimates for eff@k and pass@k without confidence intervals or error bars; Table 11 reports standard deviation only for the estimator comparison, not for LLM comparisons.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied to comparative claims between LLMs; rankings and performance differences are presented without p-values or hypothesis tests.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "The eff@k values themselves serve as effect sizes against the expert reference (1.000); GPT-4 Turbo achieving eff@1=0.470 vs expert level quantifies the gap explicitly.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The 142 problems are explained by problem selection criteria (excluding O(1) trivial problems) and the 100-200 code samples per problem by computational constraints, but no statistical power analysis justifies these numbers.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Standard deviations are reported only for the estimator comparison (Table 11); no variance across runs or problems is reported for the main LLM evaluation results.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Table 2 compares ENAMEL against HumanEval and HumanEval+ canonical solutions showing eff@1=0.455 and 0.513 respectively vs. 1.000 for ENAMEL; Table 9 compares against EffiBench and Mercury.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Compared LLMs include GPT-4 Turbo, Claude 3 Opus/Sonnet/Haiku, and Llama 3 (all 2024 models); efficiency benchmark comparisons include EffiBench and Mercury (both 2024).",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Appendix C.6 ablates hyperparameters (timeout factor α, level hardnesses h1, h2, h3); C.3 compares eff@k vs. the speedup metric; C.4 compares expert vs. random test case generators.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Both eff@k and pass@k are reported at multiple sample sizes k=1, 10, 100 for all models, providing both correctness and efficiency perspectives.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "Human evaluation of LLM outputs is not applicable; the paper uses automated execution-time-based evaluation against expert reference solutions.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": false,
    211           "answer": false,
    212           "justification": "This is a benchmark evaluation study, not a prediction task with train/test splits; the fixed problem set is the evaluation target.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Table 4 provides separate results for an Algorithm Design Subset (20 hard problems) and an Implementation Optimization Subset (75 problems); Figure 2 shows per-problem difficulty distribution.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Appendix C.8 provides detailed case studies of LLM failures on problem #36, showing that both Self-Refine and explicit algorithm hints fail to elicit O(log n) solutions from Llama 3 and Mixtral.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Table 12 shows that encouraging LLMs to generate 'the most efficient algorithm' barely changes eff@1 (e.g., Llama 3 70B drops from 0.421 to 0.418), and Appendix C.8 demonstrates Self-Refine fails to improve efficiency.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Appendix C.1 lists exact API snapshot IDs: claude-3-opus-20240229, claude-3-sonnet-20240229, claude-3-haiku-20240307, gpt-4-1106-preview, gpt-4-0613.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "The paper states prompts from HumanEval+ are reused but does not reproduce the actual prompt text; only the prompt engineering variants in Appendix C.8 are shown, not the baseline prompts.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Appendix C.1 reports all evaluation hyperparameters: α=2, R=6, h1=h2=3, h3=4, M0=8, M1=M2=M3=4, temperature=0.8, top_p=0.95, and sample sizes (100 or 200 per problem).",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used; models generate code directly from problem prompts without multi-step reasoning pipelines.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 3.1 documents the problem selection process: starting from 164 HumanEval problems, excluding 22 with Θ(1) complexity for two documented reasons, yielding 142 problems.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "The benchmark (problems, reference solutions, test case generators) is publicly available at GitHub; the benchmark constitutes the primary data artifact.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 3 describes benchmark development in detail: problem selection criteria (§3.1), human expert algorithm design and implementation optimization process (§3.2), and test case generator curation methodology (§3.3).",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No participant recruitment is involved; the 'human expert' is a benchmark developer, not a study participant.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The full evaluation pipeline is documented: code generation (temperature, sample size, model API), level-based execution (test cases, time limits, scoring formula), and eff@k computation (Algorithm 1).",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "Training data cutoffs for evaluated models are not stated; the paper treats models as black boxes and does not report when their training data was collected.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": true,
    302           "justification": "Appendix D.1 explicitly discusses contamination: online judge solutions were excluded because 'their public solutions are already in LLMs' pretraining corpuses,' and expert-written solutions are novel.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": true,
    308           "justification": "Table 9 notes that ENAMEL is harder than EffiBench/Mercury because 'LLMs have seen the public solutions on LeetCode but have never seen our expert-written efficient solutions'; this directly addresses contamination.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "The paper mentions financial constraints motivated sampling size choices but does not report actual inference costs or latency figures for any model.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Hardware (8 NVIDIA A100 80GB GPUs, Google Cloud VMs) is mentioned but no total compute budget (GPU-hours, cost) is stated.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Most existing code generation benchmarks focus on functional correctness and overlook efficiency.",
    375       "evidence": "Related work section and C1-C5 challenge framing document that HumanEval, MBPP, APPS, EvalPlus, and others measure only correctness; only a few recent benchmarks (EffiBench, Mercury) address efficiency.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "The eff@k estimator is unbiased and has provably lower variance than naive estimation.",
    380       "evidence": "Theorem 1 provides formal proof of unbiasedness and variance reduction via Rao-Blackwellization; Table 11 empirically confirms standard deviation drops from 0.20 to 0.02 at k=1.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "LLMs fall far short of expert-level efficient code; even GPT-4 achieves only eff@1=0.454 against expert reference (1.000).",
    385       "evidence": "Table 3 shows eff@k for 30 LLMs; GPT-4 Turbo tops at eff@1=0.470, with most models below 0.3; eff@k is consistently much lower than pass@k across all models.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "LLMs that generate correct code are not necessarily equally good at efficient code (pass@k rank ≠ eff@k rank).",
    390       "evidence": "Table 3 shows GPT-4 Turbo has higher eff@1=0.470 but lower pass@1=0.796 than GPT-4 (pass@1=0.831, eff@1=0.454); Table 7 shows speedup metric inverts rankings further.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "LLMs struggle in designing advanced algorithms; ChatGPT achieves only eff@100=0.483 on the 20-problem algorithm design subset.",
    395       "evidence": "Table 4 shows all models have substantially lower eff@k on the Algorithm Design Subset; case studies in Appendix C.8 show models fail even when the efficient algorithm is explicitly revealed.",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "Prompt engineering (encouraging efficiency or Self-Refine) cannot significantly improve code efficiency.",
    400       "evidence": "Table 12 shows Llama 3 70B drops from eff@1=0.421 to 0.418 with efficiency prompting; Table 13 shows both LLMs remain at O(n) after Self-Refine on problem #36.",
    401       "supported": "moderate"
    402     },
    403     {
    404       "claim": "HumanEval and HumanEval+ canonical solutions are themselves inefficient by expert standards (eff@1=0.455 and 0.513 respectively).",
    405       "evidence": "Table 2 directly compares canonical solution performance under ENAMEL evaluation; Table 1 shows several canonical solutions have sub-optimal time complexity.",
    406       "supported": "strong"
    407     }
    408   ],
    409   "methodology_tags": [
    410     "benchmark-eval"
    411   ],
    412   "key_findings": "ENAMEL, a new benchmark of 142 Python problems with expert-written efficient reference solutions and strong test case generators, reveals that all 30 evaluated LLMs fall dramatically short of expert-level code efficiency. The strongest commercial model (GPT-4 Turbo) achieves only eff@1=0.470 against expert reference, with the gap widening specifically on algorithm design problems (ChatGPT eff@100=0.483 on a hard-problem subset). A key methodological contribution is the eff@k metric, which properly handles right-censored execution times via a Rao-Blackwellized estimator with provably lower variance, correcting a systematic overestimation flaw in prior efficiency metrics. Prompt engineering (including encouraging efficiency and Self-Refine) provides negligible improvement, suggesting the deficiency reflects fundamental capability limitations rather than prompting artifacts.",
    413   "red_flags": [
    414     {
    415       "flag": "Single-expert benchmark construction",
    416       "detail": "All reference solutions and test case generators were created by a single human expert, introducing potential idiosyncratic bias in what constitutes 'optimal' and 'strong' test cases; inter-rater reliability is not assessed."
    417     },
    418     {
    419       "flag": "No statistical significance tests",
    420       "detail": "All comparative claims between LLMs (e.g., rankings in Table 3, Table 7) are based on point estimates without confidence intervals or significance tests, making it impossible to assess whether performance differences are meaningful."
    421     },
    422     {
    423       "flag": "HumanEval problem contamination not fully addressed",
    424       "detail": "While expert solutions are novel, the HumanEval problem descriptions themselves have been widely published since 2021 and are almost certainly in LLM training data, potentially confounding the evaluation of problem comprehension vs. algorithmic capability."
    425     },
    426     {
    427       "flag": "Unverifiable optimality of reference solutions",
    428       "detail": "The paper acknowledges that reference solutions may not be provably optimal (efficiency scores can theoretically exceed 1.0), meaning the expert standard is aspirational rather than provably tight."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "Evaluating large language models trained on code (Codex/HumanEval)",
    434       "relevance": "Primary benchmark whose problems ENAMEL extends; introduces the pass@k metric that eff@k generalizes"
    435     },
    436     {
    437       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation (EvalPlus/HumanEval+)",
    438       "relevance": "Basis for ENAMEL's problem set and stronger test cases; showed original HumanEval tests were insufficient"
    439     },
    440     {
    441       "title": "EffiBench: Benchmarking the efficiency of automatically generated code",
    442       "relevance": "Prior efficiency benchmark compared against ENAMEL; uses random GPT-generated test cases (challenge C5)"
    443     },
    444     {
    445       "title": "Mercury: An efficiency benchmark for LLM code synthesis",
    446       "relevance": "Another efficiency benchmark compared against ENAMEL; uses LeetCode problems with contamination concerns"
    447     },
    448     {
    449       "title": "On evaluating the efficiency of source code generated by LLMs (FORGE 2024)",
    450       "relevance": "Prior efficiency evaluation work; uses single code sample and uncensored execution time (challenges C1, C2)"
    451     },
    452     {
    453       "title": "Self-Refine: Iterative refinement with self-feedback",
    454       "relevance": "Prompting technique tested in Appendix C.8 as a potential approach to improve code efficiency"
    455     },
    456     {
    457       "title": "Can large language models write parallel code?",
    458       "relevance": "Related work on LLM code efficiency evaluation in a different domain (parallelism)"
    459     },
    460     {
    461       "title": "Code Llama: Open foundation models for code",
    462       "relevance": "One of the major open-source code LLMs evaluated in the benchmark study"
    463     }
    464   ],
    465   "engagement_factors": {
    466     "practical_relevance": {
    467       "score": 2,
    468       "justification": "The benchmark is publicly released and directly applicable to developers evaluating LLMs for performance-critical code generation tasks."
    469     },
    470     "surprise_contrarian": {
    471       "score": 2,
    472       "justification": "The finding that correctness rank does not predict efficiency rank (GPT-4 Turbo beats GPT-4 on efficiency despite lower correctness) challenges the common assumption that better models are uniformly better."
    473     },
    474     "fear_safety": {
    475       "score": 0,
    476       "justification": "The paper raises no AI safety or risk concerns; it is a pure capability evaluation benchmark."
    477     },
    478     "drama_conflict": {
    479       "score": 1,
    480       "justification": "The paper implicitly criticizes prior efficiency benchmarks (EffiBench, Mercury) for methodological flaws and shows canonical HumanEval solutions are themselves inefficient, creating mild controversy."
    481     },
    482     "demo_ability": {
    483       "score": 2,
    484       "justification": "The benchmark is available on GitHub and can be used immediately to evaluate any LLM on code efficiency."
    485     },
    486     "brand_recognition": {
    487       "score": 1,
    488       "justification": "UIUC and Qualcomm AI Research are known institutions but not the dominant brands in LLM evaluation; ICLR venue adds credibility."
    489     }
    490   },
    491   "hn_data": {
    492     "threads": [
    493       {
    494         "hn_id": "36371376",
    495         "title": "Starlink v2 reached the brightness reduction target recommended by astronomers",
    496         "points": 122,
    497         "comments": 76,
    498         "url": "https://news.ycombinator.com/item?id=36371376"
    499       },
    500       {
    501         "hn_id": "40739778",
    502         "title": "RAR-B: Reasoning as Retrieval Benchmark",
    503         "points": 11,
    504         "comments": 0,
    505         "url": "https://news.ycombinator.com/item?id=40739778"
    506       },
    507       {
    508         "hn_id": "40795244",
    509         "title": "How Far Can Transformers Reason? The Locality Barrier and Inductive Scratchpad",
    510         "points": 6,
    511         "comments": 0,
    512         "url": "https://news.ycombinator.com/item?id=40795244"
    513       },
    514       {
    515         "hn_id": "40043146",
    516         "title": "Why do small language models underperform?",
    517         "points": 4,
    518         "comments": 1,
    519         "url": "https://news.ycombinator.com/item?id=40043146"
    520       },
    521       {
    522         "hn_id": "40651110",
    523         "title": "Google: Towards a Personal Health Large Language Model",
    524         "points": 3,
    525         "comments": 1,
    526         "url": "https://news.ycombinator.com/item?id=40651110"
    527       },
    528       {
    529         "hn_id": "40341096",
    530         "title": "Linearizing Large Language Models",
    531         "points": 2,
    532         "comments": 0,
    533         "url": "https://news.ycombinator.com/item?id=40341096"
    534       },
    535       {
    536         "hn_id": "39132737",
    537         "title": "The \"Pac-Man'' Gripper: Tactile Sensing and Grasping Through Thin-Shell Buckling",
    538         "points": 2,
    539         "comments": 0,
    540         "url": "https://news.ycombinator.com/item?id=39132737"
    541       },
    542       {
    543         "hn_id": "23526567",
    544         "title": "Training GANs just became *a lot* easier",
    545         "points": 1,
    546         "comments": 0,
    547         "url": "https://news.ycombinator.com/item?id=23526567"
    548       }
    549     ],
    550     "top_points": 122,
    551     "total_points": 151,
    552     "total_comments": 78
    553   }
    554 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs