scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28735B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "On Leakage of Code Generation Evaluation Datasets",
      6     "authors": [
      7       "Alexandre Matton",
      8       "Tom Sherborne",
      9       "Dennis Aumiller",
     10       "Elena Tommasone",
     11       "Milad Alizadeh"
     12     ],
     13     "year": 2024,
     14     "venue": "Conference on Empirical Methods in Natural Language Processing",
     15     "arxiv_id": "2407.07565",
     16     "doi": "10.48550/arXiv.2407.07565"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All three contamination sources (direct leakage, synthetic data, overfitting) are supported by specific evidence: GitHub search data (Fig 1), similarity analysis (Fig 2), and model ranking changes (Table 2).",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims about fine-tuning with evol-instruct improving HumanEval scores (Section 3.2: +9% HumanEval, +0% LBPP) and overfitting to benchmarks (evidenced by rank changes in Table 2 and performance gap in Fig 3) are supported by controlled fine-tuning experiment and comparative results.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Claims are bounded to code generation benchmarks and LLM evaluation. The title, abstract, and scope focus on HumanEval, MBPP, and code generation specifically, with generalizations limited to this domain.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Paper attributes performance gaps between LBPP and HumanEval/MBPP primarily to contamination and overfitting. While difficulty is acknowledged (Section 4: 'harder problems would naturally have lower pass rates'), other explanations (distribution shift, benchmark-specific optimizations, sample variance) are not explored.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Paper distinguishes between what was measured (Pass@1 on specific benchmarks) and what is claimed (model 'code capabilities' and contamination effects). The distinction is present though could be more explicit.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 6 titled 'Limitations' discusses that analysis was black-box without training set inspection and notes LBPP may face same contamination path as HumanEval/MBPP.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threat identified: inability to inspect model weights or training sets except for synthetic data work. This is concrete rather than boilerplate, though other threats (annotation bias, small sample, Pass@1 validity) go undiscussed.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Focus on code generation benchmarks (HumanEval, MBPP) is explicit throughout. Conclusion acknowledges the fast-paced model development context for LLMs specifically.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding source is disclosed in the paper. All authors are from Cohere but no mention of funding agency or grants is provided.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All authors listed with Cohere affiliation in byline. However, no explicit discussion of Cohere's financial stake in how their models (Command R, Command R+) are evaluated.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "All authors are Cohere employees evaluating benchmarks that directly affect evaluation of Cohere's own Command R models (Table 2). Funder/employer is not independent of the outcome.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement provided in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Contamination is explicitly defined: 'any procedure leaking datasets during model training such that these datasets are no longer unseen at inference.' Data leakage and overfitting are used in standard ways.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three clear contributions: (1) analysis of three contamination sources in code benchmarks, (2) release of LBPP uncontaminated benchmark, (3) evidence of overfitting to HumanEval/MBPP. These are stated in abstract and introduction.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Paper engages with Riddell et al. (2024) on contamination quantification, contrasting their own ranking changes against prior findings. Discusses related benchmarks (LiveCodeBench, SWE-bench) and synthetic datasets (evol-instruct, Starcoder).",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "LBPP dataset is released on HuggingFace, but analysis code for contamination detection (GitHub searches, similarity embeddings, fine-tuning experiments) is not released.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "LBPP benchmark (161 problems with unit tests) is publicly available at huggingface.co/datasets/CohereForAI/lbpp.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No requirements.txt, Dockerfile, or environment specifications provided. Python is implied but no dependency or version details given.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step instructions for reproducing benchmark evaluation, fine-tuning experiments, or contamination analysis. Methods are described but not as executable reproduction steps.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Table 2 reports Pass@1 scores as single numbers with no confidence intervals, error bars, or uncertainty measures. No variance across multiple evaluation runs reported.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "Model performance comparisons (Table 2) and ranking changes between LBPP and HumanEval/MBPP lack statistical significance tests. Figure 3 correlations shown without p-values.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "Percentage differences reported in text ('up to 43% worse', '+9% HumanEval increase') but not formalized as effect sizes with confidence intervals. Raw percentages without standardization.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "LBPP contains 161 problems, comparable to HumanEval (164) but well under half the size of MBPP (427). No power analysis or justification for why 161 is adequate. Number of models (n≈25) is reasonable but not explicitly justified.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Single Pass@1 scores reported per model. No variance across runs, repeated evaluations, or standard deviations provided. Benchmark evaluation typically requires multiple runs to estimate variance.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "LBPP evaluated against HumanEval and MBPP baselines. 25+ models compared including GPT-4, Claude, Llama, Mistral, Command R.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "HumanEval and MBPP are the standard evaluation benchmarks cited in all major 2023-2024 code model papers. Contemporary as de facto standards even if originally published 2021.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "No ablation study. Single LBPP benchmark evaluated without systematic ablations on problem difficulty, problem categories, or evaluation metrics.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": false,
    200           "justification": "Primary metric is Pass@1 only. Section 4 gives a breakdown of failures by problem type (21% 2D arrays, 18% graphs, etc.), but this describes where errors occur rather than adding a second evaluation metric; no per-category Pass@1 is reported.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "Human annotators created benchmark problems (dataset construction), but no human evaluation of model outputs/solution quality is included.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "LBPP explicitly designed as held-out, previously unseen test set uncontaminated in training corpora. Purpose is to provide genuinely unseen evaluation.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": false,
    218           "justification": "Section 4 'Challenges in LBPP' identifies failure patterns qualitatively but reports no quantitative per-category Pass@1 metrics or systematic breakdown by problem type.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Table 1 shows sample unsolved problems. Section 4 discusses common error patterns (21% 2D/3D arrays, 18% graphs, 17% complex programming concepts) across models.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "All major models perform worse on LBPP than HumanEval/MBPP (Table 2). Fine-tuning on evol-instruct shows +0% improvement on LBPP despite +9% on HumanEval (Section 3.2).",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Table 2 specifies exact model versions (e.g., 'Claude-3.5-Sonnet', 'GPT-4o', 'Mistral Large', 'Llama3 70B Instruct'). All identifiable versions with clear names/snapshots.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "Example problems shown in Table 1 and problem descriptions provided, but full evaluation prompts, system instructions, and prompt templates not included.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "No temperature, top-p, max_tokens, or other inference hyperparameters reported for model evaluations. 'Zero-shot' mentioned but no sampling parameters.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Paper specifies 'zero-shot' evaluation for HumanEval, MBPP, and LBPP, indicating no agentic scaffolding. Standard benchmark evaluation without agents.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "LBPP creation process documented: annotators create problems and cite their sources, the citations are verified, problems are manually checked for originality and difficulty (~1/3 rejected), and the authors perform a final review.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "LBPP released on HuggingFace with 161 problems, unit tests per problem (median 4), and problem descriptions available for independent verification.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Detailed description of LBPP creation: competitive programming experts created novel problems, avoided web sources, cited all inspirations, verified no web matches, underwent additional review for originality/clarity.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": false,
    282           "justification": "Annotators described as having 'competitive programming experience' but no details on how many annotators, recruitment method, or selection criteria provided. Compensation mentioned ('paid above minimum wage') but no recruitment description.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Full pipeline documented in Section 4: annotation → review for disqualification → additional review for originality/hardness/ambiguity → manual verification by authors → final dataset.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No specific training data cutoff dates stated for models being evaluated. Models released in 2024 and earlier but exact training cutoff unknown.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": true,
    302           "justification": "Entire paper addresses train/test overlap. Three sources of contamination documented: direct (GitHub), indirect (synthetic data), overfitting (model selection). Core thesis.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": true,
    308           "justification": "Paper's main contribution is addressing HumanEval/MBPP contamination by creating LBPP designed to be uncontaminated. Evidence provided that models were trained on or optimized for existing benchmarks.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "NA — no human subjects research, no pre-registration applicable.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "NA — human annotation of benchmark is dataset creation, not human subjects research requiring ethics approval.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "NA — no human subjects research.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "NA — no human subjects research.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "NA — no human subjects research.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "NA — no human subjects research.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "NA — no human subjects research.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference cost, latency, or computational requirements reported for evaluating models on LBPP. No cost comparison with HumanEval/MBPP.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No total computational budget, API costs, or resource requirements disclosed for running the evaluation across 25+ models.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Contemporary LLMs are contaminated with HumanEval and MBPP evaluation data in their training sets",
    375       "evidence": "GitHub search showing every HumanEval prompt appears 43-1102 times in public repos (median 99, Fig 1). Riddell et al. (2024) found 12.2% in The Pile, 18.9% in The Stack. Widespread replication of benchmarks shown.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Synthetic code training datasets (evol-instruct, Starcoder) contain semantically equivalent problems to HumanEval and MBPP",
    380       "evidence": "Cosine similarity analysis (Fig 2) shows high similarity between synthetic datasets and public benchmarks. Tables 3 and 5 show concrete examples of nearly identical problems across datasets.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Fine-tuning on synthetic data (evol-instruct) improves HumanEval performance more than LBPP performance",
    385       "evidence": "Fine-tuning Command R Refresh on evol-instruct: +9% HumanEval, +2% MBPP, +0% LBPP (Section 3.2). Shows synthetic dataset overlap with HumanEval but not LBPP.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Model rankings change between HumanEval/MBPP and LBPP, indicating overfitting to existing benchmarks",
    390       "evidence": "Table 2 shows ranking inversions: e.g., Mistral Large rank 1→5 from HumanEval to LBPP, Claude-3-Haiku rank 8→14. Contrast with Riddell et al. finding no ranking changes.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "LBPP shows strong correlation with HumanEval/MBPP but models perform 27-43% worse on LBPP",
    395       "evidence": "Figure 3 shows strong correlations (Pearson r appears >0.8, though no p-values are reported). Table 2 shows models 27-43% worse on LBPP: GPT-4o 90%→63% HumanEval→LBPP, Mistral 7B 31%→11%.",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "Overfitting to evaluation sets occurs through model selection prioritizing narrow metrics",
    400       "evidence": "Section 3.3 argues incentive structure favors HumanEval/MBPP performance. Model ranking changes (Table 2) and performance drops (Table 2, Fig 3) suggest checkpoint selection over-influenced by contaminated benchmarks.",
    401       "supported": "moderate"
    402     },
    403     {
    404       "claim": "Standard decontamination methods have low recall for code datasets",
    405       "evidence": "Elazar et al. (2024) found only 1.22% verbatim HumanEval in OSCAR despite widespread presence. Paper argues n-gram and hashing methods inadequate for code's semantic variations.",
    406       "supported": "moderate"
    407     }
    408   ],
    409   "methodology_tags": [
    410     "benchmark-eval",
    411     "observational",
    412     "case-study"
    413   ],
    414   "key_findings": "The paper demonstrates three sources of benchmark contamination in code generation evaluation: (1) direct leakage via widespread GitHub availability of HumanEval/MBPP problems (median 99 occurrences per problem); (2) indirect leakage through synthetic training datasets with semantically similar problems to public benchmarks; (3) overfitting through model selection pressure. LBPP, a new 161-problem uncontaminated benchmark, reveals leading models perform 27-43% worse than on HumanEval/MBPP with different model rankings, suggesting prior metrics were inflated by contamination.",
    415   "red_flags": [
    416     {
    417       "flag": "No statistical significance testing",
    418       "detail": "Model comparisons lack confidence intervals, error bars, or significance tests. Pass@1 scores reported as single numbers without variance measures or uncertainty quantification."
    419     },
    420     {
    421       "flag": "Undisclosed funding/conflict of interest",
    422       "detail": "All authors are Cohere employees evaluating benchmarks that directly impact evaluation of Cohere's own Command R models (Table 2). No funding disclosure or conflict statement provided."
    423     },
    424     {
    425       "flag": "Small benchmark sample size",
    426       "detail": "LBPP contains 161 problems (vs HumanEval's 164, MBPP's 427). No power analysis or justification for adequacy of this sample size."
    427     },
    428     {
    429       "flag": "Annotation bias not addressed",
    430       "detail": "Human annotators with 'competitive programming experience' created problems; no discussion of potential selection bias or inter-annotator agreement metrics."
    431     },
    432     {
    433       "flag": "Black-box evaluation limits mechanistic understanding",
    434       "detail": "No inspection of model training data, weights, or actual reasoning. Contamination and overfitting inferred from performance gaps rather than directly measured."
    435     },
    436     {
    437       "flag": "Incomplete failure analysis",
    438       "detail": "Per-category error analysis (21% 2D arrays, 18% graphs) reports only the share of failures by problem type. No per-category Pass@1 metrics or systematic error categorization."
    439     },
    440     {
    441       "flag": "Hyperparameters not specified",
    442       "detail": "No temperature, top-p, max_tokens, or other inference parameters reported for model evaluations, limiting reproducibility."
    443     },
    444     {
    445       "flag": "Alternative explanations underdiscussed",
    446       "detail": "Performance gap between LBPP and HumanEval/MBPP attributed to contamination, but difficulty difference alone could explain much of the gap."
    447     }
    448   ],
    449   "cited_papers": [
    450     {
    451       "title": "Evaluating Large Language Models Trained on Code",
    452       "relevance": "Original HumanEval benchmark definition and evaluation paradigm that paper builds on"
    453     },
    454     {
    455       "title": "Program Synthesis with Large Language Models",
    456       "relevance": "MBPP benchmark definition; one of two primary benchmarks analyzed for contamination"
    457     },
    458     {
    459       "title": "Quantifying Contamination in Evaluating Code Generation Capabilities of Language Models",
    460       "relevance": "Prior work on code benchmark contamination; paper contrasts its ranking-change findings with this work"
    461     },
    462     {
    463       "title": "What's in My Big Data?",
    464       "relevance": "Data contamination analysis in pretraining datasets; foundational contamination detection methodology"
    465     },
    466     {
    467       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    468       "relevance": "Alternative code evaluation benchmark attempting to address limitations of HumanEval/MBPP"
    469     },
    470     {
    471       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    472       "relevance": "Concurrent work on continuously updated uncontaminated code benchmark"
    473     },
    474     {
    475       "title": "Deduplicating Training Data Makes Language Models Better",
    476       "relevance": "Methods and limitations of n-gram/hashing based decontamination in training data"
    477     },
    478     {
    479       "title": "Octopack: Instruction Tuning Code Large Language Models",
    480       "relevance": "Code instruction-tuning dataset creation and evaluation methodology"
    481     }
    482   ],
    483   "engagement_factors": {
    484     "practical_relevance": {
    485       "score": 2,
    486       "justification": "Practitioners need reliable benchmarks, and LBPP addresses real evaluation validity concerns, but it's one benchmark among many and doesn't fundamentally solve the contamination problem long-term."
    487     },
    488     "surprise_contrarian": {
    489       "score": 2,
    490       "justification": "Challenges assumption that HumanEval/MBPP are reliable, but contamination in benchmarks is well-known since Riddell et al. (2024) and earlier work. Main surprise is ranking changes, not contamination existence."
    491     },
    492     "fear_safety": {
    493       "score": 1,
    494       "justification": "Relates to AI evaluation integrity and trustworthiness but not directly about safety risks, AI failures, or existential concerns. Methodological rather than risk-focused."
    495     },
    496     "drama_conflict": {
    497       "score": 1,
    498       "justification": "Technical benchmark paper without major controversy, heated disagreement, or conflict narrative. Constructive contribution rather than polemical."
    499     },
    500     "demo_ability": {
    501       "score": 2,
    502       "justification": "LBPP is public and runnable against any model, but requires coding infrastructure. No interactive web demo or playable interface for casual exploration."
    503     },
    504     "brand_recognition": {
    505       "score": 2,
    506       "justification": "Published at EMNLP 2024 (top venue), from Cohere (known LLM company), but authors not universally famous. Moderate recognition in code evaluation community."
    507     }
    508   },
    509   "hn_data": {
    510     "threads": [
    511       {
    512         "hn_id": "41541888",
    513         "title": "Complexity as Design Material",
    514         "points": 5,
    515         "comments": 0,
    516         "url": "https://news.ycombinator.com/item?id=41541888"
    517       },
    518       {
    519         "hn_id": "40960419",
    520         "title": "Free Will and Falling Cats",
    521         "points": 4,
    522         "comments": 2,
    523         "url": "https://news.ycombinator.com/item?id=40960419"
    524       },
    525       {
    526         "hn_id": "40788653",
    527         "title": "Inference Acceleration for Large Language Models on CPUs",
    528         "points": 3,
    529         "comments": 0,
    530         "url": "https://news.ycombinator.com/item?id=40788653"
    531       },
    532       {
    533         "hn_id": "40703874",
    534         "title": "An Image Is Worth 32 Tokens for Reconstruction and Generation [pdf]",
    535         "points": 3,
    536         "comments": 0,
    537         "url": "https://news.ycombinator.com/item?id=40703874"
    538       },
    539       {
    540         "hn_id": "46775133",
    541         "title": "Hallucination Stations: On Some Basic Limitations of Transformer-Based Language",
    542         "points": 2,
    543         "comments": 1,
    544         "url": "https://news.ycombinator.com/item?id=46775133"
    545       },
    546       {
    547         "hn_id": "39016301",
    548         "title": "Towards Conversational Diagnostic AI",
    549         "points": 2,
    550         "comments": 1,
    551         "url": "https://news.ycombinator.com/item?id=39016301"
    552       },
    553       {
    554         "hn_id": "40906498",
    555         "title": "MuMath-Code on ArXiv",
    556         "points": 2,
    557         "comments": 0,
    558         "url": "https://news.ycombinator.com/item?id=40906498"
    559       },
    560       {
    561         "hn_id": "40722136",
    562         "title": "An Image Is Worth 32 Tokens for Reconstruction and Generation",
    563         "points": 2,
    564         "comments": 0,
    565         "url": "https://news.ycombinator.com/item?id=40722136"
    566       },
    567       {
    568         "hn_id": "39030639",
    569         "title": "Towards Conversational Diagnostic AI",
    570         "points": 2,
    571         "comments": 0,
    572         "url": "https://news.ycombinator.com/item?id=39030639"
    573       },
    574       {
    575         "hn_id": "46765383",
    576         "title": "Some Basic Limitations of Transformer-Based Language Models",
    577         "points": 1,
    578         "comments": 0,
    579         "url": "https://news.ycombinator.com/item?id=46765383"
    580       }
    581     ],
    582     "top_points": 5,
    583     "total_points": 26,
    584     "total_comments": 4
    585   }
    586 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs