scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28835B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Learn to Code Sustainably: An Empirical Study on LLM-based Green Code Generation",
      6     "authors": [
      7       "Tina Vartziotis",
      8       "Ippolyti Dellatolas",
      9       "George Dasoulas",
     10       "Maximilian Schmidt",
     11       "Florian Schneider"
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2403.03344",
     16     "doi": "10.48550/arXiv.2403.03344"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract's core claims—evaluating AI code generators on sustainability metrics and proposing a Green Capacity definition—are directly supported by the paper's content and methodology.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper makes implicit causal claims such as 'optimization techniques applied to ChatGPT significantly enhance its performance,' but the study uses no random assignment, no controls for confounds, and only 6 problems, making causal inference unsupported.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The conclusion broadly asserts 'LLMs do not yet have the sustainability awareness that would allow for reductions in carbon emissions' based on only 6 LeetCode algorithm problems—an unjustified generalization far beyond the tested scope.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No alternative explanations are offered for observed results, such as why ChatGPT fails on certain problems, why Copilot outperforms others, or whether results reflect tool design vs. training data contamination.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper measures runtime, memory, FLOPs, and energy on 6 short algorithmic problems, then claims broad implications for 'sustainable software development'—without acknowledging that these proxy metrics on toy problems may not reflect real-world software sustainability.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion briefly lists future work items but does not formally enumerate study limitations.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No threats to validity are discussed; obvious threats such as training data contamination of LeetCode problems, cherry-picked human baselines, and the tiny sample of 6 problems are not addressed.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper notes it focuses on energy efficiency of code itself rather than grid carbon intensity or hardware PUE, but does not explicitly bound what conclusions cannot be drawn from 6 problems with a single human submission each.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "The acknowledgements mention the work 'inspired a development project between Mercedes-Benz and TWT' and thank IBM, Microsoft, and GitHub, but no formal funding disclosure or grant is stated.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly listed: TWT GmbH Science & Innovation, NTUA, MIT, Harvard, and Mercedes-Benz.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "The work was 'inspired by' a Mercedes-Benz/TWT project, and the paper acknowledges discussions with Microsoft and GitHub while evaluating Microsoft's GitHub Copilot—the funder is not independent of the tools being assessed.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "There is no competing interests statement or declaration of financial interests anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are formally defined: 'green coding' is defined in Section II.A, 'green capacity' receives a precise mathematical definition (Equations 1-3), and each sustainability metric (runtime, memory, FLOPs, energy) is defined with units.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper clearly states it contributes (1) the Green Capacity metric and (2) an empirical evaluation of ChatGPT, Copilot, and CodeWhisperer on sustainability benchmarks.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section II reviews prior sustainability metrics and green coding practices with 60+ citations, situating the work relative to existing energy measurement methods and code quality evaluations of AI tools.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The paper states 'Our evaluation code is available here' (hyperlink in paper), indicating a release claim—though the actual URL is not visible in the plain text version.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "The LeetCode problems are publicly accessible, but the specific generated code samples, measurement data, and human submissions selected are not released as a dataset.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Hardware is specified (Intel Core i7-6700k, 32GB RAM, Linux Mint 21.2, kernel 5.17.0-79-generic, perf 5.15.111, Python 3.12.0rc1) but no dependency files (requirements.txt, Dockerfile) are provided for the evaluation code.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The methodology is described at a high level but no step-by-step reproduction instructions are provided—a reader would need to guess how to run the evaluation pipeline.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "No confidence intervals or error bars are reported anywhere; results are presented as single aggregate values despite 10 measurement samples being taken per run.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are used; all comparisons between tools and between initial/optimized code are purely descriptive.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "The Performance Delta (PD) function computes normalized percentage differences between initial and optimized metrics, which function as effect sizes; these are reported in heatmap figures.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Only 6 LeetCode problems are used with no power analysis or justification for why this sample size is sufficient to support the conclusions drawn.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Ten measurement samples are taken for runtime and energy but only averaged values are reported; no standard deviation, variance, or spread is shown.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Human-written code (top 0.05% of LeetCode submissions by runtime) and initial AI-generated code are used as baselines for comparison against optimized AI output.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "ChatGPT-3.5, GitHub Copilot Individual, and Amazon CodeWhisperer Individual were all contemporary leading tools at the time of the study (early 2024).",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": false,
    193           "answer": false,
    194           "justification": "No ablation study; the comparison between initial and optimized prompts is not an ablation of system components but a prompt variation study.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Five sustainability metrics are evaluated: code correctness, runtime, memory, FLOPs, and energy consumption, combined into the Green Capacity composite measure.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "Human-written code is used as a performance baseline, not as human evaluation of AI system outputs; no raters judge code quality.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": false,
    211           "answer": false,
    212           "justification": "This is not a prediction task; the evaluation tests code execution on fixed problem instances, so a held-out test set is not applicable.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down per coding problem (Cookies, Median, Network, Search, Sort, 3Sum) and per tool in both Figure 2 (GC) and Figures 3-5 (PD heatmaps by metric).",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Failures are explicitly reported: 'ChatGPT fails to produce a valid code output for 3 out of 6 coding problems' and CodeWhisperer fails on 3 problems for energy optimization.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Negative PD values are reported and displayed in heatmaps, showing cases where optimization prompts produce worse results than the initial unoptimized code.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "ChatGPT version 3.5 is named but without a snapshot date; GitHub Copilot and CodeWhisperer are only identified as 'Individual Version/Tier' with no version number or date.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "The prompting approach is described (copy LeetCode task, add function header, prepend optimization instruction) but exact prompt text used for each problem is not provided.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "No generation hyperparameters (temperature, top-p, max tokens, etc.) are reported for any of the three models.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "The study uses tools directly via their standard interfaces with no agentic scaffolding.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Problem selection criteria are described (LeetCode algorithm domain, difficulty distribution, ability to generate test cases), and code generation procedure for each tool interface is explained.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Raw measurement data (per-run runtime, energy, memory, FLOPs values) is not released; only aggregated figures are presented.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "The measurement procedure is described in detail: perf tool with specific events (power/energy-pkg/, fp_arith_inst_retired.scalar_double), tracemalloc for memory, 10 repeated measurements averaged for runtime and energy.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants are recruited; the study uses publicly available code generators and LeetCode community submissions.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Figure 1 depicts the full pipeline from problem selection through code generation, correctness evaluation, metric measurement, and GC calculation, with supporting methodological description.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No training data cutoffs are stated for ChatGPT-3.5, GitHub Copilot, or Amazon CodeWhisperer, despite evaluating them on publicly available LeetCode problems.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "The paper acknowledges using 'popular problems' from LeetCode but never discusses the near-certainty that these problems appeared in model training data, which would inflate performance.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "LeetCode problems predate all evaluated models' training cutoffs; the paper selects 'popular problems' that are almost certainly in training corpora but does not address this threat.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "The paper measures energy consumption of generated code execution but does not report the API inference cost (latency or monetary cost) of generating the code itself from the three AI tools.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No total computational budget is stated for running the study; only the test hardware is described.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "AI code generation tools (ChatGPT, Copilot, CodeWhisperer) show partial understanding of sustainability optimization criteria when explicitly prompted.",
    375       "evidence": "Green Capacity heatmaps show positive PD values for some tool-problem combinations when prompted to optimize, but failures are common (ChatGPT fails to produce valid output on 3/6 problems for runtime).",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "GitHub Copilot achieves the best sustainability performance among the three AI tools evaluated.",
    380       "evidence": "Figure 2 shows Copilot has the highest GC scores in most problems; Figure 3 shows Copilot achieves the best energy optimization behavior across coding problems.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Human top-submissions from LeetCode (top 0.05% by runtime) generally outperform AI-generated code on sustainability metrics.",
    385       "evidence": "GChuman values exceed GCAI values in most problem-tool combinations in Figure 2, and human submissions achieve higher GC than Copilot in the majority of coding problems.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Current LLMs lack sufficient sustainability awareness to drive meaningful carbon emissions reductions through code generation.",
    390       "evidence": "The conclusion states LLMs 'do not yet have the sustainability awareness that would allow for reductions in carbon emissions,' inferred from AI-generated code being less resource-efficient than top human submissions on 6 problems.",
    391       "supported": "weak"
    392     },
    393     {
    394       "claim": "Memory optimization is not significantly achieved by current AI tools when prompted for sustainability.",
    395       "evidence": "Figure 5 shows a high number of zero PD values for memory metric across all tools and problems, indicating memory is not a significant contributor to Green Capacity gains.",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "Sustainability metrics (runtime, memory, energy, FLOPs) are correlated such that optimizing one tends to improve others.",
    400       "evidence": "Authors note that runtime-optimized human submissions 'can surpass multiple times the optimized code instances' across multiple sustainability metrics simultaneously, implying correlation.",
    401       "supported": "weak"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "case-study"
    407   ],
    408   "key_findings": "When prompted with specific sustainability optimization requests, AI code generators (ChatGPT-3.5, GitHub Copilot, Amazon CodeWhisperer) show partial but inconsistent improvement on sustainability metrics across 6 LeetCode algorithm problems. GitHub Copilot performs best among the three tools, while ChatGPT-3.5 frequently fails to produce valid code for optimization prompts. Human-written top solutions from LeetCode consistently outperform AI-generated code across sustainability metrics, often improving multiple metrics simultaneously without explicit optimization. Memory optimization is largely unresponsive to AI prompting, while energy and runtime show more mixed results.",
    409   "red_flags": [
    410     {
    411       "flag": "Trivial sample size",
    412       "detail": "Only 6 LeetCode algorithm problems are evaluated, yet the paper draws broad conclusions about LLMs' sustainability awareness in software development."
    413     },
    414     {
    415       "flag": "Training data contamination unaddressed",
    416       "detail": "The paper explicitly selects 'popular problems' from LeetCode that almost certainly appeared in training data for all three models; inflated code generation performance is never discussed."
    417     },
    418     {
    419       "flag": "Cherry-picked human baseline",
    420       "detail": "The human baseline is the top 0.05% of LeetCode submissions by runtime—not representative human performance—making AI comparisons misleading."
    421     },
    422     {
    423       "flag": "No statistical tests",
    424       "detail": "All comparisons between tools, problems, and metrics are purely descriptive with no significance testing, CIs, or variance reported despite 10 repeated measurements being taken."
    425     },
    426     {
    427       "flag": "Unspecified model versions",
    428       "detail": "GitHub Copilot and CodeWhisperer are identified only as 'Individual Version/Tier' with no version numbers or snapshot dates, making replication impossible."
    429     },
    430     {
    431       "flag": "Potential COI undisclosed",
    432       "detail": "The work was 'inspired by' a Mercedes-Benz/TWT project; acknowledgements thank Microsoft and GitHub for discussions while GitHub Copilot (Microsoft) is one of the evaluated tools."
    433     },
    434     {
    435       "flag": "Exact prompts not provided",
    436       "detail": "Prompt structure is described qualitatively but exact prompts for each of the six problems and five optimization targets are not shown, preventing replication."
    437     }
    438   ],
    439   "cited_papers": [
    440     {
    441       "title": "Energy and Policy Considerations for Deep Learning in NLP",
    442       "relevance": "Foundational green AI paper quantifying carbon costs of NLP model training; directly motivates this work's sustainability framing."
    443     },
    444     {
    445       "title": "Green AI",
    446       "relevance": "CACM paper by Schwartz et al. defining the Green AI concept and carbon handprint; central framing reference."
    447     },
    448     {
    449       "title": "Carbon Emissions and Large Neural Network Training",
    450       "relevance": "Patterson et al. estimating training emissions for large models; motivates the paper's energy focus."
    451     },
    452     {
    453       "title": "Towards the Systematic Reporting of the Energy and Carbon Footprints of Machine Learning",
    454       "relevance": "Henderson et al. providing framework for ML energy reporting; informs the sustainability metrics used."
    455     },
    456     {
    457       "title": "Evaluating the Code Quality of AI-Assisted Code Generation Tools: An Empirical Study on GitHub Copilot, Amazon CodeWhisperer, and ChatGPT",
    458       "relevance": "Yetistiren et al.; the most directly related prior work, evaluating the same three tools on code quality dimensions."
    459     },
    460     {
    461       "title": "Green AI: Do Deep Learning Frameworks Have Different Costs?",
    462       "relevance": "Georgiou et al. evaluating energy costs of different DL frameworks; closely related green software evaluation methodology."
    463     },
    464     {
    465       "title": "Energy Efficiency Across Programming Languages: How Do Energy, Time, and Memory Relate?",
    466       "relevance": "Pereira et al. studying cross-language energy/runtime/memory relationships; foundational for sustainability metric design."
    467     },
    468     {
    469       "title": "An Empirical Evaluation of GitHub Copilot's Code Suggestions",
    470       "relevance": "Nguyen & Nadi empirical Copilot evaluation; direct prior work on AI code generation quality."
    471     },
    472     {
    473       "title": "Measuring the Carbon Intensity of AI in Cloud Instances",
    474       "relevance": "Dodge et al. on carbon intensity measurement; related methodology for AI sustainability assessment."
    475     }
    476   ],
    477   "engagement_factors": {
    478     "practical_relevance": {
    479       "score": 2,
    480       "justification": "Directly addresses whether popular AI coding tools produce energy-efficient code—a decision practitioners face when choosing tooling."
    481     },
    482     "surprise_contrarian": {
    483       "score": 1,
    484       "justification": "Confirms the expected finding that AI tools are less sustainable than top human solutions; no counterintuitive result, though failure rates may surprise."
    485     },
    486     "fear_safety": {
    487       "score": 1,
    488       "justification": "Raises concern that AI code generation may worsen software carbon footprints, but framing is measured rather than alarming."
    489     },
    490     "drama_conflict": {
    491       "score": 0,
    492       "justification": "No controversy angle; the paper neutrally compares tools without competitive framing or conflicting prior claims."
    493     },
    494     "demo_ability": {
    495       "score": 2,
    496       "justification": "All three evaluated tools (ChatGPT, Copilot, CodeWhisperer) are publicly accessible; readers can easily replicate the prompt approach on LeetCode problems."
    497     },
    498     "brand_recognition": {
    499       "score": 2,
    500       "justification": "Evaluates OpenAI ChatGPT, GitHub Copilot, and Amazon CodeWhisperer—three high-profile commercial AI coding tools with strong brand recognition."
    501     }
    502   },
    503   "hn_data": {
    504     "threads": [
    505       {
    506         "hn_id": "41470074",
    507         "title": "Hardware Acceleration of LLMs: A comprehensive survey and comparison",
    508         "points": 266,
    509         "comments": 68,
    510         "url": "https://news.ycombinator.com/item?id=41470074",
    511         "created_at": "2024-09-06T22:09:14Z"
    512       },
    513       {
    514         "hn_id": "26382160",
    515         "title": "Lord of the Ring(s): Side Channel Attacks on the CPU On-Chip Ring Interconnect",
    516         "points": 188,
    517         "comments": 55,
    518         "url": "https://news.ycombinator.com/item?id=26382160",
    519         "created_at": "2021-03-08T03:55:27Z"
    520       },
    521       {
    522         "hn_id": "42960989",
    523         "title": "Pre-Trained Large Language Models Use Fourier Features for Addition (2024)",
    524         "points": 149,
    525         "comments": 40,
    526         "url": "https://news.ycombinator.com/item?id=42960989",
    527         "created_at": "2025-02-06T10:31:06Z"
    528       },
    529       {
    530         "hn_id": "22908089",
    531         "title": "AutoML-Zero: Evolving Machine Learning Algorithms from Scratch",
    532         "points": 5,
    533         "comments": 0,
    534         "url": "https://news.ycombinator.com/item?id=22908089",
    535         "created_at": "2020-04-18T14:43:47Z"
    536       },
    537       {
    538         "hn_id": "23371555",
    539         "title": "Evolving Machine Learning Algorithms from Primitive Mathematical Operations",
    540         "points": 4,
    541         "comments": 0,
    542         "url": "https://news.ycombinator.com/item?id=23371555",
    543         "created_at": "2020-05-31T18:38:11Z"
    544       },
    545       {
    546         "hn_id": "47309151",
    547         "title": "Building AI Coding Agents for the Terminal",
    548         "points": 3,
    549         "comments": 0,
    550         "url": "https://news.ycombinator.com/item?id=47309151",
    551         "created_at": "2026-03-09T13:59:16Z"
    552       },
    553       {
    554         "hn_id": "22895579",
    555         "title": "AutoML-Zero: Evolving Machine Learning Algorithms from Scratch",
    556         "points": 3,
    557         "comments": 0,
    558         "url": "https://news.ycombinator.com/item?id=22895579",
    559         "created_at": "2020-04-17T02:08:58Z"
    560       },
    561       {
    562         "hn_id": "43065682",
    563         "title": "Scaling Test-Time Compute Can Be More Effective Than Scaling Parameters (2024)",
    564         "points": 2,
    565         "comments": 0,
    566         "url": "https://news.ycombinator.com/item?id=43065682",
    567         "created_at": "2025-02-16T05:48:45Z"
    568       },
    569       {
    570         "hn_id": "40008963",
    571         "title": "Quantified CSPs are either PSPACE-complete or inside Pi_2",
    572         "points": 2,
    573         "comments": 0,
    574         "url": "https://news.ycombinator.com/item?id=40008963",
    575         "created_at": "2024-04-12T02:51:09Z"
    576       },
    577       {
    578         "hn_id": "39991305",
    579         "title": "First detection in space of the high-energy isomer of cyanomethanimine",
    580         "points": 2,
    581         "comments": 0,
    582         "url": "https://news.ycombinator.com/item?id=39991305",
    583         "created_at": "2024-04-10T14:48:08Z"
    584       }
    585     ],
    586     "top_points": 266,
    587     "total_points": 624,
    588     "total_comments": 163
    589   }
    590 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs