scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (26996B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "On Evaluating the Efficiency of Source Code Generated by LLMs",
      6     "authors": [
      7       "Changan Niu",
      8       "Ting Zhang",
      9       "Chuanyi Li",
     10       "Bin Luo",
     11       "Vincent Ng"
     12     ],
     13     "year": 2024,
     14     "venue": "2024 IEEE/ACM First International Conference on AI Foundation Models and Software Engineering (Forge)",
     15     "arxiv_id": "2404.06041",
     16     "doi": "10.1145/3650105.3652295"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims evaluation on HumanEval/MBPP and LeetCode, and prompting strategies are all demonstrated in Sections 2.1 and 2.2 with supporting evidence.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "RQ2 tests causal relationship between prompts and code efficiency via controlled experiments across three prompt variants (Figure 3, Table 4), showing differential effects.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Results scoped to three benchmarks (HumanEval, MBPP, LeetCodeEval) and Python/C++ respectively. Paper acknowledges differences across benchmarks (Section 2.1.5: 'LLM performs differently across benchmarks').",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Paper discusses why prompting works better on LeetCode (more diverse test cases), why training strategy affects efficiency (DeepSeek Base vs Instruct), and attributes benchmark differences to data distribution.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Paper explicitly measures runtime via gem5 simulator (HumanEval/MBPP) and LeetCode platform submissions. Clear distinction between correctness (Pass@10) and efficiency (runtime) metrics reported separately.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 3 'Threats to Validity' is dedicated to limitations, discussing data leakage, runtime instability, and mitigation strategies.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats identified: (1) data leakage mitigated by selecting LeetCode problems post-May 2023 cutoff; (2) runtime instability mitigated via gem5 simulator and 10 repeated runs.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Paper states C++ focus for LeetCodeEval, acknowledges hard subset cannot be evaluated (0 problems passing all models), and notes results only for problems where all LLMs pass.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Acknowledgments state support from 'Cooperation Fund of Huawei-NJU Creative Laboratory', 'CCF-Huawei Populus Grove Fund', and 'NSF award 2034508'.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "Author affiliations listed (Nanjing University, Singapore Management University, UT Dallas) but no disclosure whether any authors are affiliated with evaluated model providers (OpenAI, Meta, Microsoft, DeepSeek).",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
      87         "justification": "NSF funding is independent. Huawei is a manufacturer but not a provider of the evaluated LLMs, so its funding is reasonably independent, though Huawei could benefit from benchmark insights.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement provided. No declaration of patents, equity, consulting relationships, or other financial interests.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Paper defines 'efficiency' as runtime (measured via gem5 or LeetCode platform), formally defines 'average normalized runtime' metric and 'Pass@10' metric in Section 2.1.4.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three explicit contributions stated in introduction: (1) evaluate LLM code efficiency, (2) propose LeetCodeEval benchmark, (3) investigate prompting strategies for efficient code generation.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Related Work section cites DeepDev-PERF, Madaan et al.'s PIE work on code optimization, Self-Refine, and code quality evaluation papers, explaining how this work differs (efficiency focus vs quality focus).",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Paper states 'We also make code, data and other artifacts available online' with GitHub reference [1] pointing to https://github.com/NougatCA/EfficencyEval.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "HumanEval and MBPP are publicly available benchmarks. LeetCodeEval problem selection and raw results claimed to be on GitHub. Public benchmark data is accessible.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Gem5 simulator used but no configuration details, version, or specifications provided. No requirements.txt, Dockerfile, or dependency specification for reproduction.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": true,
    142           "justification": "Steps described: generate k responses, execute via gem5/LeetCode, repeat 10 times for HumanEval/MBPP or 3 times for LeetCodeEval. GitHub repo likely contains detailed scripts but paper provides outline.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Tables 2, 3, 4 report single averaged values (average normalized runtime, speedup) with no confidence intervals or error bars despite running evaluations 10 times.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "Paper makes comparative claims ('GPT-4 has highest efficiency', 'Prompt 3 best for medium problems') without reporting p-values or statistical significance tests.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Speedup rates reported in Table 4 (e.g., 1.06x for GPT-4 Prompt 1) serve as effect sizes. Normalized runtime comparisons show relative magnitudes.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Paper acknowledges only 70/164 HumanEval and 242/399 MBPP problems pass all models, making sample very small for comparisons. No power analysis or sample size justification provided.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Despite running each evaluation 10 times, paper reports only average runtime values in tables. Standard deviation, variance across runs, or confidence intervals are not reported.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "RQ1 evaluates 6 different models (GPT-4, GPT-3.5, Phi-2, Code Llama, WizardCoder, DeepSeek) as baselines for comparison. RQ2 compares 3 prompt variants against baseline.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Models from 2023-2024 (GPT-4-1106-preview, Code Llama 2023, DeepSeek Coder 2024) are contemporary with the 2024 paper publication.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "RQ2 ablates prompting strategy across three variants (direct instruction vs two chain-of-thought approaches), showing impact of prompting method on efficiency.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "RQ1 uses normalized runtime and Pass@10. RQ2 uses speedup and percentage beats on LeetCode. Multiple metrics enable multifaceted evaluation.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "No human participants or human evaluation of code. Efficiency measured automatically via simulator and LeetCode platform submissions.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "HumanEval and MBPP use standard held-out test cases. LeetCodeEval leverages LeetCode's official test suites for correctness and runtime verification.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results broken down by difficulty (easy/medium/hard), model variants (Tables 2-3), prompting methods (Table 4), and language (Python vs C++).",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "Paper mentions treating failures as speedup=1 but does not discuss which problems failed, why, or patterns in failures across models and prompts.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
     230           "justification": "Paper reports poor speedup for DeepSeek Coder 33B Base (1.00-1.05), slower code from Phi-2 in some cases, and a hard subset with 0 passing problems (making evaluation impossible).",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "GPT models specified with exact version IDs (gpt-3.5-turbo-1106, gpt-4-1106-preview). Code Llama, WizardCoder, DeepSeek specified with parameter counts and training variant.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Figure 2 shows LeetCodeEval prompt template with placeholders. Figure 3 describes three prompting methods with example structure. Exact optimization prompts for RQ2 not fully shown.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "Temperature, top_p, or other sampling parameters not reported. Paper mentions generating k responses (Pass@10 context suggests k≥10) but exact value and sampling settings absent.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding used. Models queried directly with prompts. Not applicable to this study.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "LeetCodeEval preprocessing documented: filter problems with images and more downvotes than upvotes, split by difficulty. Code from Liu et al. used but preprocessing details external.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "Paper claims to release 'data and other artifacts' on GitHub. Raw runtime measurements and problem lists likely available though not explicitly confirmed in paper.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Data collection for HumanEval/MBPP via LLM API calls and gem5 simulation described. LeetCodeEval collection via problem selection and platform submission clearly described.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants recruited. Not applicable.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Pipeline clear: generate code → verify correctness → measure runtime (HumanEval/MBPP via gem5, LeetCodeEval via platform) → repeat and average. High-level documentation present.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": true,
     296           "justification": "LeetCodeEval restricts problems to those released after May 2023 because 'this is the latest GPT-4 knowledge cutoff'. Training cutoffs for the other models are not explicitly stated; only GPT is covered.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": true,
    302           "justification": "Data leakage addressed for GPT via problem date cutoff. Paper does not discuss whether HumanEval/MBPP existed before training cutoff or contamination for other models.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": true,
    308           "justification": "LeetCodeEval explicitly avoids contamination by selecting post-cutoff problems. HumanEval/MBPP are standard benchmarks but potential pre-training contamination not discussed.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants. Not applicable.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants. Not applicable.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants. Not applicable.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants. Not applicable.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants. Not applicable.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants. Not applicable.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants. Not applicable.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No API call costs reported for GPT models. Local model inference cost or latency not documented. Only runtime of generated code measured, not inference latency.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Total computational budget (API costs, GPU hours, simulator CPU time) not reported. Scale of evaluation (10 runs × multiple models × hundreds of problems) not quantified in resource terms.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Code generation ability (correctness) is not positively correlated with code efficiency ability",
    375       "evidence": "GPT-4 highest Pass@10 (98.2% HumanEval) but GPT-3.5 generates faster code (8.35 vs 8.61 normalized runtime). Phi-2 lowest Pass@10 (62.8%) but generates fastest or near-fastest code.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Model parameter size does not determine code efficiency",
    380       "evidence": "Code Llama series (7B, 13B, 34B) shows runtime 9.95→9.87→9.93 (stable). WizardCoder similar pattern 9.35→9.18→9.04 without clear scaling.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Training strategy and data significantly impact efficiency of generated code",
    385       "evidence": "DeepSeek Coder 33B Base vs Instruct: 9.40 vs 7.54 runtime on HumanEval (22% difference from instruction-tuning alone).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Chain-of-thought prompting enables more efficient code generation on complex problems",
    390       "evidence": "Prompts 2&3 show 1.16-1.18x speedup on LeetCode medium vs Prompt 1 at 1.07x for GPT-4. Effect stronger on harder problems due to larger optimization space.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Prompting effectiveness varies by problem complexity and benchmark",
    395       "evidence": "LeetCodeEval shows larger speedups (1.03-1.18x) vs HumanEval/MBPP (1.00-1.06x). Medium subset gap wider than easy due to constrained vs large optimization space.",
    396       "supported": "moderate"
    397     }
    398   ],
    399   "methodology_tags": [
    400     "benchmark-eval",
    401     "observational"
    402   ],
    403   "key_findings": "The paper demonstrates that code efficiency in LLM-generated code is orthogonal to correctness and model size, driven instead by training strategy. Chain-of-thought prompting yields 3-18% speedups on complex problems, though gains diminish on simple problems. Benchmark choice matters: LeetCode's larger test cases reveal efficiency differences invisible to HumanEval/MBPP.",
    404   "red_flags": [
    405     {
    406       "flag": "Gem5 simulator validity unvalidated",
    407       "detail": "Paper uses gem5 simulator to measure runtime but does not validate correlation with actual wall-clock runtime. Simulator could introduce systematic biases not present in real execution."
    408     },
    409     {
    410       "flag": "Severe sample attrition in comparisons",
    411       "detail": "Only 70/164 HumanEval and 242/399 MBPP problems pass all LLMs (43% and 61% retention). Hard subset has 0 problems passing, making it impossible to evaluate on hardest tasks."
    412     },
    413     {
    414       "flag": "No statistical significance testing",
    415       "detail": "Differences in normalized runtime are reported without p-values or confidence intervals despite small samples and potential runtime variance. Risk of noise being reported as signal."
    416     },
    417     {
    418       "flag": "Hyperparameters not reported",
    419       "detail": "Temperature, top_p, and sampling method not disclosed. These significantly impact output and could explain differences between models or prompts."
    420     },
    421     {
    422       "flag": "Inconsistent model coverage in prompting study",
     423       "detail": "RQ1 evaluates 6 models but RQ2 prompting only tests 3 models (GPT-4, GPT-3.5, DeepSeek Coder). No prompting data for Code Llama or WizardCoder variants."
    424     },
    425     {
    426       "flag": "Limited mechanistic understanding",
    427       "detail": "Paper observes that correctness and efficiency decouple but does not investigate why—are models not trained for efficiency? Is this a random variation? Do different architectures handle trade-offs differently?"
    428     },
    429     {
    430       "flag": "Narrow efficiency definition",
    431       "detail": "Only runtime measured. Memory efficiency, code size, maintainability, and readability not addressed despite being relevant efficiency dimensions."
    432     }
    433   ],
    434   "cited_papers": [
    435     {
    436       "title": "Program Synthesis with Large Language Models",
    437       "relevance": "Foundational work on LLM code generation capabilities and benchmarks"
    438     },
    439     {
    440       "title": "Evaluating Large Language Models Trained on Code",
    441       "relevance": "Introduces HumanEval benchmark and correctness evaluation methodology"
    442     },
    443     {
    444       "title": "Is Your Code Generated by ChatGPT Really Correct?",
    445       "relevance": "Prior work evaluating correctness of LLM code generation across models"
    446     },
    447     {
    448       "title": "Learning Performance-Improving Code Edits",
    449       "relevance": "PIE dataset and chain-of-thought prompting for code optimization (directly cited for Prompt 2&3)"
    450     },
    451     {
    452       "title": "DeepDev-PERF: a deep learning-based approach for improving software performance",
    453       "relevance": "Alternative approach to code efficiency improvement using deep learning"
    454     },
    455     {
    456       "title": "Code Llama: Open Foundation Models for Code",
    457       "relevance": "Description of Code Llama architecture and capabilities, one of evaluated models"
    458     },
    459     {
    460       "title": "DeepSeek LLM: Scaling Open-Source Language Models with Longtermism",
    461       "relevance": "DeepSeek Coder model description and capabilities"
    462     },
    463     {
    464       "title": "Evaluating the code quality of ai-assisted code generation tools",
    465       "relevance": "Prior work on code quality metrics and LLM evaluation beyond correctness"
    466     }
    467   ],
    468   "engagement_factors": {
    469     "practical_relevance": {
    470       "score": 2,
    471       "justification": "Practitioners care about efficiency, but recommendations (use newer models, use chain-of-thought) are limited and context-dependent. Hard subset unevaluable limits real-world applicability."
    472     },
    473     "surprise_contrarian": {
    474       "score": 2,
    475       "justification": "Decoupling of correctness and efficiency is somewhat unexpected, but efficiency not correlating with raw capability is intuitive. No major paradigm shift challenged."
    476     },
    477     "fear_safety": {
    478       "score": 0,
    479       "justification": "No safety concerns, vulnerabilities, or alignment issues raised. Purely performance-oriented study."
    480     },
    481     "drama_conflict": {
    482       "score": 0,
    483       "justification": "Straightforward empirical study with no controversy, competitive comparison drama, or contentious claims."
    484     },
    485     "demo_ability": {
    486       "score": 2,
    487       "justification": "Could demonstrate by running LLM code through LeetCode or simulator, but requires API access and setup. Not immediately reproducible for casual readers."
    488     },
    489     "brand_recognition": {
    490       "score": 1,
    491       "justification": "Nanjing University (moderate tier internationally), Singapore Management University, UT Dallas. FORGE 2024 is a specialized venue, not a top-tier conference. Limited brand visibility."
    492     }
    493   },
    494   "hn_data": {
    495     "threads": [
    496       {
    497         "hn_id": "40370779",
    498         "title": "Simultaneous Many-Row Activation in Off-the-Shelf DRAM Chips",
    499         "points": 7,
    500         "comments": 0,
    501         "url": "https://news.ycombinator.com/item?id=40370779",
    502         "created_at": "2024-05-15T18:44:38Z"
    503       },
    504       {
    505         "hn_id": "39368490",
    506         "title": "Keyframer: Empowering Animation Design Using Large Language Models (Apple)",
    507         "points": 6,
    508         "comments": 1,
    509         "url": "https://news.ycombinator.com/item?id=39368490",
    510         "created_at": "2024-02-14T10:48:19Z"
    511       },
    512       {
    513         "hn_id": "40286055",
    514         "title": "Forklift: An Extensible Neural Lifter",
    515         "points": 3,
    516         "comments": 0,
    517         "url": "https://news.ycombinator.com/item?id=40286055",
    518         "created_at": "2024-05-07T14:39:26Z"
    519       },
    520       {
    521         "hn_id": "43426799",
    522         "title": "Aardvark weather: end-to-end data-driven weather forecasting",
    523         "points": 2,
    524         "comments": 0,
    525         "url": "https://news.ycombinator.com/item?id=43426799",
    526         "created_at": "2025-03-20T18:10:12Z"
    527       },
    528       {
    529         "hn_id": "43211832",
    530         "title": "Heat as a Witness of Quantum Properties",
    531         "points": 2,
    532         "comments": 0,
    533         "url": "https://news.ycombinator.com/item?id=43211832",
    534         "created_at": "2025-02-28T21:48:33Z"
    535       },
    536       {
    537         "hn_id": "41245268",
    538         "title": "Dwellers in the Deep: Biological Consequences of Dark Oxygen",
    539         "points": 2,
    540         "comments": 0,
    541         "url": "https://news.ycombinator.com/item?id=41245268",
    542         "created_at": "2024-08-14T12:25:02Z"
    543       },
    544       {
    545         "hn_id": "40948891",
    546         "title": "Fast-moving stars around an intermediate-mass black hole in Omega Centauri",
    547         "points": 2,
    548         "comments": 0,
    549         "url": "https://news.ycombinator.com/item?id=40948891",
    550         "created_at": "2024-07-12T20:03:03Z"
    551       },
    552       {
    553         "hn_id": "39050109",
    554         "title": "Mission: Impossible Language Models",
    555         "points": 2,
    556         "comments": 0,
    557         "url": "https://news.ycombinator.com/item?id=39050109",
    558         "created_at": "2024-01-19T00:38:50Z"
    559       },
    560       {
    561         "hn_id": "39026660",
    562         "title": "Mission: Impossible Language Models",
    563         "points": 2,
    564         "comments": 0,
    565         "url": "https://news.ycombinator.com/item?id=39026660",
    566         "created_at": "2024-01-17T12:11:54Z"
    567       },
    568       {
    569         "hn_id": "41284222",
    570         "title": "Assessing the Learning Limits of LLMs with Synthetic Impossible Languages",
    571         "points": 1,
    572         "comments": 0,
    573         "url": "https://news.ycombinator.com/item?id=41284222",
    574         "created_at": "2024-08-18T18:27:15Z"
    575       }
    576     ],
    577     "top_points": 7,
    578     "total_points": 29,
    579     "total_comments": 1
    580   }
    581 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs